In [1]:
# Titanic Classification :
#  Build a predictive model to determine the likelihood of survival for passengers on
#  the Titanic using data science techniques in Python.
# step1 import the libraries to perform the task
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [63]:
# step2 Read the comma separated value file using read_csv()

df=pd.read_csv('Titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.2500,,S
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,3.0,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.9250,,S
3,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1000,C123,S
4,5.0,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
994,,,,,,,,,,,,
995,,,,,,,,,,,,
996,,,,,,,,,,,,
997,,,,,,,,,,,,


In [3]:
# step3 Check the missing values
df.isnull().sum()

# insight:-
# The column with the most missing values is "Cabin" (795), followed by "Age" (285).

PassengerId    108
Survived       108
Pclass         108
Name           108
Sex            108
Age            285
SibSp          108
Parch          108
Ticket         108
Fare           108
Cabin          795
Embarked       110
dtype: int64

Step4: Exploratory Data Analysis (EDA)

In [4]:
# step4 Handling the missing value
# to check columns are having quantitative data or categorical data
df.info()
# insight:-
# Most features are of numeric data type (float64), indicating continuous or ordinal variables:-
# PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare | mean || median
# Some features are of object data type, suggesting categorical or text variables:-
# Name, Sex, Ticket, Cabin, Embarked | mode

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    float64
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    float64
 7   Parch        891 non-null    float64
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(7), object(5)
memory usage: 93.8+ KB


In [6]:
# for quantitative
mean_p=df['PassengerId'].mean()
mean_p

446.0

In [9]:
median_s=df['Survived'].median()
median_s

0.0

In [12]:
median_p=df['Pclass'].median()
median_p

3.0

In [14]:
median_a=df['Age'].median()
median_a

28.0

In [16]:
median_si=df['SibSp'].median()
median_si

0.0

In [18]:
median_pa=df['Parch'].median()
median_pa

0.0

In [20]:
median_f=df['Fare'].median()
median_f

14.4542

In [None]:
# For categorical variable

In [24]:
mode_n=df['Name'].mode()
mode_n


0                        Abbing, Mr. Anthony
1                Abbott, Mr. Rossmore Edward
2           Abbott, Mrs. Stanton (Rosa Hunt)
3                        Abelson, Mr. Samuel
4      Abelson, Mrs. Samuel (Hannah Wizosky)
                       ...                  
886                  de Mulder, Mr. Theodore
887                de Pelsmaeker, Mr. Alfons
888                del Carlo, Mr. Sebastiano
889          van Billiard, Mr. Austin Blyler
890              van Melkebeke, Mr. Philemon
Name: Name, Length: 891, dtype: object

In [23]:
mode_s=df['Sex'].mode()
mode_s

0    male
Name: Sex, dtype: object

In [25]:
mode_t=df['Ticket'].mode()
mode_t

0        1601
1      347082
2    CA. 2343
Name: Ticket, dtype: object

In [26]:
mode_c=df['Cabin'].mode()
mode_c

0        B96 B98
1    C23 C25 C27
2             G6
Name: Cabin, dtype: object

In [27]:
mode_e=df['Embarked'].mode()
mode_e

0    S
Name: Embarked, dtype: object

In [None]:
# fill the missing value using fillna()

In [None]:
# quantitative

In [28]:
df['PassengerId']=df['PassengerId'].fillna(mean_p)

In [29]:
df['Survived']=df['Survived'].fillna(median_s)

In [30]:
df['Pclass']=df['Pclass'].fillna(median_p)

In [31]:
df['Age']=df['Age'].fillna(median_a)

In [32]:
df['SibSp']=df['SibSp'].fillna(median_si)

In [33]:
df['Parch']=df['Parch'].fillna(median_pa)

In [34]:
df['Fare']=df['Fare'].fillna(median_f)

In [None]:
# categorical

In [47]:

df['Name'].fillna('Abbing, Mr. Anthony',inplace=True)

In [50]:
df['Sex'].fillna('male',inplace=True)

In [51]:
df['Ticket'].fillna('1601',inplace=True)

In [52]:
df['Cabin'].fillna('G6',inplace=True)

In [55]:
df['Embarked'].fillna('s',inplace=True)

In [56]:
df.isnull().sum()
# insight:-
# Fixed all missing value

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

Step5 : Feature Engineering

In [57]:
# Extract relevant features
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# Convert categorical variables into numerical
X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True)

# Extract the target variable
y = df['Survived']


Step6 : Split Data into Training and Testing Sets

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Step 6: Feature Scaling

In [59]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Step 7: Build and Train the Logistic Regression Model

In [60]:
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)


Step 8: Make Predictions

In [61]:
y_pred = model.predict(X_test_scaled)


Step 9: Evaluate the Model

In [62]:
# Print accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Accuracy is a measure of how well the model predicts both classes (survived or not survived).

# Print confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
# True Positive (TP): 52 passengers were correctly predicted as survivors.
# True Negative (TN): 90 passengers were correctly predicted as non-survivors.
# False Positive (FP): 15 passengers were incorrectly predicted as survivors.
# False Negative (FN): 22 passengers were incorrectly predicted as non-survivors.

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)
# Precision: The proportion of true positive predictions among all positive predictions. In this case,
# precision is the accuracy of survival predictions.

# Recall (Sensitivity): The proportion of true positives identified correctly among all actual positives.
# In this context, it represents the ability of the model to correctly identify survivors.

# F1-score: The harmonic mean of precision and recall. It provides a balanced measure between precision and recall.

# Support: The number of actual occurrences of the class in the specified dataset.

# 90 passengers were correctly predicted as non-survivors (True Negatives).
# 52 passengers were correctly predicted as survivors (True Positives).
# So, the total number of people who survived in the test set is 52.

Accuracy: 0.84
Confusion Matrix:
 [[115  12]
 [ 19  54]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.91      0.88       127
         1.0       0.82      0.74      0.78        73

    accuracy                           0.84       200
   macro avg       0.84      0.82      0.83       200
weighted avg       0.84      0.84      0.84       200

