In [175]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [165]:
# Load the passenger records from the CSV file into a DataFrame
# and preview the first rows.
CSV_PATH = r"Titanic-Dataset.csv"
df = pd.read_csv(CSV_PATH)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [166]:
# Cleaning the Data
# Count the missing entries per column, report them, and then remove every
# column in which more than half of the values are missing.
null_counts = df.isnull().sum()
null_fraction = null_counts / len(df)
rm_keys = df.columns[null_fraction > 0.5]

print(f"Check Null Values:\n", null_counts)
print(f"\nRemove Columns:", rm_keys)

# The report above is printed before the columns are dropped.
df.drop(columns=rm_keys, inplace=True)

Check Null Values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Remove Columns: Index(['Cabin'], dtype='object')


In [167]:
# Sanitizing the data
# We will replace the NaN values in:
# 1. Age column      -> mean age of the passenger's own sex (males and
#                       females are averaged separately)
# 2. Embarked column -> the most frequent embarkation location
# NOTE(review): both fill values are computed on the whole frame, i.e. before
# the train/test split performed further down -- confirm this is acceptable.
mask_male, mask_female = df["Sex"].eq("male"), df["Sex"].eq("female")

# The two sex masks are disjoint, so the missing-age mask can be taken once.
age_missing = df["Age"].isna()
for sex_mask in (mask_male, mask_female):
    # Series.mean() skips NaN, so only the known ages contribute to the fill value.
    df.loc[sex_mask & age_missing, "Age"] = df.loc[sex_mask, "Age"].mean()

# idxmax() returns the label that has the highest count (the modal location).
freq_counts = df["Embarked"].value_counts()
df["Embarked"] = df["Embarked"].fillna(freq_counts.idxmax())

In [168]:
# Preparing the data
# We will now drop the columns that are not used as features and encode the
# object dtype columns with a fixed, explicit mapping:
# Sex -> {male: 0, female: 1}
# Embarked -> {S: 0, C: 1, Q: 2}
# The codes are spelled out instead of being derived from value_counts()
# order, so they cannot change silently when class frequencies change or tie.
SEX_CODES = {"male": 0, "female": 1}
EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}

# Dropping first means that re-running this cell stops here with a KeyError
# before the already-encoded columns could be mapped a second time.
df = df.drop(columns=["Name", "Ticket"])

for column, codes in (("Sex", SEX_CODES), ("Embarked", EMBARKED_CODES)):
    encoded = df[column].map(codes)
    # map() yields NaN for any value without an entry in the mapping; stop
    # instead of handing an incompletely encoded column to later cells.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in {column!r}: {list(unmapped)}")
    df[column] = encoded

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22.0,1,0,7.25,0
1,2,1,1,1,38.0,1,0,71.2833,1
2,3,1,3,1,26.0,0,0,7.925,0
3,4,1,1,1,35.0,1,0,53.1,0
4,5,0,3,0,35.0,0,0,8.05,0


In [169]:
# Checking Correlation in the Data
# Pairwise correlation between all columns of the frame.
corr = df.corr()
display(corr)

# Keys to be considered in the data: keep the columns whose absolute
# correlation with "Survived" is strictly between 0.01 and 1.00. The upper
# bound removes "Survived" itself, whose self-correlation is 1.0.
survived_corr = corr["Survived"].abs()
is_selected = survived_corr.gt(0.01) & survived_corr.lt(1.00)
x_keys = corr.columns[is_selected]
print(f"Selected Features: {list(x_keys)}")

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.035543,-0.057527,-0.001652,0.012658,-0.030467
Survived,-0.005007,1.0,-0.338481,0.543351,-0.080453,-0.035322,0.081629,0.257307,0.106811
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.330391,0.083081,0.018443,-0.5495,0.045702
Sex,-0.042939,0.543351,-0.1319,1.0,-0.103236,0.114631,0.245489,0.182333,0.116569
Age,0.035543,-0.080453,-0.330391,-0.103236,1.0,-0.23692,-0.182556,0.089079,0.001913
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.23692,1.0,0.414838,0.159651,-0.059961
Parch,-0.001652,0.081629,0.018443,0.245489,-0.182556,0.414838,1.0,0.216225,-0.078665
Fare,0.012658,0.257307,-0.5495,0.182333,0.089079,0.159651,0.216225,1.0,0.062142
Embarked,-0.030467,0.106811,0.045702,0.116569,0.001913,-0.059961,-0.078665,0.062142,1.0


Selected Features: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


In [170]:
# Preparing the dataset
# Feature matrix (the selected columns) and target vector ("Survived"),
# both copied out of the frame as NumPy arrays.
X_data = df[x_keys].to_numpy(copy=True)
y_data = df["Survived"].to_numpy(copy=True)

X_data.shape, y_data.shape

((891, 7), (891,))

In [181]:
# Split the dataset into a training part and a test part
# (20% of the rows go to the test part; no separate validation part is made here).
from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.2, "random_state": 2}
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, **split_options)
X_train.shape, X_test.shape

((712, 7), (179, 7))

In [192]:
# Build the model and train it with CV
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression (cv=7) fitted on the training part;
# fit() hands back the fitted estimator, so construction and training chain.
model = LogisticRegressionCV(
    solver="liblinear",
    cv=7,
    random_state=42,
).fit(X_train, y_train)
print(model)

LogisticRegressionCV(cv=7, random_state=42, solver='liblinear')


In [195]:
# Score the fitted model on the data it was trained on, then on the test data.
train_accuracy = model.score(X_train, y_train)
print(f"Accuracy with Training set: {train_accuracy}")
test_accuracy = model.score(X_test, y_test)
print(f"Accuracy with Test set: {test_accuracy}")

Accuracy with Training set: 0.8146067415730337
Accuracy with Test set: 0.7877094972067039
