<a href="https://colab.research.google.com/github/rawbil/models_V1/blob/main/titanic_model/csvs/model1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict Survival on the Titanic

In [65]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore') # Ignore warnings and focus on main output

In [66]:
# Load the raw Titanic passenger data and print its schema with per-column non-null counts.
df = pd.read_csv('../csvs/Titanic-Dataset.csv')
df.info()

# Columns that contain NULL values (compare the Non-Null Count column with the row total):
# - Age
# - Cabin
# - Embarked

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


'\n- Age\n- Cabin\n- Embarked\n'

In [67]:
# Count the missing (NULL/NaN) values in every column
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [68]:
# Clean missing values in one chained step (no in-place mutation):
#   1. drop the rows where Age or Embarked is NULL
#   2. replace the remaining NULL Cabin values with the placeholder "Unknown"
df = (
    df.dropna(subset=["Age", "Embarked"])
      .fillna({"Cabin": "Unknown"})
)

# Every column should now report the same non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        712 non-null    object 
 11  Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 72.3+ KB


In [69]:
# Look for fully duplicated rows.
# Only the last expression of a cell is displayed, so the count is printed explicitly.
n_duplicates = df.duplicated().sum()
print(f"Duplicate rows found: {n_duplicates}")

# Remove the duplicates (a no-op when the count above is 0)
df = df.drop_duplicates()
print(f"Shape after de-duplication: {df.shape}")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Unknown,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Unknown,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Unknown,S
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,Unknown,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Unknown,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [70]:
# Re-order the columns: identifiers first, then the features, with the target (Survived) last
new_cols = [
    'PassengerId', "Name", "Ticket",
    "Pclass", "Cabin", "Embarked",
    "SibSp", "Parch", "Age", "Fare", "Sex",
    "Survived",
]
df = df.loc[:, new_cols]
df

Unnamed: 0,PassengerId,Name,Ticket,Pclass,Cabin,Embarked,SibSp,Parch,Age,Fare,Sex,Survived
0,1,"Braund, Mr. Owen Harris",A/5 21171,3,Unknown,S,1,0,22.0,7.2500,male,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599,1,C85,C,1,0,38.0,71.2833,female,1
2,3,"Heikkinen, Miss. Laina",STON/O2. 3101282,3,Unknown,S,0,0,26.0,7.9250,female,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803,1,C123,S,1,0,35.0,53.1000,female,1
4,5,"Allen, Mr. William Henry",373450,3,Unknown,S,0,0,35.0,8.0500,male,0
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,"Rice, Mrs. William (Margaret Norton)",382652,3,Unknown,Q,0,5,39.0,29.1250,female,0
886,887,"Montvila, Rev. Juozas",211536,2,Unknown,S,0,0,27.0,13.0000,male,0
887,888,"Graham, Miss. Margaret Edith",112053,1,B42,S,0,0,19.0,30.0000,female,1
889,890,"Behr, Mr. Karl Howell",111369,1,C148,C,0,0,26.0,30.0000,male,1


In [71]:
# Convert Sex Col to Integer with an explicit mapping
# 1 - Male
# 0 - Female
# Any value outside the mapping becomes NaN, and astype(int) then raises,
# so unexpected categories fail loudly instead of being mis-encoded.
sex_as_int = df["Sex"].map({"female": 0, "male": 1}).astype(int)

# Build the encoded frame in a single expression so `df` is only rebound
# when every step succeeds:
#   - Sex replaced in place (column position preserved) by its 0/1 encoding
#   - Pclass and Embarked one-hot encoded into integer indicator columns
#   - Ticket dropped (I find the Ticket feature inconclusive)
df = (
    pd.get_dummies(
        df.assign(Sex=sex_as_int),
        columns=["Pclass", "Embarked"],
        drop_first=False,
        dtype=int,
    )
    .drop(columns=["Ticket"])
)
df

Unnamed: 0,PassengerId,Name,Cabin,SibSp,Parch,Age,Fare,Sex,Survived,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,"Braund, Mr. Owen Harris",Unknown,1,0,22.0,7.2500,1,0,0,0,1,0,0,1
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",C85,1,0,38.0,71.2833,0,1,1,0,0,1,0,0
2,3,"Heikkinen, Miss. Laina",Unknown,0,0,26.0,7.9250,0,1,0,0,1,0,0,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",C123,1,0,35.0,53.1000,0,1,1,0,0,0,0,1
4,5,"Allen, Mr. William Henry",Unknown,0,0,35.0,8.0500,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,"Rice, Mrs. William (Margaret Norton)",Unknown,0,5,39.0,29.1250,0,0,0,0,1,0,1,0
886,887,"Montvila, Rev. Juozas",Unknown,0,0,27.0,13.0000,1,0,0,1,0,0,0,1
887,888,"Graham, Miss. Margaret Edith",B42,0,0,19.0,30.0000,0,1,1,0,0,0,0,1
889,890,"Behr, Mr. Karl Howell",C148,0,0,26.0,30.0000,1,1,1,0,0,1,0,0


In [72]:
# Cabin is missing for roughly 77% of the raw rows, so remove the column entirely
del df["Cabin"]

df

Unnamed: 0,PassengerId,Name,SibSp,Parch,Age,Fare,Sex,Survived,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,"Braund, Mr. Owen Harris",1,0,22.0,7.2500,1,0,0,0,1,0,0,1
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,38.0,71.2833,0,1,1,0,0,1,0,0
2,3,"Heikkinen, Miss. Laina",0,0,26.0,7.9250,0,1,0,0,1,0,0,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,35.0,53.1000,0,1,1,0,0,0,0,1
4,5,"Allen, Mr. William Henry",0,0,35.0,8.0500,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,"Rice, Mrs. William (Margaret Norton)",0,5,39.0,29.1250,0,0,0,0,1,0,1,0
886,887,"Montvila, Rev. Juozas",0,0,27.0,13.0000,1,0,0,1,0,0,0,1
887,888,"Graham, Miss. Margaret Edith",0,0,19.0,30.0000,0,1,1,0,0,0,0,1
889,890,"Behr, Mr. Karl Howell",0,0,26.0,30.0000,1,1,1,0,0,1,0,0


In [73]:
#The target Survived should be the last column
# for col in df.columns:
#     if col == "Survived":
#         df.columns[-1] = col
# df.loc["Survived"]

# Keep only the model inputs plus the target, with Survived as the final column.
# PassengerId (a per-row identifier) and Name are left out of the selection.
updated_cols = [
    "SibSp", "Parch", "Age", "Fare", "Sex",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Embarked_C", "Embarked_Q", "Embarked_S",
    "Survived",
]
df = df.loc[:, updated_cols]
df

Unnamed: 0,SibSp,Parch,Age,Fare,Sex,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Survived
0,1,0,22.0,7.2500,1,0,0,1,0,0,1,0
1,1,0,38.0,71.2833,0,1,0,0,1,0,0,1
2,0,0,26.0,7.9250,0,0,0,1,0,0,1,1
3,1,0,35.0,53.1000,0,1,0,0,0,0,1,1
4,0,0,35.0,8.0500,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,5,39.0,29.1250,0,0,0,1,0,1,0,0
886,0,0,27.0,13.0000,1,0,1,0,0,0,1,0
887,0,0,19.0,30.0000,0,1,0,0,0,0,1,1
889,0,0,26.0,30.0000,1,1,0,0,1,0,0,1


In [74]:
# from sklearn.preprocessing import StandardScaler
# df_copy = df.copy()
# scaler = StandardScaler()
# df = pd.DataFrame(scaler.fit_transform(df_copy), columns=updated_cols[:-1])
# df

In [75]:
# Divide the dataset into training (60%), validation (20%) and testing (20%) datasets.
# The rows are shuffled once with a fixed seed, then sliced positionally with iloc.
# (np.split on a DataFrame goes through the deprecated DataFrame.swapaxes path,
# so the cut points are applied directly instead.)
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
validate_end = int(0.8 * len(shuffled))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]
print(len(train))


427


In [76]:
# # Scale and oversample
# from sklearn.preprocessing import StandardScaler
# from imblearn.over_sampling import RandomOverSampler
# train_df = train[train.columns[:-1]].values
# 
# 
# def scale_dataset(dataframe, train=train_df, sampler=False, fit=False):
#     X = dataframe[dataframe.columns[:-1]].values
#     y = dataframe[dataframe.columns[-1]].values
#     
#     scaler = StandardScaler()
#     if fit:
#         X = scaler.fit_transform(X)
#     else:
#         scaler.fit_transform(train)
#         X = scaler.transform(X)
#     
#     if sampler:
#         ros = RandomOverSampler()
#         X, y = ros.fit_resample(X,y)
#     
#     # Combine X and y into a single array
#     data = np.hstack((X, np.reshape(y, (-1, 1))))
#     return data, X, y

In [77]:
# train, X_train, y_train = scale_dataset(train, sampler=True, fit=True)
# validate, X_validate, y_validate = scale_dataset(validate)
# test, X_test, y_test = scale_dataset(test)


# Instead of manually scaling and sampling, we can use the imblearn pipeline when we want to include overSampling, or the sklearn pipeline without sampling

In [78]:
# Separate each split into a feature matrix (every column but the last)
# and a target vector (the last column), both as NumPy arrays.
X_train, y_train = train.iloc[:, :-1].to_numpy(), train.iloc[:, -1].to_numpy()

X_validate, y_validate = validate.iloc[:, :-1].to_numpy(), validate.iloc[:, -1].to_numpy()

X_test, y_test = test.iloc[:, :-1].to_numpy(), test.iloc[:, -1].to_numpy()

In [79]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Model pipeline: the steps are applied sequentially -- standardise the features,
# oversample with RandomOverSampler, then classify with k-nearest neighbours.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('sampler', RandomOverSampler(random_state=42)),
    ('knn', KNeighborsClassifier()),
])

# Hyper-parameters to search; the `knn__` prefix routes each one to the 'knn' step
param_grid = dict(
    knn__n_neighbors=[3, 5, 7, 9, 11],
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2],  # [Manhattan, Euclidean]
)

# Shuffled, seeded stratified folds for the cross-validated search
cv = StratifiedKFold(shuffle=True, random_state=42)

# Candidates are ranked by recall (alternatives: accuracy, f1, roc_auc)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="recall",
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Score the best pipeline found by the search on the validation split
best = grid_search.best_estimator_
y_pred = best.predict(X_validate)

print("Best Parameters: ", grid_search.best_params_)
print(
    classification_report(y_validate, y_pred),
    confusion_matrix(y_validate, y_pred),
    sep="\n",
)

Best Parameters:  {'knn__n_neighbors': 3, 'knn__p': 1, 'knn__weights': 'uniform'}
              precision    recall  f1-score   support

           0       0.79      0.76      0.78        84
           1       0.67      0.71      0.69        58

    accuracy                           0.74       142
   macro avg       0.73      0.73      0.73       142
weighted avg       0.74      0.74      0.74       142

[[64 20]
 [17 41]]


In [80]:
# Evaluate the selected pipeline on the held-out test split
y_pred = best.predict(X_test)
print("Best Parameters: ", grid_search.best_params_)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

Best Parameters:  {'knn__n_neighbors': 3, 'knn__p': 1, 'knn__weights': 'uniform'}
              precision    recall  f1-score   support

           0       0.76      0.79      0.77        84
           1       0.68      0.64      0.66        59

    accuracy                           0.73       143
   macro avg       0.72      0.71      0.72       143
weighted avg       0.73      0.73      0.73       143

[[66 18]
 [21 38]]
