In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Install a blanket "ignore" filter so that no warning of any category is shown in later cells.
warnings.simplefilter("ignore")

In [170]:
# Read the Titanic passenger table from the CSV file and display it.
csv_path = "../csvs/Titanic-Dataset.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [171]:
# Count the missing values in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [172]:
# Remove the rows that have no Age or Embarked value, then drop the Name, PassengerId,
# Ticket and Cabin columns.
# The result is reassigned instead of mutated with inplace=True, and errors="ignore" is
# passed to drop(), so running this cell a second time on its own output does not raise
# KeyError for columns that are already gone.
df = df.dropna(subset=["Age", "Embarked"]).drop(
    columns=["Name", "PassengerId", "Ticket", "Cabin"], errors="ignore"
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Sex       712 non-null    object 
 3   Age       712 non-null    float64
 4   SibSp     712 non-null    int64  
 5   Parch     712 non-null    int64  
 6   Fare      712 non-null    float64
 7   Embarked  712 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 50.1+ KB


In [173]:
# Turn the categorical Sex column into a single 0/1 indicator
# (drop_first=True discards the dummy column of the first category).
sex_indicator = pd.get_dummies(df["Sex"], drop_first=True, dtype=int)
df["Sex"] = sex_indicator
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.9250,S
3,1,1,0,35.0,1,0,53.1000,S
4,0,3,1,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
885,0,3,0,39.0,0,5,29.1250,Q
886,0,2,1,27.0,0,0,13.0000,S
887,1,1,0,19.0,0,0,30.0000,S
889,1,1,1,26.0,0,0,30.0000,C


In [174]:
# List the distinct values found in the Embarked column.
df.loc[:, "Embarked"].unique()

array(['S', 'C', 'Q'], dtype=object)

In [175]:
# One-hot encode Embarked into integer indicator columns, keeping one column per category
# (drop_first=False).
df = pd.get_dummies(
    data=df,
    columns=["Embarked"],
    drop_first=False,
    dtype=int,
)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.2500,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.9250,0,0,1
3,1,1,0,35.0,1,0,53.1000,0,0,1
4,0,3,1,35.0,0,0,8.0500,0,0,1
...,...,...,...,...,...,...,...,...,...,...
885,0,3,0,39.0,0,5,29.1250,0,1,0
886,0,2,1,27.0,0,0,13.0000,0,0,1
887,1,1,0,19.0,0,0,30.0000,0,0,1
889,1,1,1,26.0,0,0,30.0000,1,0,0


In [176]:
# Reorder the columns so that the features come first and the Survived target is last;
# later cells rely on the target being the final column.
cols = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]
cols += ["Embarked_C", "Embarked_Q", "Embarked_S"]
cols.append("Survived")
df = df.loc[:, cols]
df

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Age,Embarked_C,Embarked_Q,Embarked_S,Survived
0,3,1,1,0,7.2500,22.0,0,0,1,0
1,1,0,1,0,71.2833,38.0,1,0,0,1
2,3,0,0,0,7.9250,26.0,0,0,1,1
3,1,0,1,0,53.1000,35.0,0,0,1,1
4,3,1,0,0,8.0500,35.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
885,3,0,0,5,29.1250,39.0,0,1,0,0
886,2,1,0,0,13.0000,27.0,0,0,1,0
887,1,0,0,0,30.0000,19.0,0,0,1,1
889,1,1,0,0,30.0000,26.0,1,0,0,1


In [177]:
# Split the dataset into training (60%), validation (20%) and testing (20%).
# The shuffled frame is sliced with .iloc at the same cut points the np.split call used.
# This keeps every partition a DataFrame and avoids np.split's reliance on
# DataFrame.swapaxes, which pandas has deprecated.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
validate_end = int(0.8 * len(shuffled))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Sanity check: the three partitions cover every row exactly once.
assert len(train) + len(validate) + len(test) == len(df)

In [178]:
# Separate each partition into a feature matrix (every column except the last)
# and a target vector (the last column), both as NumPy arrays.
X_train, y_train = train.iloc[:, :-1].to_numpy(), train.iloc[:, -1].to_numpy()

X_validate, y_validate = validate.iloc[:, :-1].to_numpy(), validate.iloc[:, -1].to_numpy()

X_test, y_test = test.iloc[:, :-1].to_numpy(), test.iloc[:, -1].to_numpy()

In [179]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

In [184]:
# Standardize, oversample and fit the model in one pipeline, then tune it with a
# cross-validated grid search on the training set and report on the validation set.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("sampler", RandomOverSampler(random_state=42)),
        # penalty="elasticnet" with solver="saga" is required for l1_ratio.
        # saga shuffles the data while fitting, so random_state is fixed; without it the
        # selected parameters can differ from one run of this cell to the next.
        (
            "lr",
            LogisticRegression(
                penalty="elasticnet", solver="saga", max_iter=5000, random_state=42
            ),
        ),
    ]  # Always follow this order: scale->sample->model
)

param_grid = {
    # l1_ratio=0.0 is a pure L2 penalty, 1.0 is a pure L1 penalty, 0.5 is an equal mix.
    "lr__l1_ratio": [0.0, 1.0, 0.5],
    "lr__C": [0.1, 1.0, 10.0],
    # NOTE(review): the sampler step already rebalances the classes, so "balanced"
    # is presumably redundant with None here — confirm before trimming the grid.
    "lr__class_weight": [None, "balanced"],
}

cv = StratifiedKFold(shuffle=True, random_state=42)

# Candidates are ranked by recall; scoring="f1" is the alternative considered.
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring="recall")
# Fit the training dataset
grid_search.fit(X_train, y_train)

# Evaluate the refit best pipeline on the validation partition.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_validate)

print("Best Parameters: ", grid_search.best_params_)
print(classification_report(y_validate, y_pred))
print(confusion_matrix(y_validate, y_pred))

Best Parameters:  {'lr__C': 1.0, 'lr__class_weight': None, 'lr__l1_ratio': 0.0}
              precision    recall  f1-score   support

           0       0.88      0.85      0.86        84
           1       0.79      0.83      0.81        58

    accuracy                           0.84       142
   macro avg       0.83      0.84      0.83       142
weighted avg       0.84      0.84      0.84       142

[[71 13]
 [10 48]]


In [None]:
# Score the tuned pipeline on the held-out test partition and print the same
# summary that was shown for the validation partition.
y_pred = best_model.predict(X_test)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Best Parameters: ", grid_search.best_params_)
print(test_report)
print(test_confusion)

Best Parameters:  {'lr__C': 10.0, 'lr__class_weight': None, 'lr__l1_ratio': 0.0}
              precision    recall  f1-score   support

           0       0.80      0.81      0.80        84
           1       0.72      0.71      0.72        59

    accuracy                           0.77       143
   macro avg       0.76      0.76      0.76       143
weighted avg       0.77      0.77      0.77       143

[[68 16]
 [17 42]]


- For a Titanic survival model, a False Negative (a passenger who actually survived but is predicted to have died) is worse than a False Positive, so we want fewer FN than FP. That is not the case on the test set (FN = 17 vs. FP = 16), so we look for other ways to improve the model, or compare it with other models (KNN in this case).