In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

In [27]:
df = pd.read_csv("../csvs/Titanic-Dataset.csv")

In [28]:
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [29]:
df.dropna(inplace=True)
df.drop(["Name", "PassengerId", "Ticket", "Cabin"], axis=1, inplace=True)

In [30]:
# Convert Categorical columns to Numerical
df["Sex"] = pd.get_dummies(df["Sex"], dtype=int, drop_first=True)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,1,0,38.0,1,0,71.2833,C
3,1,1,0,35.0,1,0,53.1000,S
6,0,1,1,54.0,0,0,51.8625,S
10,1,3,0,4.0,1,1,16.7000,S
11,1,1,0,58.0,0,0,26.5500,S
...,...,...,...,...,...,...,...,...
871,1,1,0,47.0,1,1,52.5542,S
872,0,1,1,33.0,0,0,5.0000,S
879,1,1,0,56.0,0,1,83.1583,C
887,1,1,0,19.0,0,0,30.0000,S


In [31]:
df["Embarked"].unique()

array(['C', 'S', 'Q'], dtype=object)

In [32]:
df = pd.get_dummies(df, columns=["Embarked"], dtype=int, drop_first=False)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
1,1,1,0,38.0,1,0,71.2833,1,0,0
3,1,1,0,35.0,1,0,53.1000,0,0,1
6,0,1,1,54.0,0,0,51.8625,0,0,1
10,1,3,0,4.0,1,1,16.7000,0,0,1
11,1,1,0,58.0,0,0,26.5500,0,0,1
...,...,...,...,...,...,...,...,...,...,...
871,1,1,0,47.0,1,1,52.5542,0,0,1
872,0,1,1,33.0,0,0,5.0000,0,0,1
879,1,1,0,56.0,0,1,83.1583,1,0,0
887,1,1,0,19.0,0,0,30.0000,0,0,1


In [33]:
cols = [
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
    "Fare",
    "Age",
    "Embarked_C",
    "Embarked_Q",
    "Embarked_S",
    "Survived",
]
df = df[cols]
df

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Age,Embarked_C,Embarked_Q,Embarked_S,Survived
1,1,0,1,0,71.2833,38.0,1,0,0,1
3,1,0,1,0,53.1000,35.0,0,0,1,1
6,1,1,0,0,51.8625,54.0,0,0,1,0
10,3,0,1,1,16.7000,4.0,0,0,1,1
11,1,0,0,0,26.5500,58.0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
871,1,0,1,1,52.5542,47.0,0,0,1,1
872,1,1,0,0,5.0000,33.0,0,0,1,0
879,1,0,0,1,83.1583,56.0,1,0,0,1
887,1,0,0,0,30.0000,19.0,0,0,1,1


In [34]:
# Split the dataset into training, validation and testing
train, validate, test = np.split(
    df.sample(frac=1, random_state=42), [int(0.6 * len(df)), int(0.8 * len(df))]
)

In [35]:
# Split the datasets into features and targets
X_train = train[train.columns[:-1]].values
y_train = train[train.columns[-1]].values

X_validate = validate[validate.columns[:-1]].values
y_validate = validate[validate.columns[-1]].values

X_test = test[test.columns[:-1]].values
y_test = test[test.columns[-1]].values

In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Standardize, Oversample and use a pipeline to fit and train the model
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("sampler", RandomOverSampler()),
        # These settings are required for l1_ratio
        ("lr", LogisticRegression(penalty='elasticnet', solver='saga', max_iter=5000)),
    ] # Always follow this order: scale->sample->model
)

param_grid = {
    "lr__l1_ratio": [0.0, 1.0, 0.5], # 'l1', 'l2', 'elasticnet'
    "lr__C": [0.1, 1.0, 10.0],
    "lr__class_weight": [None, "balanced"]
    
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy") #scoring=f1
# Fit the training dataset
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_validate)

print("Best Parameters: ", grid_search.best_params_)
print(classification_report(y_validate, y_pred))

Best Parameters:  {'lr__C': 0.1, 'lr__class_weight': None, 'lr__l1_ratio': 0.5}
              precision    recall  f1-score   support

           0       0.58      0.64      0.61        11
           1       0.84      0.81      0.82        26

    accuracy                           0.76        37
   macro avg       0.71      0.72      0.72        37
weighted avg       0.76      0.76      0.76        37



In [38]:
y_pred = best_model.predict(X_test)
print("Best Parameters: ", grid_search.best_params_)
print(classification_report(y_test, y_pred))

Best Parameters:  {'lr__C': 0.1, 'lr__class_weight': None, 'lr__l1_ratio': 0.5}
              precision    recall  f1-score   support

           0       0.38      0.71      0.50         7
           1       0.92      0.73      0.81        30

    accuracy                           0.73        37
   macro avg       0.65      0.72      0.66        37
weighted avg       0.82      0.73      0.76        37

