# Train models

In [13]:
#libraries
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

#file paths
notebook_dir = Path.cwd()
project_dir = notebook_dir.parent
train_path = project_dir / 'data' / 'processed' / 'train_processed.csv'
models_dir = project_dir / 'models'

### Prepare data for model training

In [14]:
#read data
train = pd.read_csv(train_path)

#define the features
feature_cols = ['Sex_numeric','Age', 'Pclass','Fare','FarePerPerson','HasCabin',
                #Family size
                'Alone','Small family','Large family',
                #Title
                'Title__Master','Title__Miss', 'Title__Mr', 'Title__Mrs',
                #Embarked
                'Embarked__C', 'Embarked__Q', 'Embarked__S',
                #Age Group
                'Age_Adult', 'Age_Child', 'Age_Elderly', 'Age_Teen',
                #Deck
                'Deck_A', 'Deck_B', 'Deck_C',	'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_Unknown',
                'isFemaleFirstClass', 'isMaleThirdClass'               
                ]

X = train[feature_cols] #features
Y = train['Survived'] #target

#train/validation split
X_train, X_val, Y_train, Y_val = train_test_split(
    X,Y,
    test_size=0.2,
    random_state=42
)

#data scaling (Standard Scaler)
scaler = StandardScaler()
scaler.fit(X_train)

#update data after scaling
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

#save scaler
with open(models_dir / 'scaler.pkl','wb') as file:
    pickle.dump(scaler,file)
    
#save features
with open(models_dir / 'features.pkl','wb') as file:
    pickle.dump(feature_cols,file)

### 1. Logistic Regression

In [15]:
#model training - Logistic Regression
lr_model = LogisticRegression(max_iter=100)
lr_model.fit(X_train_scaled,Y_train)

#eval
lr_predictions = lr_model.predict(X_val_scaled)
lr_accuracy = accuracy_score(Y_val, lr_predictions)

print(f"Logistic Regression Accuracy: {lr_accuracy:.2%}")

#print feature name and wage pairs, print bias
feature_weight = sorted(list(zip(feature_cols,lr_model.coef_[0])),key=lambda x: abs(x[1]),reverse=True)
for name,wage in feature_weight:
    print(f'{name:<15}: {wage:+.3f}')
print("Bias:", lr_model.intercept_)

#save model
with open(models_dir / 'lr_model.pkl','wb') as file:
    pickle.dump(lr_model,file)


Logistic Regression Accuracy: 81.01%
Pclass         : -1.189
Sex_numeric    : +0.929
isMaleThirdClass: +0.917
Title__Mr      : -0.755
Large family   : -0.720
Title__Mrs     : +0.646
Deck_E         : +0.337
Alone          : +0.309
Age            : -0.288
Title__Master  : +0.279
Deck_D         : +0.246
Fare           : +0.227
Title__Miss    : +0.216
Deck_B         : +0.188
isFemaleFirstClass: -0.160
Age_Child      : +0.110
Deck_F         : +0.103
Embarked__S    : -0.100
Deck_G         : -0.094
Age_Adult      : -0.093
Embarked__C    : +0.089
Small family   : +0.073
Deck_C         : +0.070
Age_Elderly    : +0.057
Deck_Unknown   : +0.039
HasCabin       : -0.039
Embarked__Q    : +0.036
Age_Teen       : -0.027
Deck_A         : +0.024
FarePerPerson  : -0.011
Bias: [-0.59492512]


### 2. Random Forrest

In [16]:
#model training - Random Forrest (no scaling)
rf_model = RandomForestClassifier(
    n_estimators=500,      # num of trees
    max_depth=10,          # max tree depth
    min_samples_split=5,   # min samples to split
    random_state=42
)

rf_model.fit(X_train,Y_train)

cv_scores = cross_val_score(rf_model, X_train, Y_train, cv=5, scoring='accuracy')
print(f"RF - CV mean: {cv_scores.mean():.2%}, standard deviation: {cv_scores.std():.2%}")


rf_predictions = rf_model.predict(X_val)
rf_accuracy = accuracy_score(Y_val,rf_predictions)

print(f"\nRandom Forrest Accuracy: {rf_accuracy:.2%}")
feature_importance = sorted(list(zip(feature_cols, rf_model.feature_importances_)),
                           key=lambda x: x[1], reverse=True)
for name, importance in feature_importance:
    print(f'{name:<15}: {importance*100:.3f}')

#save model
with open(models_dir / 'rf_model.pkl','wb') as file:
    pickle.dump(rf_model,file)

RF - CV mean: 83.00%, standard deviation: 1.16%

Random Forrest Accuracy: 83.24%
Title__Mr      : 14.129
Sex_numeric    : 13.001
FarePerPerson  : 11.184
Fare           : 10.184
Age            : 9.238
Pclass         : 5.421
Title__Mrs     : 4.932
isFemaleFirstClass: 4.284
isMaleThirdClass: 4.010
Large family   : 3.260
Title__Miss    : 3.057
Small family   : 2.472
Deck_Unknown   : 2.057
HasCabin       : 2.007
Embarked__S    : 1.331
Age_Child      : 1.228
Embarked__C    : 1.161
Alone          : 1.129
Age_Adult      : 0.914
Title__Master  : 0.802
Deck_E         : 0.800
Embarked__Q    : 0.622
Deck_B         : 0.581
Deck_D         : 0.533
Deck_C         : 0.490
Age_Teen       : 0.359
Age_Elderly    : 0.281
Deck_A         : 0.215
Deck_F         : 0.163
Deck_G         : 0.155


### 3. XGB Boost

In [17]:
#model training - XGBoost
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)
xgb_model.fit(X_train, Y_train)
xgb_predictions = xgb_model.predict(X_val)
xgb_accuracy = accuracy_score(Y_val,xgb_predictions)
print(xgb_accuracy)

#save model
with open(models_dir / 'xgb_model.pkl','wb') as file:
    pickle.dump(xgb_model,file)

0.7988826815642458


### 4. MLP from scratch

In [18]:
#model training - MLP from scratch
import sys
sys.path.append(str(models_dir))
from models.mlp_from_scratch import Network

mlp_model = Network([len(feature_cols),32,16,1])

history = mlp_model.train(
    X_train_scaled,
    Y_train.values,
    epochs=500,
    learning_rate=0.001,
    verbose=True
)


mlp_accuracy = mlp_model.evaluate(X_val_scaled, Y_val.values)
print(f"MLP Accuracy: {mlp_accuracy:.2%}")

mlp_model.save_weights(models_dir / 'mlp_model_32_16_1.pkl')

Epoch 0/500, Loss: 0.240178
Epoch 10/500, Loss: 0.118372
Epoch 20/500, Loss: 0.115932
Epoch 30/500, Loss: 0.113586
Epoch 40/500, Loss: 0.111515
Epoch 50/500, Loss: 0.109663
Epoch 60/500, Loss: 0.107727
Epoch 70/500, Loss: 0.106458
Epoch 80/500, Loss: 0.105627
Epoch 90/500, Loss: 0.104964
Epoch 100/500, Loss: 0.104306
Epoch 110/500, Loss: 0.103564
Epoch 120/500, Loss: 0.102764
Epoch 130/500, Loss: 0.101944
Epoch 140/500, Loss: 0.100730
Epoch 150/500, Loss: 0.099692
Epoch 160/500, Loss: 0.098957
Epoch 170/500, Loss: 0.098267
Epoch 180/500, Loss: 0.097576
Epoch 190/500, Loss: 0.096944
Epoch 200/500, Loss: 0.096367
Epoch 210/500, Loss: 0.095818
Epoch 220/500, Loss: 0.095235
Epoch 230/500, Loss: 0.094192
Epoch 240/500, Loss: 0.092420
Epoch 250/500, Loss: 0.091499
Epoch 260/500, Loss: 0.090764
Epoch 270/500, Loss: 0.090102
Epoch 280/500, Loss: 0.089470
Epoch 290/500, Loss: 0.088857
Epoch 300/500, Loss: 0.088250
Epoch 310/500, Loss: 0.087658
Epoch 320/500, Loss: 0.087101
Epoch 330/500, Loss: 