# Train models

In [18]:
#libraries
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

#project locations, all resolved relative to the notebook's working directory
notebook_dir = Path.cwd()
project_dir = notebook_dir.parent
#processed training set consumed by the data-preparation cell
train_path = project_dir.joinpath('data', 'processed', 'train_processed.csv')
#destination folder for the fitted scaler, feature list and trained models
models_dir = project_dir.joinpath('models')

### Prepare data for model training

In [27]:
#read data
train = pd.read_csv(train_path)

#define the features (this exact list, in this order, is persisted to features.pkl below)
feature_cols = ['Sex_numeric','Age', 'Pclass','Fare','FarePerPerson','HasCabin',
                #Family size
                'Alone','Small family','Large family',
                #Title
                'Title__Master','Title__Miss', 'Title__Mr', 'Title__Mrs',
                #Embarked
                'Embarked__C', 'Embarked__Q', 'Embarked__S',
                #Age Group
                'Age_Adult', 'Age_Child', 'Age_Elderly', 'Age_Teen',
                #Deck
                'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_Unknown'
                ]

X = train[feature_cols] #features
Y = train['Survived'] #target

#train/validation split (80/20, fixed seed so the split is reproducible)
X_train, X_val, Y_train, Y_val = train_test_split(
    X,Y,
    test_size=0.2,
    random_state=42
)

#data scaling (Standard Scaler) - fitted on the training split only,
#then the same fitted transform is applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)

#update data after scaling
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

#make sure the output folder exists before writing any artifact;
#open(..., 'wb') does not create missing parent directories
models_dir.mkdir(parents=True, exist_ok=True)

#save scaler
with open(models_dir / 'scaler.pkl','wb') as file:
    pickle.dump(scaler,file)

#save features
with open(models_dir / 'features.pkl','wb') as file:
    pickle.dump(feature_cols,file)

### 1. Logistic Regression

In [None]:
#model training - Logistic Regression (fitted on the standardized features)
lr_model = LogisticRegression(max_iter=100)
lr_model.fit(X_train_scaled, Y_train)

#eval on the held-out validation split
lr_predictions = lr_model.predict(X_val_scaled)
lr_accuracy = accuracy_score(Y_val, lr_predictions)

print(f"Logistic Regression Accuracy: {lr_accuracy:.2%}")

#print (feature, weight) pairs ordered by absolute weight, largest first, then the bias
feature_weight = list(zip(feature_cols, lr_model.coef_[0]))
feature_weight.sort(key=lambda pair: abs(pair[1]), reverse=True)
for feature_name, weight in feature_weight:
    print(f'{feature_name:<15}: {weight:+.3f}')
print("Bias:", lr_model.intercept_)

#save model
with open(models_dir / 'lr_model.pkl', 'wb') as model_file:
    pickle.dump(lr_model, model_file)


### 2. Random Forest

In [28]:
#model training - Random Forest (fitted on the raw, unscaled features)
rf_model = RandomForestClassifier(
    n_estimators=500,      # trees in the ensemble
    max_depth=10,          # depth cap per tree
    min_samples_split=5,   # min samples a node needs to be split
    random_state=42,
)
rf_model.fit(X_train, Y_train)

#5-fold cross-validated accuracy computed on the training split
cv_scores = cross_val_score(rf_model, X_train, Y_train, scoring='accuracy', cv=5)
print(f"RF - CV mean: {cv_scores.mean():.2%}, standard deviation: {cv_scores.std():.2%}")

#accuracy on the held-out validation split
rf_predictions = rf_model.predict(X_val)
rf_accuracy = accuracy_score(Y_val, rf_predictions)
print(f"\nRandom Forrest Accuracy: {rf_accuracy:.2%}")

#feature importances, largest first, printed as percentages
feature_importance = list(zip(feature_cols, rf_model.feature_importances_))
feature_importance.sort(key=lambda pair: pair[1], reverse=True)
for feature_name, score in feature_importance:
    print(f'{feature_name:<15}: {score*100:.3f}')

#save model
with open(models_dir / 'rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

RF - CV mean: 82.86%, standard deviation: 1.41%

Random Forrest Accuracy: 83.80%
Title__Mr      : 16.663
Sex_numeric    : 13.521
FarePerPerson  : 11.602
Fare           : 10.993
Age            : 9.641
Pclass         : 5.905
Title__Mrs     : 5.575
Title__Miss    : 4.050
Large family   : 3.146
Small family   : 2.757
Deck_Unknown   : 2.459
HasCabin       : 2.186
Embarked__S    : 1.308
Age_Child      : 1.306
Embarked__C    : 1.303
Alone          : 1.270
Deck_E         : 0.984
Age_Adult      : 0.898
Title__Master  : 0.699
Deck_B         : 0.688
Embarked__Q    : 0.645
Deck_C         : 0.605
Deck_D         : 0.540
Age_Teen       : 0.405
Age_Elderly    : 0.292
Deck_A         : 0.220
Deck_G         : 0.180
Deck_F         : 0.139
Deck_T         : 0.019


### 3. XGBoost

In [None]:
#model training - XGBoost (fitted on the raw, unscaled features)
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)
xgb_model.fit(X_train, Y_train)

#eval on the held-out validation split
xgb_predictions = xgb_model.predict(X_val)
xgb_accuracy = accuracy_score(Y_val,xgb_predictions)

#labeled percentage, matching how the other model cells report accuracy
print(f"XGBoost Accuracy: {xgb_accuracy:.2%}")

#save model
with open(models_dir / 'xgb_model.pkl','wb') as file:
    pickle.dump(xgb_model,file)

### 4. MLP from scratch

In [7]:
#model training - MLP from scratch
import sys

#`models.mlp_from_scratch` is imported by package name, so the directory that
#CONTAINS the `models` folder (project_dir) has to be on sys.path; having only
#models_dir there does not make the `models` package itself importable.
#models_dir is still added for backward compatibility, and the membership check
#keeps sys.path free of duplicates when the cell is re-run.
for extra_path in (str(project_dir), str(models_dir)):
    if extra_path not in sys.path:
        sys.path.append(extra_path)
from models.mlp_from_scratch import Network

#layer sizes handed to Network: number of input features, 16, 1
#(presumably hidden and output widths - confirm in mlp_from_scratch)
mlp_model = Network([len(feature_cols),16,1])

#train on the standardized training split; the target is passed as a plain array
history = mlp_model.train(
    X_train_scaled,
    Y_train.values,
    epochs=500,
    learning_rate=0.001,
    verbose=True
)

#evaluate on the standardized validation split
mlp_accuracy = mlp_model.evaluate(X_val_scaled, Y_val.values)
print(f"MLP Accuracy: {mlp_accuracy:.2%}")

#save model weights
mlp_model.save_weights(models_dir / 'mlp_model_16_1.pkl')

Epoch 0/500, Loss: 0.232246
Epoch 10/500, Loss: 0.123486
Epoch 20/500, Loss: 0.121964
Epoch 30/500, Loss: 0.121099
Epoch 40/500, Loss: 0.120425
Epoch 50/500, Loss: 0.119829
Epoch 60/500, Loss: 0.119254
Epoch 70/500, Loss: 0.118670
Epoch 80/500, Loss: 0.118057
Epoch 90/500, Loss: 0.117390
Epoch 100/500, Loss: 0.116638
Epoch 110/500, Loss: 0.115818
Epoch 120/500, Loss: 0.114979
Epoch 130/500, Loss: 0.114146
Epoch 140/500, Loss: 0.113319
Epoch 150/500, Loss: 0.112519
Epoch 160/500, Loss: 0.111757
Epoch 170/500, Loss: 0.111043
Epoch 180/500, Loss: 0.110375
Epoch 190/500, Loss: 0.109752
Epoch 200/500, Loss: 0.109173
Epoch 210/500, Loss: 0.108636
Epoch 220/500, Loss: 0.108136
Epoch 230/500, Loss: 0.107670
Epoch 240/500, Loss: 0.107232
Epoch 250/500, Loss: 0.106809
Epoch 260/500, Loss: 0.106367
Epoch 270/500, Loss: 0.105920
Epoch 280/500, Loss: 0.105472
Epoch 290/500, Loss: 0.105024
Epoch 300/500, Loss: 0.104575
Epoch 310/500, Loss: 0.104137
Epoch 320/500, Loss: 0.103706
Epoch 330/500, Loss: 