# Titanic ML: Training the Model

In [488]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [489]:
# importing datasets
train_data = pd.read_csv('data/train_cleaned.csv')
test_data = pd.read_csv('data/test_cleaned.csv')
train_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [490]:
orig_features = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'male', 'Q', 'S']
train_data['Family'] = train_data['SibSp']+train_data['Parch'] + 1
train_data = train_data[orig_features + ['Family']]
train_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Family
0,0,3,22.0,1,0,7.25,1,0,1,2
1,1,1,38.0,1,0,71.2833,0,0,0,2
2,1,3,26.0,0,0,7.925,0,0,1,1
3,1,1,35.0,1,0,53.1,0,0,1,2
4,0,3,35.0,0,0,8.05,1,0,1,1


In [491]:
train_data.drop(['SibSp', 'Parch', 'Fare', 'Q', 'S'], axis=1, inplace=True)

In [492]:
train_data.head()

Unnamed: 0,Survived,Pclass,Age,male,Family
0,0,3,22.0,1,2
1,1,1,38.0,0,2
2,1,3,26.0,0,1
3,1,1,35.0,0,2
4,0,3,35.0,1,1


In [493]:
# extract the features and target for training and validation
X = train_data.iloc[:, 1:5].values
y = train_data.iloc[:, 0].values

In [494]:
# split the training samples for training and validation
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

In [430]:
# Feature Scaling
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_val = sc.transform(X_val)

## 1) Using Logistic Regression

In [495]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(solver='liblinear', max_iter=4000, verbose=5)
lr_model.fit(X_train, y_train)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=4000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=5,
                   warm_start=False)

In [496]:
# make predictions
y_pred = lr_model.predict(X_val)

In [497]:
# model evaluation
from sklearn.metrics import classification_report, confusion_matrix
print('Classification report:\n', classification_report(y_val, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_val, y_pred))

Classification report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84       109
           1       0.75      0.75      0.75        69

    accuracy                           0.81       178
   macro avg       0.80      0.80      0.80       178
weighted avg       0.81      0.81      0.81       178

Confusion matrix:
 [[92 17]
 [17 52]]


In [498]:
from sklearn.metrics import mean_absolute_error
error = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error, MAE: {error: .4f}')

Mean Absolute Error, MAE:  0.1910


In [499]:
# accuracy
accuracy = 100 - 100*error
print(f'Accuracy in %: {accuracy: .2f}')

Accuracy in %:  80.90


In [500]:
# evaluate the model using R2-score
r2 = lr_model.score(X_train, y_train)
print( f"R2-score: {r2:.4f}" )

R2-score: 0.8059


### Searching for Best Parameters

In [501]:
from sklearn.metrics import accuracy_score

C_param_range = [0.001,0.05,0.1,0.5,1,5,10,50,100]

acc_table = pd.DataFrame(columns = ['C_parameter','Accuracy'])
acc_table['C_parameter'] = C_param_range

j=0
for i in C_param_range:
    
    # Apply logistic regression model to training data
    lr = LogisticRegression(penalty = 'l2', C = i,random_state = 0, solver='liblinear', max_iter=1000)
    lr.fit(X_train, y_train)
    
    # Predict using model
    y_pred = lr.predict(X_val)
    
    # Saving accuracy score in table
    acc_table.iloc[j,1] = accuracy_score(y_val, y_pred)
    j += 1  
    

In [502]:
acc_table


Unnamed: 0,C_parameter,Accuracy
0,0.001,0.61236
1,0.05,0.803371
2,0.1,0.797753
3,0.5,0.797753
4,1.0,0.808989
5,5.0,0.808989
6,10.0,0.803371
7,50.0,0.803371
8,100.0,0.803371


### Using GridSearch

In [503]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']}
clf = GridSearchCV(LogisticRegression(solver='liblinear', max_iter=1000, random_state=0), param_grid, cv=5)
#GridSearchCV(cv=5, estimator=LogisticRegression(C=1.0, intercept_scaling=1, dual=False, fit_intercept=True, penalty='l2', \
#                                                tol=0.0001),param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]})
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [504]:
clf.best_params_

{'C': 1, 'penalty': 'l1'}

In [505]:
y_pred = clf.predict(X_val)
error = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error, MAE: {error: .4f}')

Mean Absolute Error, MAE:  0.1910


In [506]:
# accuracy
accuracy = 100 - 100*error
print(f'Accuracy in %: {accuracy: .2f}')

Accuracy in %:  80.90


## 2) Using Random Forest Regression

In [507]:
from sklearn.ensemble import RandomForestRegressor
# rf = RandomForestRegressor(n_estimators=160, min_samples_leaf=3, max_features=0.5, oob_score=True, random_state=42)
rf = RandomForestRegressor(n_estimators = 2000, min_samples_split=10, min_samples_leaf=1, max_features='sqrt', \
                                   max_depth=10, random_state = 42, oob_score=True)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=10,
                      min_weight_fraction_leaf=0.0, n_estimators=2000,
                      n_jobs=None, oob_score=True, random_state=42, verbose=0,
                      warm_start=False)

In [508]:
# evaluate the model using R2-score
r2 = rf.score(X_train, y_train)
print( f"R2-score: {r2:.4f}" )

R2-score: 0.6159


In [509]:
# check the model generality using the out-of-the-bag score
oob_r2 = rf.oob_score_
print(f"OOB score {oob_r2:.4f}")

OOB score 0.4349


In [510]:
# verify the mean absolute error
from sklearn.metrics import mean_absolute_error
y_pred = rf.predict(X_val)
error = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error, MAE: {error: .4f}')

Mean Absolute Error, MAE:  0.2399


In [511]:
# accuracy
accuracy = 100 - 100*error
print(f'Accuracy in %: {accuracy: .2f}')

Accuracy in %:  76.01


### Random Search Training

In [416]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [417]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.7min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [418]:
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [419]:
# accuracy
rf_random_pred = rf_random.predict(X_val)
rf_random_error = mean_absolute_error(y_val, rf_random_pred)
print(f'RF random error: {rf_random_error: .4f}')
accuracy = 100 - 100*rf_random_error
print(f'Accuracy in %: {accuracy: .2f}')

RF random error:  0.2623
Accuracy in %:  73.77


### Grid Search with Cross-validation

In [420]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [1, 2, 3],
    'min_samples_leaf': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 400]
}
# Create a based model
rf_cv = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf_cv, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 493 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 1141 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1586 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1944 out of 1944 | elapsed:  1.8min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth':

In [421]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 100}

In [422]:
# accuracy
grid_search_pred = grid_search.predict(X_val)
grid_search_error = mean_absolute_error(y_val, grid_search_pred)
print(f'Grid Search error: {grid_search_error: .4f}')
accuracy = 100 - 100*grid_search_error
print(f'Accuracy in %: {accuracy: .2f}')

Grid Search error:  0.2578
Accuracy in %:  74.22


## 3) Using Random Forest Classifier

In [512]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 2000, min_samples_split=10, min_samples_leaf=1, max_features='sqrt', \
                                   max_depth=10, random_state = 42, oob_score=True)
rfc.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [513]:
# verify the mean absolute error
from sklearn.metrics import mean_absolute_error
y_pred = rfc.predict(X_val)
error = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error, MAE: {error: .4f}')

Mean Absolute Error, MAE:  0.1854


In [514]:
# accuracy
accuracy = 100 - 100*error
print(f'Accuracy in %: {accuracy: .2f}')

Accuracy in %:  81.46


### Grid Search with Cross-validation

In [515]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100],
    'max_features': [1, 2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300]
}
# Create a based model
rfc_cv = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rfc_cv, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 472 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done 755 tasks      | elapsed:   42.9s
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [516]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 1,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 300}

In [517]:
# verify the mean absolute error
from sklearn.metrics import mean_absolute_error
y_pred = grid_search.predict(X_val)
error = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error, MAE: {error: .4f}')

Mean Absolute Error, MAE:  0.1854


In [518]:
# accuracy
accuracy = 100 - 100*error
print(f'Accuracy in %: {accuracy: .2f}')

Accuracy in %:  81.46
