In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score

In [3]:
from time import time

## Load dataset

In [4]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Downton, Mr. William James",male,,,,1601.0,,C23 C25 C27,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


## Data preparation

In [8]:
df['Age'].fillna(df['Age'].mean(), inplace=True)

df['Family_cnt'] = df['SibSp'] + df['Parch']

df['Cabin_ind'] = np.where(df['Cabin'].isnull(), 0, 1)

gender_num = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(gender_num)

df.drop(['SibSp', 'Parch', 'Cabin', 'Embarked', 'PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Family_cnt,Cabin_ind
0,0,3,0,22.0,7.25,1,0
1,1,1,1,38.0,71.2833,1,1
2,1,3,1,26.0,7.925,0,0
3,1,1,1,35.0,53.1,1,1
4,0,3,0,35.0,8.05,0,0


## Train / Validation / Test data split

In [9]:
features = df.drop('Survived', axis=1)
labels = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

for dataset in [y_train, y_val, y_test]:
    print(round(len(dataset) / len(labels), 2))

0.6
0.2
0.2


## Model training and evaluation using GridSearchCV

In [10]:
def print_best_score(hp_optimizer):
    print('BEST SCORE: {} - PARAMS: {}\n'.format(round(hp_optimizer.best_score_, 3), hp_optimizer.best_params_))

    means = hp_optimizer.cv_results_['mean_test_score']
    stds = hp_optimizer.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, hp_optimizer.cv_results_['params']):
        print('Score: {} (+/-{}) for Params: {}'.format(round(mean, 3), round(std * 2, 3), params))

### Model 1: Logistic Regression

In [11]:
lr = LogisticRegression(solver='liblinear')
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

CV_1 = GridSearchCV(lr, parameters, cv=5, iid=False)
CV_1.fit(X_train, y_train.values.ravel())

print_best_score(CV_1)

BEST SCORE: 0.798 - PARAMS: {'C': 1}

Score: 0.678 (+/-0.092) for Params: {'C': 0.001}
Score: 0.704 (+/-0.099) for Params: {'C': 0.01}
Score: 0.796 (+/-0.13) for Params: {'C': 0.1}
Score: 0.798 (+/-0.123) for Params: {'C': 1}
Score: 0.794 (+/-0.118) for Params: {'C': 10}
Score: 0.794 (+/-0.118) for Params: {'C': 100}
Score: 0.794 (+/-0.118) for Params: {'C': 1000}


In [12]:
LR = CV_1.best_estimator_
LR

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

### Model 2: Support Vector Machine

In [13]:
svc = SVC(gamma='scale')
parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 100]
}

CV_2 = GridSearchCV(svc, parameters, cv=5, iid=False)
CV_2.fit(X_train, y_train.values.ravel())

print_best_score(CV_2)

BEST SCORE: 0.796 - PARAMS: {'C': 0.1, 'kernel': 'linear'}

Score: 0.796 (+/-0.116) for Params: {'C': 0.1, 'kernel': 'linear'}
Score: 0.654 (+/-0.062) for Params: {'C': 0.1, 'kernel': 'rbf'}
Score: 0.796 (+/-0.116) for Params: {'C': 1, 'kernel': 'linear'}
Score: 0.661 (+/-0.05) for Params: {'C': 1, 'kernel': 'rbf'}
Score: 0.796 (+/-0.116) for Params: {'C': 100, 'kernel': 'linear'}
Score: 0.788 (+/-0.113) for Params: {'C': 100, 'kernel': 'rbf'}


In [14]:
SVM = CV_2.best_estimator_
SVM

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### Model 3: Multilayer Perceptron

In [15]:
mlp = MLPClassifier(solver='lbfgs')
parameters = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

CV_3 = GridSearchCV(mlp, parameters, cv=5, iid=False)
CV_3.fit(X_train, y_train.values.ravel())

print_best_score(CV_3)

BEST SCORE: 0.811 - PARAMS: {'activation': 'logistic', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}

Score: 0.717 (+/-0.226) for Params: {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}
Score: 0.64 (+/-0.053) for Params: {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}
Score: 0.581 (+/-0.327) for Params: {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive'}
Score: 0.557 (+/-0.25) for Params: {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
Score: 0.618 (+/-0.024) for Params: {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}
Score: 0.689 (+/-0.174) for Params: {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
Score: 0.575 (+/-0.201) for Params: {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
Score: 0.628 (+/-0.274) for Params: {'activation': 'relu', 'hi

In [16]:
MLP = CV_3.best_estimator_
MLP

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50,), learning_rate='invscaling',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

### Model 4: Random Forest

In [17]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

CV_4 = GridSearchCV(rf, parameters, cv=5, iid=False)
CV_4.fit(X_train, y_train.values.ravel())

print_best_score(CV_4)

BEST SCORE: 0.831 - PARAMS: {'max_depth': 4, 'n_estimators': 50}

Score: 0.749 (+/-0.121) for Params: {'max_depth': 2, 'n_estimators': 5}
Score: 0.79 (+/-0.133) for Params: {'max_depth': 2, 'n_estimators': 50}
Score: 0.799 (+/-0.111) for Params: {'max_depth': 2, 'n_estimators': 250}
Score: 0.809 (+/-0.125) for Params: {'max_depth': 4, 'n_estimators': 5}
Score: 0.831 (+/-0.113) for Params: {'max_depth': 4, 'n_estimators': 50}
Score: 0.822 (+/-0.109) for Params: {'max_depth': 4, 'n_estimators': 250}
Score: 0.815 (+/-0.074) for Params: {'max_depth': 8, 'n_estimators': 5}
Score: 0.824 (+/-0.079) for Params: {'max_depth': 8, 'n_estimators': 50}
Score: 0.813 (+/-0.076) for Params: {'max_depth': 8, 'n_estimators': 250}
Score: 0.815 (+/-0.039) for Params: {'max_depth': 16, 'n_estimators': 5}
Score: 0.82 (+/-0.046) for Params: {'max_depth': 16, 'n_estimators': 50}
Score: 0.815 (+/-0.032) for Params: {'max_depth': 16, 'n_estimators': 250}
Score: 0.794 (+/-0.067) for Params: {'max_depth': 32, 'n_

In [18]:
RF = CV_4.best_estimator_
RF

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Model 5: Gradient Boosting

In [19]:
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}

CV_5 = GridSearchCV(gb, parameters, cv=5, iid=False)
CV_5.fit(X_train, y_train.values.ravel())

print_best_score(CV_5)

BEST SCORE: 0.841 - PARAMS: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}

Score: 0.624 (+/-0.005) for Params: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 5}
Score: 0.796 (+/-0.116) for Params: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
Score: 0.796 (+/-0.116) for Params: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 250}
Score: 0.811 (+/-0.118) for Params: {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 500}
Score: 0.624 (+/-0.005) for Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5}
Score: 0.811 (+/-0.071) for Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Score: 0.829 (+/-0.076) for Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}
Score: 0.841 (+/-0.079) for Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
Score: 0.624 (+/-0.005) for Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 5}
Score: 0.82 (+/-0.052) for Params: {'learning

In [20]:
GB = CV_5.best_estimator_
GB

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

## Evaluation of best models on validation dataset

In [21]:
mdl_names = ['LR', 'SVM', 'MLP', 'RF', 'GB']
mdl_list = [LR, SVM, MLP, RF, GB]
models = dict(zip(mdl_names, mdl_list))

In [22]:
def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred), 3)
    recall = round(recall_score(labels, pred), 3)
    print('{} -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(name,
                                                                                   accuracy,
                                                                                   precision,
                                                                                   recall,
                                                                                   round((end - start)*1000, 1)))

In [23]:
for name, mdl in models.items():
    evaluate_model(name, mdl, X_val, y_val)

LR -- Accuracy: 0.77 / Precision: 0.707 / Recall: 0.631 / Latency: 296.7ms
SVM -- Accuracy: 0.747 / Precision: 0.672 / Recall: 0.6 / Latency: 15.6ms
MLP -- Accuracy: 0.775 / Precision: 0.736 / Recall: 0.6 / Latency: 15.4ms
RF -- Accuracy: 0.815 / Precision: 0.833 / Recall: 0.615 / Latency: 31.3ms
GB -- Accuracy: 0.815 / Precision: 0.808 / Recall: 0.646 / Latency: 15.6ms


## Final model evaluation on test dataset

In [24]:
evaluate_model('Gradient Boosting', models['GB'], X_test, y_test)

Gradient Boosting -- Accuracy: 0.816 / Precision: 0.852 / Recall: 0.684 / Latency: 31.3ms
