# 'Titanic - Machine Learning From Disaster' Kaggle Competition

This submission notebook achieved a final accuracy of 0.79665, placing in the top 5% of the leaderboard.

In [1]:
#import relevant libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import statistics
import matplotlib

### 1). Data cleaning and exploration

In [2]:
# Load the Kaggle train/test CSVs and index both frames by PassengerId.
data_train = pd.read_csv("./Data/train.csv").set_index('PassengerId')
data_test = pd.read_csv("./Data/test.csv").set_index('PassengerId')

In [3]:
# Tag every row with its origin (1 = train, 0 = test) so the two sets can be
# separated again after the shared feature engineering.
data_train['TrainTest'] = 1
data_test['TrainTest'] = 0
# The test set has no labels. Use np.nan: the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
data_test['Survived'] = np.nan
# Stack train on top of test; sort=True sorts the column axis.
data_all = pd.concat([data_train, data_test], sort=True)

In [4]:
# Count the missing values in every column of the combined frame.
missing_per_column = data_all.isna().sum()
missing_per_column

Age           263
Cabin        1014
Embarked        2
Fare            1
Name            0
Parch           0
Pclass          0
Sex             0
SibSp           0
Survived      418
Ticket          0
TrainTest       0
dtype: int64

### 2). Feature engineering

In [5]:
def _title_from_name(name):
    """Return the text between the first comma and the following period of a name."""
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()


def _cabin_count(cabin):
    """Return how many space-separated cabins are listed (0 when the value is missing)."""
    if pd.isna(cabin):
        return 0
    return len(cabin.split(' '))


def _cabin_deck(cabin):
    """Return the first character of the cabin string, or 'None' when it is missing."""
    if pd.isna(cabin):
        return 'None'
    return str(cabin)[0]


def _ticket_letters(ticket):
    """Return the normalised prefix of a ticket (everything before the last token), or 'None'."""
    prefix_tokens = ticket.split(' ')[:-1]
    if len(prefix_tokens) > 0:
        return ''.join(prefix_tokens).replace('.', '').replace('/', '').lower()
    return 'None'


def _is_ticket_number(ticket):
    """Return 1 when the whole ticket string is numeric, otherwise 0."""
    return int(ticket.isnumeric())


# Derive the engineered columns from the raw text fields.
data_all['Title'] = data_all['Name'].apply(_title_from_name)
data_all['CabinCount'] = data_all['Cabin'].apply(_cabin_count)
data_all['CabinDeck'] = data_all['Cabin'].apply(_cabin_deck)
data_all['TicketLetters'] = data_all['Ticket'].apply(_ticket_letters)
data_all['IsTicketNumber'] = data_all['Ticket'].apply(_is_ticket_number)

# The raw text columns are no longer needed once the features above exist.
raw_text_columns = ['Ticket', 'Name', 'Cabin']
data_all.drop(columns=raw_text_columns, inplace=True)

# Rows without an embarkation port are removed rather than imputed.
data_all.dropna(subset=['Embarked'], inplace=True)

In [6]:
# Fill missing Age and Fare with the median computed on the training rows only.
train_rows = data_all['TrainTest'] == 1
for column in ('Age', 'Fare'):
    train_median = data_all[train_rows][column].median()
    data_all[column] = data_all[column].fillna(train_median)

# Replace Fare with log(Fare + 1); pop() removes the raw column as it is read.
data_all['FareNorm'] = np.log(data_all.pop('Fare') + 1)

In [7]:
# Re-count missing values after imputation to confirm the fill worked.
remaining_missing = data_all.isna().sum()
remaining_missing

Age                 0
Embarked            0
Parch               0
Pclass              0
Sex                 0
SibSp               0
Survived          418
TrainTest           0
Title               0
CabinCount          0
CabinDeck           0
TicketLetters       0
IsTicketNumber      0
FareNorm            0
dtype: int64

In [8]:
# List each column's dtype to see which features are still stored as object
# and therefore need encoding before modelling.
data_all.dtypes

Age               float64
Embarked           object
Parch               int64
Pclass              int64
Sex                object
SibSp               int64
Survived          float64
TrainTest           int64
Title              object
CabinCount          int64
CabinDeck          object
TicketLetters      object
IsTicketNumber      int64
FareNorm          float64
dtype: object

In [9]:
# Categorical columns are one-hot encoded with pd.get_dummies.
# Pclass is cast to str first so that it is treated as a category and not as a number.
data_all['Pclass'] = data_all['Pclass'].astype(str)

model_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'FareNorm', 'Embarked',
    'CabinCount', 'CabinDeck', 'IsTicketNumber', 'Title', 'TicketLetters',
    'TrainTest',
]
all_dummies = pd.get_dummies(data_all[model_columns])

# Split the encoded frame back into train and test using the TrainTest marker.
encoded_train_rows = all_dummies['TrainTest'] == 1
encoded_test_rows = all_dummies['TrainTest'] == 0
X_train = all_dummies[encoded_train_rows].drop(columns=['TrainTest'])
X_test = all_dummies[encoded_test_rows].drop(columns=['TrainTest'])

# Labels exist only for the training rows.
y_train = data_all.loc[data_all['TrainTest'] == 1, 'Survived']

In [10]:
# Check that encoding has been successful: after pd.get_dummies no column
# should still be reported as object.
all_dummies.dtypes

Age                     float64
SibSp                     int64
Parch                     int64
FareNorm                float64
CabinCount                int64
                         ...   
TicketLetters_stono2      uint8
TicketLetters_stonoq      uint8
TicketLetters_swpp        uint8
TicketLetters_wc          uint8
TicketLetters_wep         uint8
Length: 77, dtype: object

### 3). Data preprocessing for modelling

In [11]:
# Standardise the continuous / count features.
# The scaler is fitted on the training rows only, so that test-set statistics
# do not leak into preprocessing (consistent with the median imputation, which
# also uses training rows only). The fitted scaler is then applied to all rows.
numeric_features = ['Age', 'SibSp', 'Parch', 'FareNorm']
is_train_row = all_dummies['TrainTest'] == 1

scale = StandardScaler()
scale.fit(all_dummies.loc[is_train_row, numeric_features])

all_dummies_scaled = all_dummies.copy()
all_dummies_scaled[numeric_features] = scale.transform(all_dummies_scaled[numeric_features])

# Split back into train and test using the TrainTest marker, then drop the marker.
X_train_scaled = all_dummies_scaled[all_dummies_scaled['TrainTest'] == 1].drop(['TrainTest'], axis=1)
X_test_scaled = all_dummies_scaled[all_dummies_scaled['TrainTest'] == 0].drop(['TrainTest'], axis=1)

# Labels exist only for the training rows.
y_train = data_all[data_all['TrainTest'] == 1]['Survived']

### 4). Build basic model

In [12]:
# Baseline: a random forest with 100 trees and otherwise default settings,
# scored with 5-fold cross-validation on the training rows.
# random_state is fixed so that re-running the notebook reproduces the score.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

n_scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)

#report performance as mean (sample standard deviation) over the folds
print('Accuracy: %.3f (%.3f)' % (statistics.mean(n_scores), statistics.stdev(n_scores)))

Accuracy: 0.803 (0.037)


### 5). Model tuning

In [13]:
# Coarse hyperparameter search: sample 100 candidates from a wide grid and
# score each with 5-fold cross-validation.
# random_state is fixed on both the forest and the search so the sampled
# candidates and their scores are reproducible.
clf = RandomForestClassifier(random_state=42)
param_grid_random = {
    'n_estimators': [100, 500, 1000],
    'bootstrap': [True, False],
    'max_depth': [3, 5, 10, 20, 50, 75, 100, None],
    # 'auto' is no longer accepted by RandomForestClassifier (removed in
    # scikit-learn 1.3); for classifiers it was an alias of 'sqrt', so only
    # 'sqrt' is kept.
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 4, 10],
    'min_samples_split': [2, 5, 10],
}

clf_randomsearch = RandomizedSearchCV(
    clf,
    param_distributions=param_grid_random,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
best_clf_randomsearch = clf_randomsearch.fit(X_train_scaled, y_train)

In [14]:
# Show the best cross-validated score of the randomized search, then its parameters.
print(
    best_clf_randomsearch.best_score_,
    best_clf_randomsearch.best_params_,
    sep='\n',
)

0.8346456692913385
{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}


In [15]:
# Exhaustive grid search over a narrower set of values, scored with 5-fold
# cross-validation. Reuses the `clf` estimator defined for the randomized search.
param_grid = {
    'n_estimators': [80, 90, 100, 110, 120],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True],
    'max_depth': [90, 100, 110],
    # 'auto' is no longer accepted by RandomForestClassifier (removed in
    # scikit-learn 1.3). For classifiers it was an alias of 'sqrt', so listing
    # both only duplicated every 'sqrt' candidate.
    'max_features': ['sqrt', 10],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [8, 10, 12],
}

clf_gridsearch = GridSearchCV(clf, param_grid=param_grid, cv=5, n_jobs=-1)
best_clf_gridsearch = clf_gridsearch.fit(X_train_scaled, y_train)

In [16]:
# Show the best cross-validated score of the grid search, then its parameters.
print(
    best_clf_gridsearch.best_score_,
    best_clf_gridsearch.best_params_,
    sep='\n',
)

0.8402699662542182
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 90, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 80}


In [17]:
##identify most important features
#best_clf = best_clf_gridsearch.best_estimator_.fit(X_train_scaled, y_train)
#feat_importances = pd.Series(best_clf.feature_importances_, index=X_train_scaled.columns)
#feat_importances.nlargest(20).plot(kind='barh');

In [18]:
##select the top 18 features by importance
#top_features = list(feat_importances.sort_values(ascending=False)[:18].index)

In [19]:
# Re-score the estimator chosen by the grid search with 5-fold cross-validation.
clf_optimised = best_clf_gridsearch.best_estimator_

n_scores_optimised = cross_val_score(
    estimator=clf_optimised,
    X=X_train_scaled,
    y=y_train,
    cv=5,
)

# Mean accuracy over the folds, with the sample standard deviation in brackets.
mean_accuracy = statistics.mean(n_scores_optimised)
accuracy_stdev = statistics.stdev(n_scores_optimised)
print('Accuracy: %.3f (%.3f)' % (mean_accuracy, accuracy_stdev))

Accuracy: 0.828 (0.026)


### 6). Results

In [20]:
# Take the estimator selected by the grid search and fit it on all training rows.
tuned_forest = best_clf_gridsearch.best_estimator_
final_model = tuned_forest.fit(X_train_scaled, y_train)

In [21]:
# Predict a Survived label for every row of the scaled test features.
y_pred = final_model.predict(X_test_scaled)

In [22]:
# Assemble the submission: one row per test passenger, with both columns cast
# to int, written without the DataFrame index.
submission_columns = {'PassengerId': X_test_scaled.index, 'Survived': list(y_pred)}
to_upload = pd.DataFrame(submission_columns).astype({"PassengerId": int, "Survived": int})
to_upload.to_csv('RB_submission_1.csv', index=False)