# Machine Learning in Titanic Dataset

### Import libraries and load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
train.shape

(891, 12)

### Details of dataset

In [4]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
sex = []
for s in train["Sex"]:
    if s == "male":
        sex.append(1)
    else:
        sex.append(0)
train["Sex"] = sex
train["Age"] = train["Age"].fillna(0)

In [7]:
relative = np.array(train['SibSp']) + np.array(train["Parch"])
train["Relative"] = relative

In [8]:
Data = train[["PassengerId", "Pclass", "Sex", "Age", "Relative"]].set_index("PassengerId")
Data.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Relative
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,1,22.0,1
2,1,0,38.0,1
3,3,0,26.0,0
4,1,0,35.0,1
5,3,1,35.0,0


In [9]:
survive = train[["PassengerId", "Survived"]].set_index("PassengerId")
survive.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
1,0
2,1
3,1
4,1
5,0


In [10]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 4 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
Relative    891 non-null int64
dtypes: float64(1), int64(3)
memory usage: 34.8 KB


### Splitting Dataset

In [11]:
from sklearn.model_selection import train_test_split

d_train, d_val, s_train, s_val = train_test_split(np.array(Data), np.array(survive).flatten(), test_size=0.2, random_state=5)

### Choosing model

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [13]:
models = []
models.append(('DTC', DecisionTreeClassifier()))
models.append(('GNB', GaussianNB()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('LR', LogisticRegression(solver="liblinear", multi_class="auto")))
models.append(("MPL", MLPClassifier(alpha=1, max_iter=1000)))
models.append(("SGDC", SGDClassifier()))
models.append(('SVC', SVC(gamma="auto")))

In [14]:
scores = []
names = []
for name, model in models:
    fold = KFold(n_splits=5, random_state=7)
    cv_score = cross_val_score(model, d_train, s_train, cv=fold, scoring="accuracy")
    scores.append(cv_score)
    names.append(name)
    mod = "%s: %0.3f (%0.3f)" % (name, cv_score.mean(), cv_score.std())
    print(mod)

DTC: 0.792 (0.030)
GNB: 0.791 (0.046)
KNN: 0.749 (0.026)
LDA: 0.777 (0.052)
LR: 0.786 (0.052)
MPL: 0.806 (0.035)
SGDC: 0.664 (0.053)
SVC: 0.788 (0.030)


### Training Model

In [15]:
model = MLPClassifier(alpha=1, max_iter=100000)#LogisticRegression(solver='liblinear', multi_class="auto", random_state=42)

model.fit(d_train, s_train)

MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=100000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [16]:
import joblib

joblib.dump(model, "Titanic_MPL_model.pkl")

['Titanic_MPL_model.pkl']

In [17]:
cv_score = cross_val_score(model, d_train, s_train, cv=5, scoring="accuracy")
cv_score.mean()

0.808770422465358

In [18]:
model.score(d_train, s_train)

0.8146067415730337

In [19]:
pred = cross_val_predict(model, d_train, s_train, cv=5)
accuracy_score(s_train, pred)

0.8061797752808989

In [20]:
confusion_matrix(s_train, pred)

array([[388,  50],
       [ 88, 186]], dtype=int64)

In [21]:
print(classification_report(s_train, pred))

              precision    recall  f1-score   support

           0       0.82      0.89      0.85       438
           1       0.79      0.68      0.73       274

    accuracy                           0.81       712
   macro avg       0.80      0.78      0.79       712
weighted avg       0.80      0.81      0.80       712



### Tuning model parameters

In [22]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pca = PCA()
pipe = Pipeline(steps=[('logistic', MLPClassifier(max_iter=1000))])#LogisticRegression(multi_class="auto", random_state=100)

param_grid = {
    'logistic__alpha':[1, 10, 100, 0.1, 0.01, 0.001],
    'logistic__tol':[0.1, 0.01, 1, 10],
    #'logistic__intercept_scaling':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    #'logistic__solver':['liblinear', 'lbfgs'],
    'logistic__learning_rate_init':[1, 10, 100, 0.1, 0.01, 0.001],
    'logistic__epsilon':[1e-08, 1e-06, 0.0001, 0.001, 0.1],
    'logistic__power_t':[0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 1.0],
}
GS = GridSearchCV(pipe, param_grid)
GS.fit(d_train, s_train)
print("Best parameter (CV score = %0.3f):" % GS.best_score_)
print(GS.best_params_)



Best parameter (CV score = 0.820):
{'logistic__alpha': 0.01, 'logistic__epsilon': 1e-08, 'logistic__learning_rate_init': 0.01, 'logistic__power_t': 0.5, 'logistic__tol': 0.01}


In [23]:
GS.score(d_train, s_train)

0.8174157303370787

In [None]:
predict = cross_val_predict(GS, d_train, s_train.ravel(), cv=5)
accuracy_score(s_train, predict)



In [None]:
print(classification_report(s_train, predict))

In [None]:
GS.score(d_val, s_val)

In [None]:
cv_score = cross_val_score(GS, d_val, s_val, cv=5, scoring="accuracy")
cv_score.mean()

In [None]:
predict = cross_val_predict(GS, d_val, s_val, cv=5)

accuracy_score(s_val, predict)

In [None]:
print(classification_report(s_val, predict))

### To predict a new data