# Titanic survival classification problem
- Train a RandForest classifier using the output of the two middle NN layers of a DL model.

## Data processing

In [9]:
import seaborn as sns
import numpy as np
import pandas as pd
import sklearn.model_selection
from keras.models import Model, load_model
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, accuracy_score,recall_score, auc, confusion_matrix
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

In [10]:
# data load
dataset = sns.load_dataset("titanic")
dataframe, test_dataframe = sklearn.model_selection.train_test_split(dataset, train_size=0.7, random_state=1)
dataframe.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
114,0,3,female,17.0,0,0,14.4583,C,Third,woman,False,,Cherbourg,no,True
874,1,2,female,28.0,1,0,24.0,C,Second,woman,False,,Cherbourg,yes,False
76,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
876,0,3,male,20.0,0,0,9.8458,S,Third,man,True,,Southampton,no,True
674,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True


In [11]:
print(dataframe['deck'].dtypes)
dataframe = dataframe.astype({"deck": str})
test_dataframe = test_dataframe.astype({"deck": str})
print(dataframe['deck'].dtypes)

category
object


In [12]:
# data processing
for i in dataframe.index:
    if dataframe['deck'][i] == 'nan':
        dataframe['deck'][i] = 'Z'

for i in test_dataframe.index:
    if test_dataframe['deck'][i] == 'nan':
        test_dataframe['deck'][i] = 'Z'

train_median = dataframe['age'].median()
for i in dataframe.index:
    if dataframe['age'][i] != dataframe['age'][i]:
        dataframe['age'][i] = train_median

train_median = test_dataframe['age'].median()
for i in test_dataframe.index:
    if test_dataframe['age'][i] != test_dataframe['age'][i]:
        test_dataframe['age'][i] = train_median

X = dataframe[['sex', 'pclass', 'age', 'deck']]
y = dataframe[['alive']]
X_ts = test_dataframe[['sex', 'pclass', 'age', 'deck']]
y_ts = test_dataframe[['alive']]
X_ts.head(10)

Unnamed: 0,sex,pclass,age,deck
862,female,1,48.0,D
223,male,3,28.0,Z
84,female,2,17.0,Z
680,female,3,28.0,Z
535,female,2,7.0,Z
623,male,3,21.0,Z
148,male,2,36.5,F
3,female,1,35.0,C
34,male,1,28.0,Z
241,female,3,28.0,Z


In [13]:
# normalization
normalization = [X['age'].mean(), X['age'].std()]
print("Age normalization --> " + str(normalization))

X['age'] = (X['age'] - normalization[0]) / normalization[1]
X_ts['age'] = (X_ts['age'] - normalization[0]) / normalization[1]

X_dum = pd.get_dummies(X)
X_ts_dum = pd.get_dummies(X_ts)

Age normalization --> [29.81099518459069, 13.283963797935623]


In [14]:
# dummification
integer_encoded = label_encoder.fit_transform(y)
y = integer_encoded.reshape(len(integer_encoded), 1)

integer_encoded_ts = label_encoder.fit_transform(y_ts)
y_ts = integer_encoded_ts.reshape(len(integer_encoded_ts), 1)

## Transfer Learning

In [15]:
# load model and split it
loaded_model = load_model("titanic_TransfLearn_model")
input_layer = loaded_model.layers[0]
output_layers = [layer for layer in loaded_model.layers[1:-1]]
models = [Model(inputs=input_layer.output, outputs=output_lay.output) for output_lay in output_layers]
features = [model.predict(X_dum) for model in models]
test_features = [model.predict(X_ts_dum) for model in models]

In [16]:
features_01 = pd.DataFrame(features[0])
features_02 = pd.DataFrame(features[1])
test_features_01 = pd.DataFrame(test_features[0])
test_features_02 = pd.DataFrame(test_features[1])

In [17]:
features_01.head(5)

Unnamed: 0,0,1,2,3,4
0,0.0,0.980654,0.0,0.765917,1.211592
1,0.0,1.262967,0.0,0.308354,0.993355
2,0.0,0.0,0.0,0.492762,2.618959
3,0.0,0.0,0.0,0.604887,2.290854
4,0.0,0.0,0.060178,0.176156,1.988248


In [18]:
rand_forest_01 = RandomForestClassifier(n_estimators=30, min_samples_leaf=3)
rand_forest_01.fit(features_01, y)
rand_forest_02 = RandomForestClassifier(n_estimators=30, min_samples_leaf=3)
rand_forest_02.fit(features_02, y)

RandomForestClassifier(min_samples_leaf=3, n_estimators=30)

In [19]:
print(rand_forest_01.score(test_features_01, y_ts))
print(rand_forest_02.score(test_features_02, y_ts))

0.7649253731343284
0.753731343283582


In [20]:
test_y_pred_01 = rand_forest_01.predict(test_features_01)
test_y_pred_02 = rand_forest_02.predict(test_features_02)

In [21]:
fpr, tpr, _ = roc_curve(y_ts, test_y_pred_01)
print("Max ROC layer2:")
print(auc(fpr, tpr))

predictions = np.where(test_y_pred_01 > .5, 1, 0)
print("\nConfusion matrix layer2:")
print(confusion_matrix(y_true=y_ts, y_pred=predictions))

fpr, tpr, _ = roc_curve(y_ts, test_y_pred_02)
print("---------------------------------------")
print("Max ROC layer2:")
print(auc(fpr, tpr))

predictions = np.where(test_y_pred_02 > .5, 1, 0)
print("\nConfusion matrix layer2:")
print(confusion_matrix(y_true=y_ts, y_pred=predictions))

Max ROC layer2:
0.7412048877522024

Confusion matrix layer2:
[[139  14]
 [ 49  66]]
---------------------------------------
Max ROC layer2:
0.72924126172208

Confusion matrix layer2:
[[138  15]
 [ 51  64]]
