In [1]:
import pandas as pd
import numpy as np

## Read the CSV

In [2]:
# loading the dataset
df = pd.read_csv("resources/train_transformed_2.csv")
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,Age*Class,Fare_Per_Person,Sex_female,...,Title_Mrs,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,0,3,22,1,0,7.2500,1,66,3.62500,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,38,1,0,71.2833,1,38,35.64165,1,...,1,0,0,1,0,0,0,0,0,0
2,1,3,26,0,0,7.9250,0,78,7.92500,1,...,0,0,0,0,0,0,0,0,0,1
3,1,1,35,1,0,53.1000,1,35,26.55000,1,...,1,0,0,1,0,0,0,0,0,0
4,0,3,35,0,0,8.0500,0,105,8.05000,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0,2,27,0,0,13.0000,0,54,13.00000,0,...,0,0,0,0,0,0,0,0,0,1
885,1,1,19,0,0,30.0000,0,19,30.00000,1,...,0,0,1,0,0,0,0,0,0,0
886,0,3,17,1,2,23.4500,3,51,5.86250,1,...,0,0,0,0,0,0,0,0,0,1
887,1,1,26,0,0,30.0000,0,26,30.00000,0,...,0,0,0,1,0,0,0,0,0,0


In [3]:
# Set features. This will also be used as your x values.
selected_features = df.drop("Survived", axis=1)
feature_names = selected_features.columns

## Create a Train Test Split

In [4]:
# assign X(data) and y(target)
data = selected_features
target = df["Survived"].values.reshape(-1, 1)
print(data.shape, target.shape)

(889, 25) (889, 1)


In [5]:
# split the data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
X_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Family_Size,Age*Class,Fare_Per_Person,Sex_female,Embarked_C,...,Title_Mrs,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
375,3,22,0,0,7.2500,0,66,7.2500,1,0,...,0,0,0,0,0,0,0,0,0,1
457,2,50,0,0,10.5000,0,100,10.5000,1,0,...,0,0,0,0,0,0,0,0,0,1
731,2,36,0,0,0.0000,0,72,0.0000,0,0,...,0,0,0,0,0,0,0,0,0,1
506,1,20,0,0,26.5500,0,20,26.5500,0,0,...,0,0,0,0,0,0,0,0,0,1
828,3,15,1,0,14.4542,1,45,7.2271,1,1,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,3,35,0,0,7.7750,0,105,7.7750,0,0,...,0,0,0,0,0,0,0,0,0,1
270,3,25,0,0,0.0000,0,75,0.0000,0,0,...,0,0,0,0,0,0,0,0,0,1
860,1,48,0,0,25.9292,0,48,25.9292,1,0,...,1,0,0,0,1,0,0,0,0,0
435,3,21,2,2,34.3750,4,63,6.8750,1,0,...,0,0,0,0,0,0,0,0,0,1


## Pre-processing

In [7]:
# Scale the data
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# create a random forest classifier to automatically calculate feature importance
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

  rf = rf.fit(X_train_scaled, y_train)


0.7892376681614349

In [9]:
# sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.13641443595494784, 'Title_Mr'),
 (0.13097232879322585, 'Fare_Per_Person'),
 (0.12326299703612961, 'Age*Class'),
 (0.11082314132353002, 'Fare'),
 (0.11004461303213184, 'Age'),
 (0.10163003910146391, 'Sex_female'),
 (0.048400686671050894, 'Pclass'),
 (0.04327249212066818, 'Family_Size'),
 (0.03385496264254996, 'Title_Mrs'),
 (0.029600653946345794, 'Title_Miss'),
 (0.029588362196938726, 'SibSp'),
 (0.019115826902763047, 'Deck_U'),
 (0.016424252084756046, 'Parch'),
 (0.014424520326170438, 'Embarked_S'),
 (0.01209850073436342, 'Embarked_C'),
 (0.008339249388576577, 'Deck_E'),
 (0.006423273836886031, 'Deck_D'),
 (0.0063508553600376575, 'Title_Master'),
 (0.005593352319741864, 'Embarked_Q'),
 (0.005091562057849453, 'Deck_B'),
 (0.0044219685090037845, 'Deck_C'),
 (0.0026561029926275093, 'Deck_A'),
 (0.000808381913527879, 'Deck_F'),
 (0.00020141994938714558, 'Deck_T'),
 (0.00018602080532640475, 'Deck_G')]

In [10]:
print(X_train_scaled.shape, X_test_scaled.shape)

(666, 25) (223, 25)


In [11]:
# use tree-based estimator coupled with SelectFromModel meta-transformer...
# ...to compute impurity-based feature importances and discard irrelevant features 
from sklearn.feature_selection import SelectFromModel
sfmodel = SelectFromModel(rf, prefit=True)
X_train_scaled = sfmodel.transform(X_train_scaled)
X_test_scaled = sfmodel.transform(X_test_scaled)
print(X_train_scaled.shape, X_test_scaled.shape)

(666, 8) (223, 8)


In [12]:
sfmodel.get_support()

array([ True,  True, False, False,  True,  True,  True,  True,  True,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False])

In [13]:
# print the names of the most important features
for feature_list_index in sfmodel.get_support(indices=True):
    print(feature_names[feature_list_index])

Pclass
Age
Fare
Family_Size
Age*Class
Fare_Per_Person
Sex_female
Title_Mr


In [14]:
# one-hot encode the target labels, setting each row's class vlaue to an array
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_train_categorical

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]], dtype=float32)

## Train the model

In [15]:
# create a sequential model
from tensorflow.keras.models import Sequential
model = Sequential()

In [16]:
# add first hidden layer, number of inputs and number of nodes specified
from tensorflow.keras.layers import Dense
number_inputs = 8
number_hidden_nodes = 80
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))

In [17]:
# add last hidden layer, specify activation function for classification and number of classes(labels) to predict
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [18]:
# model summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 80)                720       
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 162       
Total params: 882
Trainable params: 882
Non-trainable params: 0
_________________________________________________________________


In [19]:
# compile model using a loss function (categorical crossentroy for categorical data), an optimizer, and additional metric
# output layer in this instance is using software for logistic regression (i.e. categorical)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
             metrics=['accuracy'])

In [20]:
# fit(train) the model using the training data
# in this instance, we chose 1000 iterations/epochs, shuffle data, and increase detail printed each training cycle
model.fit(X_train_scaled, y_train_categorical,
         epochs=1000, shuffle=True, verbose=2)

Epoch 1/1000
21/21 - 0s - loss: 0.6336 - accuracy: 0.6471
Epoch 2/1000
21/21 - 0s - loss: 0.5008 - accuracy: 0.7688
Epoch 3/1000
21/21 - 0s - loss: 0.4487 - accuracy: 0.8123
Epoch 4/1000
21/21 - 0s - loss: 0.4293 - accuracy: 0.8198
Epoch 5/1000
21/21 - 0s - loss: 0.4187 - accuracy: 0.8213
Epoch 6/1000
21/21 - 0s - loss: 0.4120 - accuracy: 0.8228
Epoch 7/1000
21/21 - 0s - loss: 0.4052 - accuracy: 0.8228
Epoch 8/1000
21/21 - 0s - loss: 0.4013 - accuracy: 0.8273
Epoch 9/1000
21/21 - 0s - loss: 0.3978 - accuracy: 0.8213
Epoch 10/1000
21/21 - 0s - loss: 0.3943 - accuracy: 0.8273
Epoch 11/1000
21/21 - 0s - loss: 0.3916 - accuracy: 0.8288
Epoch 12/1000
21/21 - 0s - loss: 0.3889 - accuracy: 0.8273
Epoch 13/1000
21/21 - 0s - loss: 0.3867 - accuracy: 0.8288
Epoch 14/1000
21/21 - 0s - loss: 0.3856 - accuracy: 0.8333
Epoch 15/1000
21/21 - 0s - loss: 0.3839 - accuracy: 0.8333
Epoch 16/1000
21/21 - 0s - loss: 0.3813 - accuracy: 0.8378
Epoch 17/1000
21/21 - 0s - loss: 0.3803 - accuracy: 0.8408
Epoch 

Epoch 140/1000
21/21 - 0s - loss: 0.3396 - accuracy: 0.8604
Epoch 141/1000
21/21 - 0s - loss: 0.3399 - accuracy: 0.8559
Epoch 142/1000
21/21 - 0s - loss: 0.3404 - accuracy: 0.8544
Epoch 143/1000
21/21 - 0s - loss: 0.3392 - accuracy: 0.8589
Epoch 144/1000
21/21 - 0s - loss: 0.3386 - accuracy: 0.8604
Epoch 145/1000
21/21 - 0s - loss: 0.3398 - accuracy: 0.8559
Epoch 146/1000
21/21 - 0s - loss: 0.3381 - accuracy: 0.8559
Epoch 147/1000
21/21 - 0s - loss: 0.3393 - accuracy: 0.8589
Epoch 148/1000
21/21 - 0s - loss: 0.3380 - accuracy: 0.8634
Epoch 149/1000
21/21 - 0s - loss: 0.3407 - accuracy: 0.8559
Epoch 150/1000
21/21 - 0s - loss: 0.3380 - accuracy: 0.8574
Epoch 151/1000
21/21 - 0s - loss: 0.3384 - accuracy: 0.8544
Epoch 152/1000
21/21 - 0s - loss: 0.3384 - accuracy: 0.8529
Epoch 153/1000
21/21 - 0s - loss: 0.3381 - accuracy: 0.8574
Epoch 154/1000
21/21 - 0s - loss: 0.3382 - accuracy: 0.8544
Epoch 155/1000
21/21 - 0s - loss: 0.3378 - accuracy: 0.8574
Epoch 156/1000
21/21 - 0s - loss: 0.3377

Epoch 277/1000
21/21 - 0s - loss: 0.3269 - accuracy: 0.8589
Epoch 278/1000
21/21 - 0s - loss: 0.3281 - accuracy: 0.8559
Epoch 279/1000
21/21 - 0s - loss: 0.3267 - accuracy: 0.8649
Epoch 280/1000
21/21 - 0s - loss: 0.3265 - accuracy: 0.8589
Epoch 281/1000
21/21 - 0s - loss: 0.3262 - accuracy: 0.8619
Epoch 282/1000
21/21 - 0s - loss: 0.3254 - accuracy: 0.8604
Epoch 283/1000
21/21 - 0s - loss: 0.3277 - accuracy: 0.8664
Epoch 284/1000
21/21 - 0s - loss: 0.3266 - accuracy: 0.8589
Epoch 285/1000
21/21 - 0s - loss: 0.3253 - accuracy: 0.8694
Epoch 286/1000
21/21 - 0s - loss: 0.3262 - accuracy: 0.8604
Epoch 287/1000
21/21 - 0s - loss: 0.3273 - accuracy: 0.8574
Epoch 288/1000
21/21 - 0s - loss: 0.3282 - accuracy: 0.8589
Epoch 289/1000
21/21 - 0s - loss: 0.3240 - accuracy: 0.8589
Epoch 290/1000
21/21 - 0s - loss: 0.3275 - accuracy: 0.8544
Epoch 291/1000
21/21 - 0s - loss: 0.3248 - accuracy: 0.8649
Epoch 292/1000
21/21 - 0s - loss: 0.3253 - accuracy: 0.8649
Epoch 293/1000
21/21 - 0s - loss: 0.3262

Epoch 414/1000
21/21 - 0s - loss: 0.3160 - accuracy: 0.8649
Epoch 415/1000
21/21 - 0s - loss: 0.3170 - accuracy: 0.8664
Epoch 416/1000
21/21 - 0s - loss: 0.3151 - accuracy: 0.8724
Epoch 417/1000
21/21 - 0s - loss: 0.3161 - accuracy: 0.8649
Epoch 418/1000
21/21 - 0s - loss: 0.3157 - accuracy: 0.8679
Epoch 419/1000
21/21 - 0s - loss: 0.3155 - accuracy: 0.8649
Epoch 420/1000
21/21 - 0s - loss: 0.3161 - accuracy: 0.8709
Epoch 421/1000
21/21 - 0s - loss: 0.3172 - accuracy: 0.8604
Epoch 422/1000
21/21 - 0s - loss: 0.3158 - accuracy: 0.8664
Epoch 423/1000
21/21 - 0s - loss: 0.3182 - accuracy: 0.8619
Epoch 424/1000
21/21 - 0s - loss: 0.3164 - accuracy: 0.8634
Epoch 425/1000
21/21 - 0s - loss: 0.3145 - accuracy: 0.8679
Epoch 426/1000
21/21 - 0s - loss: 0.3152 - accuracy: 0.8679
Epoch 427/1000
21/21 - 0s - loss: 0.3159 - accuracy: 0.8694
Epoch 428/1000
21/21 - 0s - loss: 0.3157 - accuracy: 0.8679
Epoch 429/1000
21/21 - 0s - loss: 0.3178 - accuracy: 0.8634
Epoch 430/1000
21/21 - 0s - loss: 0.3154

Epoch 551/1000
21/21 - 0s - loss: 0.3092 - accuracy: 0.8694
Epoch 552/1000
21/21 - 0s - loss: 0.3089 - accuracy: 0.8664
Epoch 553/1000
21/21 - 0s - loss: 0.3089 - accuracy: 0.8694
Epoch 554/1000
21/21 - 0s - loss: 0.3080 - accuracy: 0.8709
Epoch 555/1000
21/21 - 0s - loss: 0.3075 - accuracy: 0.8694
Epoch 556/1000
21/21 - 0s - loss: 0.3101 - accuracy: 0.8679
Epoch 557/1000
21/21 - 0s - loss: 0.3096 - accuracy: 0.8694
Epoch 558/1000
21/21 - 0s - loss: 0.3067 - accuracy: 0.8709
Epoch 559/1000
21/21 - 0s - loss: 0.3089 - accuracy: 0.8694
Epoch 560/1000
21/21 - 0s - loss: 0.3085 - accuracy: 0.8664
Epoch 561/1000
21/21 - 0s - loss: 0.3085 - accuracy: 0.8679
Epoch 562/1000
21/21 - 0s - loss: 0.3099 - accuracy: 0.8664
Epoch 563/1000
21/21 - 0s - loss: 0.3084 - accuracy: 0.8694
Epoch 564/1000
21/21 - 0s - loss: 0.3085 - accuracy: 0.8664
Epoch 565/1000
21/21 - 0s - loss: 0.3073 - accuracy: 0.8679
Epoch 566/1000
21/21 - 0s - loss: 0.3070 - accuracy: 0.8649
Epoch 567/1000
21/21 - 0s - loss: 0.3083

Epoch 688/1000
21/21 - 0s - loss: 0.3021 - accuracy: 0.8649
Epoch 689/1000
21/21 - 0s - loss: 0.3065 - accuracy: 0.8709
Epoch 690/1000
21/21 - 0s - loss: 0.3092 - accuracy: 0.8664
Epoch 691/1000
21/21 - 0s - loss: 0.3034 - accuracy: 0.8694
Epoch 692/1000
21/21 - 0s - loss: 0.3053 - accuracy: 0.8649
Epoch 693/1000
21/21 - 0s - loss: 0.3025 - accuracy: 0.8709
Epoch 694/1000
21/21 - 0s - loss: 0.3034 - accuracy: 0.8679
Epoch 695/1000
21/21 - 0s - loss: 0.3023 - accuracy: 0.8724
Epoch 696/1000
21/21 - 0s - loss: 0.3015 - accuracy: 0.8709
Epoch 697/1000
21/21 - 0s - loss: 0.3032 - accuracy: 0.8694
Epoch 698/1000
21/21 - 0s - loss: 0.3020 - accuracy: 0.8679
Epoch 699/1000
21/21 - 0s - loss: 0.3036 - accuracy: 0.8694
Epoch 700/1000
21/21 - 0s - loss: 0.3028 - accuracy: 0.8679
Epoch 701/1000
21/21 - 0s - loss: 0.3046 - accuracy: 0.8634
Epoch 702/1000
21/21 - 0s - loss: 0.3032 - accuracy: 0.8709
Epoch 703/1000
21/21 - 0s - loss: 0.3036 - accuracy: 0.8694
Epoch 704/1000
21/21 - 0s - loss: 0.3034

Epoch 825/1000
21/21 - 0s - loss: 0.3003 - accuracy: 0.8604
Epoch 826/1000
21/21 - 0s - loss: 0.3001 - accuracy: 0.8724
Epoch 827/1000
21/21 - 0s - loss: 0.2980 - accuracy: 0.8679
Epoch 828/1000
21/21 - 0s - loss: 0.2980 - accuracy: 0.8694
Epoch 829/1000
21/21 - 0s - loss: 0.3010 - accuracy: 0.8679
Epoch 830/1000
21/21 - 0s - loss: 0.2967 - accuracy: 0.8694
Epoch 831/1000
21/21 - 0s - loss: 0.3002 - accuracy: 0.8709
Epoch 832/1000
21/21 - 0s - loss: 0.2994 - accuracy: 0.8709
Epoch 833/1000
21/21 - 0s - loss: 0.2987 - accuracy: 0.8709
Epoch 834/1000
21/21 - 0s - loss: 0.2971 - accuracy: 0.8694
Epoch 835/1000
21/21 - 0s - loss: 0.3003 - accuracy: 0.8664
Epoch 836/1000
21/21 - 0s - loss: 0.3021 - accuracy: 0.8724
Epoch 837/1000
21/21 - 0s - loss: 0.2967 - accuracy: 0.8739
Epoch 838/1000
21/21 - 0s - loss: 0.2968 - accuracy: 0.8754
Epoch 839/1000
21/21 - 0s - loss: 0.2969 - accuracy: 0.8724
Epoch 840/1000
21/21 - 0s - loss: 0.2970 - accuracy: 0.8694
Epoch 841/1000
21/21 - 0s - loss: 0.2989

Epoch 962/1000
21/21 - 0s - loss: 0.2921 - accuracy: 0.8724
Epoch 963/1000
21/21 - 0s - loss: 0.2929 - accuracy: 0.8754
Epoch 964/1000
21/21 - 0s - loss: 0.2953 - accuracy: 0.8694
Epoch 965/1000
21/21 - 0s - loss: 0.2950 - accuracy: 0.8679
Epoch 966/1000
21/21 - 0s - loss: 0.2952 - accuracy: 0.8709
Epoch 967/1000
21/21 - 0s - loss: 0.2927 - accuracy: 0.8754
Epoch 968/1000
21/21 - 0s - loss: 0.2938 - accuracy: 0.8709
Epoch 969/1000
21/21 - 0s - loss: 0.2952 - accuracy: 0.8649
Epoch 970/1000
21/21 - 0s - loss: 0.2937 - accuracy: 0.8754
Epoch 971/1000
21/21 - 0s - loss: 0.2934 - accuracy: 0.8784
Epoch 972/1000
21/21 - 0s - loss: 0.2931 - accuracy: 0.8754
Epoch 973/1000
21/21 - 0s - loss: 0.2933 - accuracy: 0.8724
Epoch 974/1000
21/21 - 0s - loss: 0.2939 - accuracy: 0.8709
Epoch 975/1000
21/21 - 0s - loss: 0.2964 - accuracy: 0.8694
Epoch 976/1000
21/21 - 0s - loss: 0.2952 - accuracy: 0.8649
Epoch 977/1000
21/21 - 0s - loss: 0.2925 - accuracy: 0.8739
Epoch 978/1000
21/21 - 0s - loss: 0.2949

<tensorflow.python.keras.callbacks.History at 0x21956184580>

In [21]:
# test ability to predict new data points (validity) by using the testing data to evaluate the model
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

7/7 - 0s - loss: 0.6364 - accuracy: 0.8027
Loss: 0.6364274024963379, Accuracy: 0.8026905655860901


In [22]:
predictions = list(np.argmax(model.predict(X_test_scaled), axis=-1))

In [23]:
y_test_list = list(y_test)
pred_df = pd.DataFrame({"Predictions": predictions, "Actual": y_test_list})
pred_df["Actual"] = pred_df["Actual"].str[0]
pred_df

Unnamed: 0,Predictions,Actual
0,0,0
1,1,1
2,1,1
3,0,0
4,1,1
...,...,...
218,0,0
219,0,0
220,0,0
221,1,1


## Test the model

In [None]:
# loading the dataset
test_data = pd.read_csv("resources/test_transformed_2.csv")
test_data

In [None]:
# create new variables for testing data
X_test = test_data[["Pclass", "Age", "Fare", "Family_Size", "Age*Class", "Fare_Per_Person", "Sex_female", "Title_Mr"]]

In [None]:
# Scale the data
X_test_scaled = X_scaler.transform(X_test)

In [None]:
predictions = list(np.argmax(model.predict(X_test_scaled), axis=-1))

In [None]:
output = pd.DataFrame({'PassengerID': test_data.PassengerId, 'Survived': predictions})

In [None]:
# export submission
output.to_csv('submissions/Hunter_submission18.csv', index=False)

In [None]:
# save the model using the HDF5 binary format with the extension .h5
#model.save("models/titanic_DeepLearning.h5")