# Kaggle Titanic

### Data preparation

Column | Missing Data | Feature Scaling | Comment
--- |:---:|:---:|:---
PassengerId|||Needed for submission
Survived|||Training label
Pclass||Rescaling (0-1)|
Name|||Ignored
Sex||Encoded as 0 (F) or 1 (M)|
Age|Impute with mean|Rescaling (0-1)|
SibSp|Impute with mean|Rescaling (0-1)|
Parch|Impute with mean||Not rescaled (used as raw counts)
Ticket|||Ignored
Fare|||Ignored
Cabin|||Ignored
Embarked|||Ignored

### Model
  * 16 hidden layers of 128 units with ReLU (one input-connected layer plus 15 more)
  * Dropout .4 after each of the last 15 hidden layers
  * loss='categorical_crossentropy', optimizer='adam'
  * 500 epochs


### Kaggle score
  * 0.75598


In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

In [5]:
# Read the Kaggle train/test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (r"./data/train.csv", r"./data/test.csv")
)

In [6]:
# Show the first rows of the training data as loaded from the CSV.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Show the first rows of the test data as loaded from the CSV.
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# Remove the columns that are not used as model features from both frames.
cols = ["Name", "Ticket", "Embarked", "Cabin", "Fare"]
for frame in (train_data, test_data):
    frame.drop(cols, axis=1, inplace=True)
train_data.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,male,22.0,1,0
1,2,1,1,female,38.0,1,0
2,3,1,3,female,26.0,0,0
3,4,1,1,female,35.0,1,0
4,5,0,3,male,35.0,0,0
5,6,0,3,male,,0,0
6,7,0,1,male,54.0,0,0
7,8,0,3,male,2.0,3,1
8,9,1,3,female,27.0,0,2
9,10,1,2,female,14.0,1,0


In [9]:
# Fill the NaNs in Age, SibSp and Parch with the mean of the training data of that column
def fillna_n(col, n):
    """Replace the missing values of ``col`` with ``n`` and return the column.

    The Series that is passed in is still filled in place, and it is also
    returned so the caller can assign the result back to its DataFrame.

    Parameters:
        col: pandas Series whose NaNs should be replaced.
        n: value used for every missing entry.

    Returns:
        The same Series object, with its NaNs replaced by ``n``.
    """
    col.fillna(n, inplace=True)
    return col

# The means come from the training data only and are reused for the test data.
mean_age = train_data["Age"].mean()
mean_sibsp = train_data["SibSp"].mean()
mean_parch = train_data["Parch"].mean()

# Assign the filled column back explicitly: `frame["col"]` may be a copy of the
# underlying data (pandas copy-on-write), in which case an in-place fill on it
# would never reach the DataFrame and the NaNs would silently remain.
train_data["Age"] = fillna_n(train_data["Age"], mean_age)
test_data["Age"] = fillna_n(test_data["Age"], mean_age)
train_data["SibSp"] = fillna_n(train_data["SibSp"], mean_sibsp)
test_data["SibSp"] = fillna_n(test_data["SibSp"], mean_sibsp)
train_data["Parch"] = fillna_n(train_data["Parch"], mean_parch)
test_data["Parch"] = fillna_n(test_data["Parch"], mean_parch)


In [10]:
# Report the per-column count of missing values left in each frame.
for frame in (train_data, test_data):
    print(frame.isnull().sum())

# Rows that still contain at least one NaN; only the last expression (the test
# rows) is rendered as the cell output.
train_data[train_data.isnull().any(axis=1)]
test_data[test_data.isnull().any(axis=1)]

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
dtype: int64
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
dtype: int64


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch


In [11]:
from sklearn.preprocessing import LabelEncoder

# Turn the "Sex" strings into integer codes using one encoder for both frames.
# With these two labels the encoder yields female -> 0 and male -> 1.
le = LabelEncoder().fit(["male", "female"])
for frame in (train_data, test_data):
    frame["Sex"] = le.transform(frame["Sex"])


In [12]:
from sklearn.preprocessing import MinMaxScaler

# Rescale Age, SibSp and Pclass to the 0-1 range.
# The scaler is fitted on the TRAINING column only and the same mapping is then
# applied to the test column; refitting on the test data would map identical
# raw values to different inputs than the ones the model was trained on.
# Test values outside the training min/max can therefore land slightly outside 0-1.
scaler = MinMaxScaler()
for column in ["Age", "SibSp", "Pclass"]:
    scaler.fit(train_data[column].values.reshape(-1, 1))
    for data in [train_data, test_data]:
        scaled = scaler.transform(data[column].values.reshape(-1, 1))
        # transform returns shape (n, 1); flatten it before storing it as a column.
        data[column] = scaled.ravel()

train_data.head()



Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,1.0,1,0.271174,0.125,0
1,2,1,0.0,0,0.472229,0.125,0
2,3,1,1.0,0,0.321438,0.0,0
3,4,1,0.0,0,0.434531,0.125,0
4,5,0,1.0,1,0.434531,0.0,0


In [13]:
# Persist the prepared frames so the preprocessing result can be inspected later.
for frame, csv_path in (
    (train_data, "./data/train_prepped.csv"),
    (test_data, "./data/test_prepped.csv"),
):
    frame.to_csv(csv_path)

In [14]:
# Keep the test PassengerId values for the submission file and remove the
# column from both frames so it is not fed to the model.
test_passenger_id = test_data.pop("PassengerId")
del train_data["PassengerId"]

# One column per class (0 and 1), matching the two-unit softmax output.
y = pd.get_dummies(train_data['Survived'])
y.head()

Unnamed: 0,0,1
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0


In [15]:
# Model inputs: every training column except the "Survived" label.
feature_columns = [name for name in train_data.columns if name != "Survived"]
x = train_data[feature_columns]
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch
0,1.0,1,0.271174,0.125,0
1,0.0,0,0.472229,0.125,0
2,1.0,0,0.321438,0.0,0
3,0.0,0,0.434531,0.125,0
4,1.0,1,0.434531,0.0,0


In [16]:
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [17]:
# Assemble the network as a list of layers and hand it to Sequential in one go.
# First hidden layer: takes one input per feature column of x.
layers = [
    Dense(input_dim=x.shape[1], units=128, kernel_initializer='normal', bias_initializer='zeros'),
    Activation('relu'),
]

# Fifteen further hidden layers, each followed by ReLU and dropout.
for _ in range(15):
    layers.extend([
        Dense(units=128, kernel_initializer='normal', bias_initializer='zeros'),
        Activation('relu'),
        Dropout(.4),
    ])

# Output layer: two units with softmax, one per column of the one-hot label y.
layers.extend([Dense(units=2), Activation('softmax')])

model = Sequential(layers)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# Train on the full training set; verbose=2 prints one line per epoch.
history = model.fit(x.values, y.values, epochs=500, verbose=2)
history

Epoch 1/500
1s - loss: 0.6807 - acc: 0.6105
Epoch 2/500
0s - loss: 0.6569 - acc: 0.6162
Epoch 3/500
0s - loss: 0.5829 - acc: 0.6229
Epoch 4/500
0s - loss: 0.6496 - acc: 0.6723
Epoch 5/500
0s - loss: 0.5637 - acc: 0.6914
Epoch 6/500
0s - loss: 0.6105 - acc: 0.7542
Epoch 7/500
0s - loss: 0.5920 - acc: 0.7295
Epoch 8/500
0s - loss: 0.5434 - acc: 0.7924
Epoch 9/500
0s - loss: 0.5670 - acc: 0.7643
Epoch 10/500
0s - loss: 0.5333 - acc: 0.7980
Epoch 11/500
0s - loss: 0.5217 - acc: 0.7811
Epoch 12/500
0s - loss: 0.5272 - acc: 0.7991
Epoch 13/500
0s - loss: 0.5248 - acc: 0.7879
Epoch 14/500
0s - loss: 0.5470 - acc: 0.7879
Epoch 15/500
0s - loss: 0.5179 - acc: 0.8070
Epoch 16/500
0s - loss: 0.5064 - acc: 0.8148
Epoch 17/500
0s - loss: 0.5004 - acc: 0.8092
Epoch 18/500
0s - loss: 0.4967 - acc: 0.8103
Epoch 19/500
0s - loss: 0.5122 - acc: 0.8215
Epoch 20/500
0s - loss: 0.5136 - acc: 0.8002
Epoch 21/500
0s - loss: 0.4928 - acc: 0.8103
Epoch 22/500
0s - loss: 0.4840 - acc: 0.8092
Epoch 23/500
0s - l

0s - loss: 0.4552 - acc: 0.7901
Epoch 183/500
0s - loss: 0.4878 - acc: 0.7834
Epoch 184/500
0s - loss: 0.4981 - acc: 0.7666
Epoch 185/500
0s - loss: 0.4769 - acc: 0.7823
Epoch 186/500
0s - loss: 0.4506 - acc: 0.8058
Epoch 187/500
0s - loss: 0.4527 - acc: 0.7924
Epoch 188/500
0s - loss: 0.4740 - acc: 0.7969
Epoch 189/500
0s - loss: 0.4897 - acc: 0.7845
Epoch 190/500
0s - loss: 0.4587 - acc: 0.7980
Epoch 191/500
0s - loss: 0.4500 - acc: 0.8193
Epoch 192/500
0s - loss: 0.4245 - acc: 0.8137
Epoch 193/500
0s - loss: 0.4419 - acc: 0.8137
Epoch 194/500
0s - loss: 0.4512 - acc: 0.8047
Epoch 195/500
0s - loss: 0.4494 - acc: 0.8204
Epoch 196/500
0s - loss: 0.4402 - acc: 0.8036
Epoch 197/500
0s - loss: 0.4408 - acc: 0.8114
Epoch 198/500
0s - loss: 0.4405 - acc: 0.8058
Epoch 199/500
0s - loss: 0.4262 - acc: 0.8204
Epoch 200/500
0s - loss: 0.4408 - acc: 0.8249
Epoch 201/500
0s - loss: 0.4346 - acc: 0.8159
Epoch 202/500
0s - loss: 0.4427 - acc: 0.7935
Epoch 203/500
0s - loss: 0.4513 - acc: 0.7924
Ep

0s - loss: 0.3973 - acc: 0.8215
Epoch 362/500
0s - loss: 0.4021 - acc: 0.8328
Epoch 363/500
0s - loss: 0.4049 - acc: 0.8272
Epoch 364/500
0s - loss: 0.4281 - acc: 0.8171
Epoch 365/500
0s - loss: 0.4268 - acc: 0.8283
Epoch 366/500
0s - loss: 0.4013 - acc: 0.8171
Epoch 367/500
0s - loss: 0.4092 - acc: 0.8092
Epoch 368/500
0s - loss: 0.4189 - acc: 0.8260
Epoch 369/500
0s - loss: 0.4331 - acc: 0.8025
Epoch 370/500
0s - loss: 0.4114 - acc: 0.8025
Epoch 371/500
0s - loss: 0.4165 - acc: 0.8249
Epoch 372/500
0s - loss: 0.4180 - acc: 0.8103
Epoch 373/500
0s - loss: 0.4150 - acc: 0.8103
Epoch 374/500
0s - loss: 0.4086 - acc: 0.8092
Epoch 375/500
0s - loss: 0.3910 - acc: 0.8238
Epoch 376/500
0s - loss: 0.4059 - acc: 0.8204
Epoch 377/500
0s - loss: 0.4206 - acc: 0.8305
Epoch 378/500
0s - loss: 0.4065 - acc: 0.8227
Epoch 379/500
0s - loss: 0.4102 - acc: 0.8328
Epoch 380/500
0s - loss: 0.4592 - acc: 0.8058
Epoch 381/500
0s - loss: 0.5012 - acc: 0.7912
Epoch 382/500
0s - loss: 0.5332 - acc: 0.7497
Ep

<keras.callbacks.History at 0x7fa3d0657940>

In [19]:
# Predict a class (0 or 1) for every test passenger.
# `Sequential.predict_classes` no longer exists in newer Keras/TensorFlow
# releases; taking the argmax over the softmax probabilities gives the same
# class indices and works on old and new versions alike.
p_survived = np.argmax(model.predict(test_data.values), axis=-1)

 32/418 [=>............................] - ETA: 0s

In [20]:
# Build the Kaggle submission table (PassengerId, Survived) and write it
# without the index column.
submission = pd.DataFrame(
    {'PassengerId': test_passenger_id, 'Survived': p_survived},
    columns=['PassengerId', 'Survived'],
)
submission.to_csv('./data/titanic_keras_cs.csv', index=False)