# Kaggle Titanic

### Data preparation

Column | Missing Data | Feature Scaling | Comment
--- |:---:|:---:|:---
PassengerId|||Needed for submission
Survived|||Training label
Pclass||As is|
Name|||Ignored
Sex||Encoded as 0 (female) or 1 (male)|
Age|Impute with mean|As is|
SibSp|Impute with mean|As is|
Parch|Impute with mean|As is|
Ticket|||Ignored
Fare|||Ignored
Cabin|||Ignored
Embarked|||Ignored

### Model
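  * 16 hidden layers of 128 units with ReLU (one input-facing layer plus 15 repeated layers)
  * Dropout .4 after each of the 15 repeated layers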
  * loss='categorical_crossentropy', optimizer='adam'
  * 500 epochs


### Kaggle score
  * 0.75598


### Change history
Score|Comment
---|:---
0.75598|First attempt
0.73684|Changed Pclass and Sex to one hot encoding
0.72248|Undid the last change and rescaled Parch as well
0.77990|

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

  return f(*args, **kwds)


In [2]:
# Read the raw Titanic CSVs: the labelled training set and the unlabelled test set
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (r"./data/train.csv", r"./data/test.csv")
)

In [3]:
# Preview the first five rows of the raw training data
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first five rows of the raw test data (same columns, minus the Survived label)
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Delete the data we don't need.
# drop() is used without inplace=True and with errors="ignore": the frames are
# rebound to new objects and columns that are already gone are skipped, so
# re-running this cell is a no-op instead of a KeyError.
cols = ["Name", "Ticket", "Embarked", "Cabin", "Fare"]
train_data = train_data.drop(cols, axis=1, errors="ignore")
test_data = test_data.drop(cols, axis=1, errors="ignore")
train_data.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,male,22.0,1,0
1,2,1,1,female,38.0,1,0
2,3,1,3,female,26.0,0,0
3,4,1,1,female,35.0,1,0
4,5,0,3,male,35.0,0,0
5,6,0,3,male,,0,0
6,7,0,1,male,54.0,0,0
7,8,0,3,male,2.0,3,1
8,9,1,3,female,27.0,0,2
9,10,1,2,female,14.0,1,0


In [6]:
# Fill the NaNs in Age, SibSp and Parch with the mean of the training data of that column
def fillna_n(col, n):
    """Replace the missing values of a Series with ``n``.

    Parameters
    ----------
    col : pandas.Series
        Column to fill. It is filled in place, as before.
    n : scalar
        Value written wherever ``col`` is NaN.

    Returns
    -------
    pandas.Series
        The same ``col`` object, now without NaNs. Callers should assign it
        back to the DataFrame column: a Series obtained with ``df["name"]`` is
        not guaranteed to share memory with ``df`` (it never does under pandas
        Copy-on-Write), so the in-place fill alone may not reach the frame.
    """
    col.fillna(n, inplace=True)
    return col


# The fill values are computed on the training set only and reused for the
# test set, so both frames are imputed with the same numbers.
mean_age = train_data["Age"].mean()
mean_sibsp = train_data["SibSp"].mean()
mean_parch = train_data["Parch"].mean()

# Assign the filled column back explicitly instead of relying on the in-place
# fill propagating from the selected Series to its parent frame.
train_data["Age"] = fillna_n(train_data["Age"], mean_age)
test_data["Age"] = fillna_n(test_data["Age"], mean_age)
train_data["SibSp"] = fillna_n(train_data["SibSp"], mean_sibsp)
test_data["SibSp"] = fillna_n(test_data["SibSp"], mean_sibsp)
train_data["Parch"] = fillna_n(train_data["Parch"], mean_parch)
test_data["Parch"] = fillna_n(test_data["Parch"], mean_parch)


In [7]:
# What NaNs do we still have?
print(train_data.isnull().sum())
print(test_data.isnull().sum())
# Only the last expression of a cell is rendered, so the training rows that
# still contain a NaN are printed explicitly; as a bare expression in the
# middle of the cell that lookup was computed and then thrown away.
print(train_data[train_data.isnull().any(axis=1)])
test_data[test_data.isnull().any(axis=1)]

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
dtype: int64
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
dtype: int64


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch


In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode Sex as an integer code. LabelEncoder sorts its classes, so "female"
# becomes 0 and "male" becomes 1. The same fitted encoder is applied to both
# frames so the codes agree between train and test.
le = LabelEncoder().fit(["male","female"])
for frame in (train_data, test_data):
    frame["Sex"] = le.transform(frame["Sex"])

# Disabled alternative: one-hot encoding of Sex and Pclass (see the change
# history at the top of the notebook for the score it produced).
# train_data = pd.get_dummies(train_data,columns=["Sex","Pclass"])
# test_data = pd.get_dummies(test_data,columns=["Sex","Pclass"])
# train_data.head()

In [9]:
from sklearn.preprocessing import MinMaxScaler

# The scaler is created but never applied: the min-max scaling experiment
# below is commented out, so every feature reaches the model unscaled.
scaler = MinMaxScaler()

# NOTE(review): before re-enabling the block below,
#   1. the "Parch" line reads data["SibSp"], so Parch would be overwritten
#      with scaled SibSp values;
#   2. fit_transform runs separately on the test frame, so train and test
#      would be scaled with different ranges.
# for data in [train_data, test_data]:
#     data["Age"] = scaler.fit_transform(data["Age"].values.reshape(-1,1))
#     data["SibSp"] = scaler.fit_transform(data["SibSp"].values.reshape(-1,1))
#     data["Parch"] = scaler.fit_transform(data["SibSp"].values.reshape(-1,1))
#     data["Pclass"] = scaler.fit_transform(data["Pclass"].values.reshape(-1,1))

train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,1,22.0,1,0
1,2,1,1,0,38.0,1,0
2,3,1,3,0,26.0,0,0
3,4,1,1,0,35.0,1,0
4,5,0,3,1,35.0,0,0


In [10]:
# Persist the cleaned frames. The row index is written too (to_csv default),
# so it shows up as an unnamed first column when the files are read back.
train_data.to_csv(path_or_buf="./data/train_prepped.csv")
test_data.to_csv(path_or_buf="./data/test_prepped.csv")

In [11]:
# Keep the test-set passenger ids for the submission file, then remove the id
# column from both frames so it is not fed to the network as a feature.
test_passenger_id = test_data.pop("PassengerId")
del train_data["PassengerId"]

# One-hot encode the label: column 0 marks Survived == 0, column 1 marks
# Survived == 1, matching the 2-unit softmax output of the model.
y = pd.get_dummies(train_data["Survived"])
y.head(n=5)

Unnamed: 0,0,1
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0


In [12]:
# Feature matrix: every remaining training column except the label
x = train_data.drop(labels=["Survived"], axis="columns")
x.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch
0,3,1,22.0,1,0
1,1,0,38.0,1,0
2,3,0,26.0,0,0
3,1,0,35.0,1,0
4,3,1,35.0,0,0


In [13]:
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [14]:
# Settings shared by every hidden Dense layer
hidden_kwargs = dict(units=128, kernel_initializer='normal', bias_initializer='zeros')

model = Sequential()

# Input-facing hidden layer: one input per feature column, no dropout
model.add(Dense(input_dim=x.shape[1], **hidden_kwargs))
model.add(Activation('relu'))

# 15 further hidden blocks of Dense -> ReLU -> Dropout
for block_idx in range(15):
    for layer in (Dense(**hidden_kwargs), Activation('relu'), Dropout(.4)):
        model.add(layer)

# Two-way softmax output, one unit per column of the one-hot label
model.add(Dense(units=2))
model.add(Activation('softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [15]:
# Train on the whole training set (no validation split); verbose=2 logs one line per epoch
model.fit(x=x.values, y=y.values, epochs=500, verbose=2)

Epoch 1/500
0s - loss: 0.6815 - acc: 0.6083
Epoch 2/500
0s - loss: 0.6693 - acc: 0.6162
Epoch 3/500
0s - loss: 0.6658 - acc: 0.6162
Epoch 4/500
0s - loss: 0.6675 - acc: 0.6162
Epoch 5/500
0s - loss: 0.6513 - acc: 0.6162
Epoch 6/500
0s - loss: 0.6452 - acc: 0.6162
Epoch 7/500
0s - loss: 0.6236 - acc: 0.6162
Epoch 8/500
0s - loss: 0.6019 - acc: 0.6364
Epoch 9/500
0s - loss: 0.5785 - acc: 0.7273
Epoch 10/500
0s - loss: 0.6205 - acc: 0.7104
Epoch 11/500
0s - loss: 0.5590 - acc: 0.7688
Epoch 12/500
0s - loss: 0.5727 - acc: 0.7688
Epoch 13/500
0s - loss: 0.5363 - acc: 0.7778
Epoch 14/500
0s - loss: 0.5819 - acc: 0.7576
Epoch 15/500
0s - loss: 0.5162 - acc: 0.7957
Epoch 16/500
0s - loss: 0.5278 - acc: 0.7991
Epoch 17/500
0s - loss: 0.5360 - acc: 0.7845
Epoch 18/500
0s - loss: 0.5199 - acc: 0.7935
Epoch 19/500
0s - loss: 0.5460 - acc: 0.7677
Epoch 20/500
0s - loss: 0.5400 - acc: 0.7834
Epoch 21/500
0s - loss: 0.5146 - acc: 0.7980
Epoch 22/500
0s - loss: 0.5002 - acc: 0.8193
Epoch 23/500
0s - l

0s - loss: 0.4428 - acc: 0.8316
Epoch 183/500
0s - loss: 0.4405 - acc: 0.8114
Epoch 184/500
0s - loss: 0.4358 - acc: 0.8238
Epoch 185/500
0s - loss: 0.4542 - acc: 0.8215
Epoch 186/500
0s - loss: 0.4368 - acc: 0.8081
Epoch 187/500
0s - loss: 0.4167 - acc: 0.8305
Epoch 188/500
0s - loss: 0.4207 - acc: 0.8103
Epoch 189/500
0s - loss: 0.4188 - acc: 0.8193
Epoch 190/500
0s - loss: 0.4296 - acc: 0.8193
Epoch 191/500
0s - loss: 0.4090 - acc: 0.8182
Epoch 192/500
0s - loss: 0.4468 - acc: 0.8227
Epoch 193/500
0s - loss: 0.4381 - acc: 0.8182
Epoch 194/500
0s - loss: 0.4090 - acc: 0.8384
Epoch 195/500
0s - loss: 0.4327 - acc: 0.8036
Epoch 196/500
0s - loss: 0.4209 - acc: 0.8283
Epoch 197/500
0s - loss: 0.4237 - acc: 0.8249
Epoch 198/500
0s - loss: 0.4391 - acc: 0.8260
Epoch 199/500
0s - loss: 0.4332 - acc: 0.8103
Epoch 200/500
0s - loss: 0.4040 - acc: 0.8283
Epoch 201/500
0s - loss: 0.4316 - acc: 0.8204
Epoch 202/500
0s - loss: 0.4151 - acc: 0.8260
Epoch 203/500
0s - loss: 0.4133 - acc: 0.8204
Ep

0s - loss: 0.4316 - acc: 0.8002
Epoch 362/500
0s - loss: 0.4110 - acc: 0.8328
Epoch 363/500
0s - loss: 0.4312 - acc: 0.8193
Epoch 364/500
0s - loss: 0.4159 - acc: 0.8215
Epoch 365/500
0s - loss: 0.4137 - acc: 0.8148
Epoch 366/500
0s - loss: 0.4103 - acc: 0.8227
Epoch 367/500
0s - loss: 0.4099 - acc: 0.8283
Epoch 368/500
0s - loss: 0.4165 - acc: 0.8395
Epoch 369/500
0s - loss: 0.4317 - acc: 0.8316
Epoch 370/500
0s - loss: 0.4169 - acc: 0.8238
Epoch 371/500
0s - loss: 0.4238 - acc: 0.8238
Epoch 372/500
0s - loss: 0.4350 - acc: 0.8070
Epoch 373/500
0s - loss: 0.4491 - acc: 0.7980
Epoch 374/500
0s - loss: 0.4249 - acc: 0.8182
Epoch 375/500
0s - loss: 0.4371 - acc: 0.8171
Epoch 376/500
0s - loss: 0.4245 - acc: 0.8148
Epoch 377/500
0s - loss: 0.4650 - acc: 0.8126
Epoch 378/500
0s - loss: 0.4544 - acc: 0.8126
Epoch 379/500
0s - loss: 0.4345 - acc: 0.8328
Epoch 380/500
0s - loss: 0.4508 - acc: 0.8159
Epoch 381/500
0s - loss: 0.4467 - acc: 0.8081
Epoch 382/500
0s - loss: 0.4396 - acc: 0.8249
Ep

<keras.callbacks.History at 0x7f397db04ac8>

In [16]:
# The predicted class is the index of the larger softmax output, which is the
# Survived value itself because the label columns are ordered 0, 1.
# Sequential.predict_classes was removed from Keras in TensorFlow 2.6; taking
# the argmax of predict() yields the same labels on old and new versions.
p_survived = np.argmax(model.predict(test_data.values), axis=-1)

 32/418 [=>............................] - ETA: 1s

In [17]:
# Build the two-column submission (passenger id + predicted label) and write
# it without the row index.
submission = pd.DataFrame({'PassengerId': test_passenger_id, 'Survived': p_survived})
submission.to_csv('./data/submission.csv', index=False)