In [2]:
import numpy as n
import pandas as pd
from keras.models import Sequential
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout

In [3]:
raw_train = pd.read_csv("./train.csv", index_col=0)
raw_train["is_test"] = 0
raw_test = pd.read_csv("./test.csv", index_col=0)
raw_test["is_test"] = 1

In [8]:
all_data = pd.concat([raw_train, raw_test], axis=0, sort=False)

# Functions to preprocess the data

In [14]:
def get_title_last_name(name):
    full_name = name.str.split(", ", n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split(",", n=0, expand=True)
    titles = titles[0]
    return titles

def get_titles_from_names(df):
    df["Title"] = get_title_last_name(df["Name"])
    df = df.drop(["Name"], axis=1)
    return df

def get_dummy_cats(df):
    return pd.get_dummies(df, columns=["Title", "Pclass", "Sex", "Embarked", "Cabin", "Cabin_letter"])

def get_cabin_letter(df):
    df["Cabin"].fillna("Z", inplace=True)
    df["Cabin_letter"] = df["Cabin"].str[0]
    return df

def process_data(df):
    df = get_titles_from_names(df)
    df["Embarked"].fillna("S", inplace=True)
    df = get_cabin_letter(df)
    df = df.drop(["Ticket", "Fare"], axis=1)
    df = get_dummy_cats(df)
    return df

In [15]:
proc_data = process_data(all_data)

In [21]:
proc_train = proc_data[proc_data["is_test"] == 0]
proc_test = proc_data[proc_data["is_test"] == 1]

# Build Network to predict missing ages

In [22]:
for_age_train = proc_data.drop(["Survived", "is_test"], axis=1).dropna(axis=0)
X_train_age = for_age_train.drop("Age", axis=1)
y_train_age = for_age_train["Age"]

In [23]:
tmodel = Sequential()
tmodel.add(Dense(input_dim=X_train_age.shape[1], units=128, kernel_initializer="normal", bias_initializer="zeros"))
tmodel.add(Activation("relu"))

for i in range(0, 8):
    tmodel.add(Dense(units=64, kernel_initializer="normal", bias_initializer="zeros"))
    tmodel.add(Activation("relu"))
    tmodel.add(Dropout(0.25))
    
tmodel.add(Dense(units=1))
tmodel.add(Activation("linear"))

tmodel.compile(loss="mean_squared_error", optimizer="rmsprop")

In [24]:
tmodel.fit(X_train_age.values, y_train_age.values, epochs=600, verbose=2)

Epoch 1/600
 - 1s - loss: 561.1886
Epoch 2/600
 - 0s - loss: 249.7009
Epoch 3/600
 - 0s - loss: 235.9208
Epoch 4/600
 - 0s - loss: 223.4413
Epoch 5/600
 - 0s - loss: 199.4707
Epoch 6/600
 - 0s - loss: 204.7505
Epoch 7/600
 - 0s - loss: 159.3040
Epoch 8/600
 - 0s - loss: 172.6639
Epoch 9/600
 - 0s - loss: 154.3369
Epoch 10/600
 - 0s - loss: 134.6231
Epoch 11/600
 - 0s - loss: 135.1139
Epoch 12/600
 - 0s - loss: 118.8031
Epoch 13/600
 - 0s - loss: 113.5410
Epoch 14/600
 - 0s - loss: 111.5692
Epoch 15/600
 - 0s - loss: 120.7680
Epoch 16/600
 - 0s - loss: 101.8734
Epoch 17/600
 - 0s - loss: 98.2487
Epoch 18/600
 - 0s - loss: 90.7652
Epoch 19/600
 - 0s - loss: 88.7988
Epoch 20/600
 - 0s - loss: 98.3835
Epoch 21/600
 - 0s - loss: 77.5850
Epoch 22/600
 - 0s - loss: 86.4292
Epoch 23/600
 - 0s - loss: 86.3338
Epoch 24/600
 - 0s - loss: 77.9686
Epoch 25/600
 - 0s - loss: 77.7754
Epoch 26/600
 - 0s - loss: 86.0785
Epoch 27/600
 - 0s - loss: 69.8729
Epoch 28/600
 - 0s - loss: 76.1407
Epoch 29/600


<keras.callbacks.History at 0x7fa35c3106a0>

In [26]:
train_data = proc_train
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,is_test,Title_Capt. Edward Gifford,Title_Col. Archibald IV,Title_Col. John,Title_Col. John Jacob,Title_Col. Oberst Alfons,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
18,1.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
20,1.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
27,0.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
29,1.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
30,0.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
32,1.0,,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
33,1.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
37,1.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
43,0.0,,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [27]:
to_pred = train_data.loc[train_data['Age'].isnull()].drop(['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
train_data['Age'].loc[train_data['Age'].isnull()] = p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
test_data = proc_test
to_pred = test_data.loc[test_data['Age'].isnull()].drop(['Age', 'Survived', 'is_test'], axis=1)
p = tmodel.predict(to_pred.values)
test_data['Age'].loc[test_data['Age'].isnull()] = p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [29]:
train_data.loc[train_data['Age'].isnull()]

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,is_test,Title_Capt. Edward Gifford,Title_Col. Archibald IV,Title_Col. John,Title_Col. John Jacob,Title_Col. Oberst Alfons,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [30]:
test_data.loc[test_data['Age'].isnull()]

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,is_test,Title_Capt. Edward Gifford,Title_Col. Archibald IV,Title_Col. John,Title_Col. John Jacob,Title_Col. Oberst Alfons,...,Cabin_Z,Cabin_letter_A,Cabin_letter_B,Cabin_letter_C,Cabin_letter_D,Cabin_letter_E,Cabin_letter_F,Cabin_letter_G,Cabin_letter_T,Cabin_letter_Z
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [31]:
y = pd.get_dummies(train_data['Survived'])
y.head()

Unnamed: 0_level_0,0.0,1.0
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,0,1
3,0,1
4,0,1
5,1,0


In [32]:
X = train_data.drop(['Survived', 'is_test'], axis=1)

In [35]:
X.shape

(891, 1350)

In [36]:
y.shape

(891, 2)

In [34]:
# create model
model = Sequential()
model.add(Dense(input_dim=X.shape[1], units=128,
                kernel_initializer='normal', bias_initializer='zeros'))
model.add(Activation('relu'))

for i in range(0, 15):
    model.add(Dense(units=128, 
                    kernel_initializer='normal', bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Dropout(0.4))
    
model.add(Dense(units=2))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [37]:
model.fit(X.values, y.values, epochs=500, verbose=2)

Epoch 1/500
 - 1s - loss: 0.6815 - acc: 0.6117
Epoch 2/500
 - 0s - loss: 0.6693 - acc: 0.6162
Epoch 3/500
 - 0s - loss: 0.6681 - acc: 0.6162
Epoch 4/500
 - 0s - loss: 0.6715 - acc: 0.6162
Epoch 5/500
 - 0s - loss: 0.6698 - acc: 0.6162
Epoch 6/500
 - 0s - loss: 0.6672 - acc: 0.6162
Epoch 7/500
 - 0s - loss: 0.6674 - acc: 0.6162
Epoch 8/500
 - 0s - loss: 0.6668 - acc: 0.6162
Epoch 9/500
 - 0s - loss: 0.6664 - acc: 0.6162
Epoch 10/500
 - 0s - loss: 0.6500 - acc: 0.6162
Epoch 11/500
 - 0s - loss: 0.6323 - acc: 0.6162
Epoch 12/500
 - 0s - loss: 0.5498 - acc: 0.6532
Epoch 13/500
 - 0s - loss: 0.5835 - acc: 0.7598
Epoch 14/500
 - 0s - loss: 0.5163 - acc: 0.7946
Epoch 15/500
 - 0s - loss: 0.4463 - acc: 0.8305
Epoch 16/500
 - 0s - loss: 0.3517 - acc: 0.8788
Epoch 17/500
 - 0s - loss: 0.2993 - acc: 0.8934
Epoch 18/500
 - 0s - loss: 0.3135 - acc: 0.8934
Epoch 19/500
 - 0s - loss: 0.3539 - acc: 0.8676
Epoch 20/500
 - 0s - loss: 0.2837 - acc: 0.9046
Epoch 21/500
 - 0s - loss: 0.2486 - acc: 0.9102
E

<keras.callbacks.History at 0x7fa35c310e10>

In [38]:
test_data.columns

Index(['Survived', 'Age', 'SibSp', 'Parch', 'is_test',
       'Title_Capt. Edward Gifford', 'Title_Col. Archibald IV',
       'Title_Col. John', 'Title_Col. John Jacob', 'Title_Col. Oberst Alfons',
       ...
       'Cabin_Z', 'Cabin_letter_A', 'Cabin_letter_B', 'Cabin_letter_C',
       'Cabin_letter_D', 'Cabin_letter_E', 'Cabin_letter_F', 'Cabin_letter_G',
       'Cabin_letter_T', 'Cabin_letter_Z'],
      dtype='object', length=1352)

In [39]:
p_survived = model.predict_classes(test_data.drop(['Survived', 'is_test'], axis=1).values)

In [40]:
p_survived

array([0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0,

In [41]:
submission = pd.DataFrame()
submission['PassengerId'] = test_data.index
submission['Survived'] = p_survived

In [42]:
submission.shape

(418, 2)

In [43]:
submission.to_csv('titanic_keras_cs.csv', index=False)

In [None]:
#!kaggle competitions submit -c titanic -f titanic_keras_cs.csv -m "initial submit"

In [None]:
# reference: https://www.kaggle.com/cstahl12/titanic-with-keras