In [455]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras.optimizers import SGD, Adam
from keras.utils import np_utils
import os
from time import gmtime, strftime
import datetime
from keras.callbacks import TensorBoard
import tensorflow as tf
from keras import regularizers

In [467]:
# Network architecture and training hyperparameters.

# Seed every RNG the notebook relies on (Python, NumPy, TensorFlow) so that
# Restart & Run All reproduces the data split and the weight initialisation.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

NB_EPOCH = 1000
BATCH_SIZE = 100
VERBOSE = 0
NB_CLASSES = 1  # single output unit: Survived
# `epsilon=None` and `decay=0.0` were legacy spellings of the library defaults;
# newer Keras optimizers reject the `decay` keyword, so both are omitted here.
OPTIMIZER = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
N_HIDDEN = 128
VALIDATION_SPLIT = 0.1  # fraction held out, both by train_test_split and by fit()
RESHAPED = 784  # not referenced by any other cell in the visible notebook
DROPOUT = 0.3  # only referenced by commented-out Dropout layers
REGULARIZER = regularizers.l2(0.003)

In [426]:
# Load the Titanic training and test CSVs (paths are relative to the notebook).
# The stray mid-cell `df` expression was removed: only the last expression of a
# cell is displayed, so it had no effect.
df = pd.read_csv('data/titanic_train.csv')
df_test = pd.read_csv('data/titanic_test.csv')

In [427]:
# Give missing Cabin / Embarked entries an explicit placeholder label so they are
# treated as a category of their own by the one-hot encoding that follows.
df = df.fillna({'Cabin': 'CABIN-NULL', 'Embarked': 'EMBARKED-NULL'})
df_test = df_test.fillna({'Cabin': 'CABIN-NULL', 'Embarked': 'EMBARKED-NULL'})

# Map the Sex strings to integer codes; the mapping is learned from the training
# frame and then applied unchanged to both frames.
labelEncoder = preprocessing.LabelEncoder()
labelEncoder.fit(df['Sex'])
df['Sex'] = labelEncoder.transform(df['Sex'])
df_test['Sex'] = labelEncoder.transform(df_test['Sex'])

In [428]:
# One-hot encode Cabin. The encoder is fitted on the train and test values together
# so that both frames end up with the same set of indicator columns.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    oneHotEncoder1 = preprocessing.OneHotEncoder(categories='auto', sparse_output=False)
except TypeError:
    # Older releases only accept the original `sparse` keyword.
    oneHotEncoder1 = preprocessing.OneHotEncoder(categories='auto', sparse=False)
oneHotEncoder1.fit(pd.concat([df['Cabin'], df_test['Cabin']]).values.reshape(-1, 1))

# Indicator matrix for the training rows, with one column per learned category
# (column labels are the category values as strings).
cabin = pd.DataFrame(
    oneHotEncoder1.transform(df['Cabin'].values.reshape(-1, 1)),
    columns=pd.Series(oneHotEncoder1.categories_[0], dtype=str),
)

In [429]:
# Indicator matrix for the test rows, produced by the already-fitted Cabin encoder
# and labelled with that encoder's category values.
cabin_test = pd.DataFrame(
    oneHotEncoder1.transform(df_test[['Cabin']].values),
    columns=pd.Series(oneHotEncoder1.categories_[0], dtype=str),
)

In [430]:
# One-hot encode Embarked. The encoder is fitted on the train and test values
# together so that both frames end up with the same set of indicator columns.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    oneHotEncoder2 = preprocessing.OneHotEncoder(categories='auto', sparse_output=False)
except TypeError:
    # Older releases only accept the original `sparse` keyword.
    oneHotEncoder2 = preprocessing.OneHotEncoder(categories='auto', sparse=False)
oneHotEncoder2.fit(pd.concat([df['Embarked'], df_test['Embarked']]).values.reshape(-1, 1))

# Indicator matrix for the training rows, with one column per learned category
# (column labels are the category values as strings).
embarked = pd.DataFrame(
    oneHotEncoder2.transform(df['Embarked'].values.reshape(-1, 1)),
    columns=pd.Series(oneHotEncoder2.categories_[0], dtype=str),
)

In [431]:
# Indicator matrix for the test rows, produced by the already-fitted Embarked
# encoder and labelled with that encoder's category values.
embarked_test = pd.DataFrame(
    oneHotEncoder2.transform(df_test[['Embarked']].values),
    columns=pd.Series(oneHotEncoder2.categories_[0], dtype=str),
)

In [432]:
# Append the Cabin / Embarked indicator columns to each frame.
# Indicator columns left over from an earlier run of this cell are dropped first,
# so re-running the cell does not duplicate them. On a first run the drop is a
# no-op (assuming no raw column shares its name with a category value).
df = pd.concat(
    [df.drop(columns=[*cabin.columns, *embarked.columns], errors='ignore'), cabin, embarked],
    axis=1,
)
df_test = pd.concat(
    [
        df_test.drop(columns=[*cabin_test.columns, *embarked_test.columns], errors='ignore'),
        cabin_test,
        embarked_test,
    ],
    axis=1,
)

In [433]:
# Feature columns: every column of the training frame except the target and the
# raw columns listed below (the last two are represented by indicator columns).
col = df.columns[
    ~df.columns.isin(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'])
]
col

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'A10', 'A11', 'A14',
       'A16',
       ...
       'F2', 'F33', 'F38', 'F4', 'G6', 'T', 'C', 'EMBARKED-NULL', 'Q', 'S'],
      dtype='object', length=197)

In [336]:
# df[col]

In [434]:
# Convert the selected feature columns to NumPy arrays; any value that is still
# missing at this point is replaced by 0.
X_np = df[col].fillna(0).to_numpy()
y_np = df['Survived'].to_numpy()

# Same feature columns, taken from the unlabelled frame used for the submission.
X_submit = df_test[col].fillna(0).to_numpy()

In [435]:
# Hold out a fraction of the labelled rows for the final evaluation.
# `random_state` pins the shuffle so the split is identical on every run of this
# cell; without it the reported scores are computed on a different split each time.
X_train, X_test, y_train, y_test = train_test_split(
    X_np, y_np, test_size=VALIDATION_SPLIT, random_state=42
)

In [476]:
# Fully connected binary classifier: three tanh hidden layers of N_HIDDEN units
# with L2-regularised kernels, followed by a sigmoid output layer.
model = Sequential()
# The input width is read from the training matrix instead of being hard-coded:
# the number of indicator columns depends on the categories found in the CSVs.
model.add(Dense(N_HIDDEN, kernel_regularizer=REGULARIZER, input_shape=(X_train.shape[1],)))
model.add(Activation('tanh'))
model.add(Dense(N_HIDDEN, kernel_regularizer=REGULARIZER))
model.add(Activation('tanh'))
model.add(Dense(N_HIDDEN, kernel_regularizer=REGULARIZER))
model.add(Activation('tanh'))
model.add(Dense(NB_CLASSES))
model.add(Activation('sigmoid'))
model.summary()

# Compile with a fresh optimizer rebuilt from OPTIMIZER's configuration, so that
# re-running this cell never hands a new model an optimizer instance that already
# carries state from a previously trained model.
model.compile(
    loss='binary_crossentropy',
    optimizer=type(OPTIMIZER).from_config(OPTIMIZER.get_config()),
    metrics=['accuracy'],
)

Model: "sequential_31"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_114 (Dense)           (None, 128)               25344     
                                                                 
 activation_113 (Activation)  (None, 128)              0         
                                                                 
 dense_115 (Dense)           (None, 128)               16512     
                                                                 
 activation_114 (Activation)  (None, 128)              0         
                                                                 
 dense_116 (Dense)           (None, 128)               16512     
                                                                 
 activation_115 (Activation)  (None, 128)              0         
                                                                 
 dense_117 (Dense)           (None, 1)               

In [477]:
# TensorBoard log directory for this run, named after the current local timestamp.
log_dir = f"logs/{datetime.datetime.now():%Y%m%d-%H%M%S}"
# Running count of epochs trained so far; the training cell advances it.
epochs = 0

In [478]:
# Train for NB_EPOCH further epochs, logging to TensorBoard. Training starts from
# the running `epochs` counter, which is advanced afterwards so that running this
# cell again continues the epoch numbering instead of restarting at 0.
tensorboard_callback = tf.keras.callbacks.TensorBoard(histogram_freq=1, log_dir=log_dir)
model.fit(
    X_train,
    y_train,
    initial_epoch=epochs,
    epochs=epochs + NB_EPOCH,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    callbacks=[tensorboard_callback],
    verbose=VERBOSE,
)
epochs = epochs + NB_EPOCH

In [479]:
# Report loss (score[0]) and accuracy (score[1]) on the training rows ...
score = model.evaluate(X_train, y_train, verbose=VERBOSE)
print(f"Train score: {score[0]}")
print(f"Train accuracy: {score[1]}")

# ... and on the rows held out by train_test_split.
score = model.evaluate(X_test, y_test, verbose=VERBOSE)
print(f"Test score: {score[0]}")
print(f"Test accuracy: {score[1]}")

Train score: 0.3379055857658386
Train accuracy: 0.8838951587677002
Test score: 0.47415390610694885
Test accuracy: 0.8444444537162781


In [90]:
# Evaluate the model on the training rows and print loss / accuracy.
score = model.evaluate(x=X_train, y=y_train, verbose=VERBOSE)
print("Train score: " + str(score[0]))
print("Train accuracy: " + str(score[1]))

# Evaluate the model on the held-out rows and print loss / accuracy.
score = model.evaluate(x=X_test, y=y_test, verbose=VERBOSE)
print("Test score: " + str(score[0]))
print("Test accuracy: " + str(score[1]))

Train score: 0.47332677245140076
Train accuracy: 0.7994012236595154
Test score: 0.5273887515068054
Test accuracy: 0.7578475475311279


In [480]:
# Predict for the submission rows; rounding the sigmoid output yields 0.0 / 1.0.
survived = model.predict(X_submit, batch_size=BATCH_SIZE, verbose=VERBOSE).round()
submit = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    # Flatten to 1-D without hard-coding the number of test rows.
    'Survived': survived.reshape(-1),
})

In [481]:
# Cast the Survived column to integers; the other columns are left untouched.
submit = submit.astype({'Survived': 'int'})

In [482]:
# Create the target directory if needed (to_csv does not create it), then write
# the submission file without the index column.
os.makedirs("output", exist_ok=True)
submit.to_csv("output/titanic_submission.csv", index=False)