# Titanic: Machine Learning from Disaster

#### Import Dependencies

In [None]:
import tensorflow as tf
# Build a TensorFlow session configuration that restricts GPU memory use.
config = tf.ConfigProto()
# Disabled alternative: let the GPU allocation grow on demand instead.
# config.gpu_options.allow_growth = True
# Cap the per-process GPU memory fraction at 0.3.
config.gpu_options.per_process_gpu_memory_fraction = 0.3
# NOTE(review): the session is created but neither kept nor registered with
# Keras (e.g. keras.backend.set_session) -- confirm the cap really applies to
# the session Keras trains in.
tf.Session(config=config)

import keras
from keras.models import *
from keras.layers import *
from keras import optimizers
from keras.applications.resnet50 import ResNet50
from keras.applications.vgg16 import VGG16
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.backend import tf as ktf
from keras.callbacks import EarlyStopping
from tqdm import tqdm

import numpy as np
import pandas as pd
from IPython.display import clear_output

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
import csv
import os
from PIL import Image
%matplotlib inline
from jupyterthemes import jtplot

# Apply the jupyterthemes plot style to the matplotlib figures drawn below.
jtplot.style()

# jtplot.style()
%matplotlib inline

# Seed NumPy's global random generator.
# NOTE(review): TensorFlow/Keras is not seeded here, so weight initialisation
# and dropout are presumably still non-deterministic -- confirm if exact
# reproducibility is expected.
np.random.seed(1)


## Exploratory Data Analysis and Data Cleaning

In [None]:
# Load the training set from the working directory.
data = pd.read_csv('train.csv')
# The test set is read later, right before inference.
# test_data = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the raw training data.
data.head()

In [None]:
# Rescale fares so that the most expensive ticket in the training set is 1.
fare_max = data['Fare'].max()
data['Fare'] = data['Fare'] / fare_max
# Unused alternative kept for reference: bucket the raw fare into an ordinal
# 'Fare_cat' column (0-3) with bin edges at 7.91, 14.454, 31 and 513.

In [None]:
# Pull the honorific out of each name: the run of ASCII letters that sits
# directly before a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that `\.` is a regex escape rather than an
# invalid Python string escape, and expand=False yields a Series (one capture
# group) instead of a one-column DataFrame.
data['Initial'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
data.head()

In [None]:
# Count how many passengers carry each extracted title.
data.groupby('Initial')['Name'].count()

In [None]:
# Fold rare or variant titles into the groups used for imputation and
# encoding below (Mr, Mrs, Miss, Other; Master is left untouched).
title_groups = {
    'Mlle': 'Miss',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Countess': 'Mrs',
    'Jonkheer': 'Other',
    'Col': 'Other',
    'Rev': 'Other',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
}
# Assign the result back rather than calling replace(..., inplace=True) on
# data['Initial']: that form is chained assignment and does not update the
# frame when pandas Copy-on-Write is active.
data['Initial'] = data['Initial'].replace(title_groups)

In [None]:
# Mean age per title group; missing ages are ignored by mean().
data.groupby('Initial')['Age'].mean()

In [None]:
# Fill each missing age with a fixed value chosen per title group (per the
# original note: the rounded-up mean age of that group).
fill_age_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}
for title, fill_age in fill_age_by_title.items():
    needs_age = data['Age'].isnull() & (data['Initial'] == title)
    data.loc[needs_age, 'Age'] = fill_age

In [None]:
# Report, for every column, whether any missing value remains; used here to
# confirm that the Age imputation left no NaN behind.
data.isnull().any()

In [None]:
# Default missing embarkation ports to 'S'. The result is assigned back
# instead of using fillna(..., inplace=True) on data['Embarked'], which is
# chained assignment and leaves the frame unchanged under pandas
# Copy-on-Write.
data['Embarked'] = data['Embarked'].fillna('S')

In [None]:
# Rescale ages so that the oldest passenger in the training set is 1.
age_max = data['Age'].max()
data['Age'] = data['Age'] / age_max
# Unused alternative kept for reference: an ordinal 'Age_band' column (0-4)
# with bin edges at 16, 32, 48 and 64 years.

In [None]:
# Family size = parents/children + siblings/spouses + the passenger.
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1

# Map the categorical columns to integer codes. Each result is assigned back
# rather than using replace(..., inplace=True) on a column selection, which is
# chained assignment and does not update the frame under pandas
# Copy-on-Write.
data['Sex'] = data['Sex'].replace({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})
data['Initial'] = data['Initial'].replace(
    {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Other': 4}
)

In [None]:
# Bring the remaining engineered features onto a 0-1 scale.
largest_family = data['Family_Size'].max()
data['Family_Size'] = data['Family_Size'] / largest_family
# Embarked codes run 0-2 and Initial codes 0-4, so divide by the top code.
data['Embarked'] = data['Embarked'] / 2
data['Initial'] = data['Initial'] / 4


In [None]:
# Spot-check the first three rows after encoding and scaling.
data.head(3)

### Preparing train and test set

In [None]:
# No explicit hold-out split is made here (a stratified train_test_split was
# tried and disabled); validation rows are carved out by model.fit instead.
feature_columns = ['Pclass', 'Sex', 'Age', 'Embarked', 'Initial', 'Fare', 'Family_Size']

# Feature matrix as floats, one row per passenger.
X_train = data[feature_columns].values.astype(float)

# Binary target taken straight from the 'Survived' column.
Y_train = data['Survived'].values

print(X_train.shape, Y_train.shape)

In [None]:
def plot_training(history):
    """Plot training and validation accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``). It must
            contain ``loss``/``val_loss`` and an accuracy pair named either
            ``acc``/``val_acc`` or ``accuracy``/``val_accuracy``.

    Raises:
        KeyError: If a required metric is missing from ``history.history``.
    """
    metrics = history.history
    # Older Keras releases record accuracy under 'acc'; newer ones use the
    # full metric name 'accuracy'. Accept whichever is present.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    loss = metrics['loss']
    val_loss = metrics['val_loss']
    epochs = range(len(acc))

    plt.figure(figsize=(10, 5))

    # Left panel: accuracy.
    plt.subplot(1, 2, 1)
    plt.plot(epochs, acc)
    plt.plot(epochs, val_acc)
    plt.legend(['train', 'val'], loc='upper left')
    plt.title(' accuracy')

    # Right panel: loss.
    plt.subplot(1, 2, 2)
    plt.plot(epochs, loss)
    plt.plot(epochs, val_loss)
    plt.legend(['train', 'val'], loc='upper left')
    plt.title('loss')
    plt.show()

In [None]:
def myModel(input_shape):
    """Build and compile a fully connected binary classifier.

    The network stacks two identical hidden blocks (Dense 2048 -> ReLU ->
    Dropout 0.5) followed by a single sigmoid output unit. A third,
    1024-unit block was tried in the original and is left out.

    Args:
        input_shape: Shape of one input sample, passed straight to ``Input``.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss, the
        'sgd' optimizer and the accuracy metric.
    """
    features = Input(input_shape)

    hidden = features
    for _ in range(2):
        hidden = Dense(2048)(hidden)
        hidden = Activation('relu')(hidden)
        hidden = Dropout(0.5)(hidden)

    survival_prob = Dense(1, activation='sigmoid')(hidden)

    net = Model(inputs=features, outputs=survival_prob)
    net.compile(
        loss='binary_crossentropy',
        optimizer='sgd',
        metrics=['accuracy'],
    )
    return net

In [None]:
# Build the network for one sample's shape (X_train without the row axis).
model = myModel(X_train.shape[1:])

In [None]:
# Stop once validation accuracy has gone two epochs without improving.
stop_early = EarlyStopping(monitor='val_acc', patience=2, verbose=1)

# Train for at most 10 epochs in mini-batches of 10, holding out 10% of the
# training rows for validation.
history = model.fit(
    X_train,
    Y_train,
    epochs=10,
    batch_size=10,
    verbose=1,
    validation_split=0.1,
    callbacks=[stop_early],
)

plot_training(history)

In [None]:
# Load the test set from the working directory.
test_data = pd.read_csv('test.csv')

In [None]:
# Count the missing values in each column of the test set.
test_data.isnull().sum()

In [None]:
# Pull the honorific out of each name: the run of ASCII letters that sits
# directly before a period. The pattern is a raw string so that `\.` is a
# regex escape rather than an invalid Python string escape, and expand=False
# yields a Series (one capture group) instead of a one-column DataFrame.
test_data['Initial'] = test_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
test_data.head()

In [None]:
# Fold the rare titles handled for the test set into the common groups.
# The result is assigned back rather than using replace(..., inplace=True) on
# test_data['Initial'], which is chained assignment and does not update the
# frame under pandas Copy-on-Write.
test_title_groups = {
    'Col': 'Other',
    'Dona': 'Miss',
    'Dr': 'Mr',
    'Ms': 'Miss',
    'Rev': 'Other',
}
test_data['Initial'] = test_data['Initial'].replace(test_title_groups)
# Number of passengers with a known age in each title group.
test_data.groupby('Initial')['Age'].count()

In [None]:
# Fill each missing age with a fixed value chosen per title group (per the
# original note: the rounded-up mean age of that group).
test_fill_age_by_title = {'Mr': 33, 'Mrs': 39, 'Master': 7, 'Miss': 22, 'Other': 43}
for title, fill_age in test_fill_age_by_title.items():
    needs_age = test_data['Age'].isnull() & (test_data['Initial'] == title)
    test_data.loc[needs_age, 'Age'] = fill_age

In [None]:
# Rescale ages so that the oldest passenger in the test set is 1.
# NOTE(review): this divides by the test-set maximum, while the training ages
# were divided by the training-set maximum -- the two scales only agree if
# both maxima are equal; confirm this is intended.
test_age_max = test_data['Age'].max()
test_data['Age'] = test_data['Age'] / test_age_max

# Unused alternative kept for reference: an ordinal 'Age_band' column (0-4)
# with bin edges at 16, 32, 48 and 64 years.

In [None]:
# Impute missing fares with the median fare before scaling. Without this a
# NaN fare flows into X_test, the network outputs NaN for that passenger, and
# the later `> 0.5` threshold silently labels them as not survived.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())
# Rescale so that the most expensive ticket in the test set is 1.
test_data['Fare'] = test_data['Fare'] / test_data['Fare'].max()
# Unused alternative kept for reference: bucket the raw fare into an ordinal
# 'Fare_cat' column (0-3) with bin edges at 7.91, 14.454, 31 and 513.

In [None]:
# Family size = parents/children + siblings/spouses + the passenger.
# It must be computed from test_data's own columns: using the training frame
# here index-aligns unrelated training passengers' counts onto the test rows.
test_data['Family_Size'] = test_data['Parch'] + test_data['SibSp'] + 1

# Map the categorical columns to integer codes. Each result is assigned back
# rather than using replace(..., inplace=True) on a column selection, which is
# chained assignment and does not update the frame under pandas
# Copy-on-Write.
test_data['Sex'] = test_data['Sex'].replace({'male': 0, 'female': 1})
test_data['Embarked'] = test_data['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})
test_data['Initial'] = test_data['Initial'].replace(
    {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Other': 4}
)

In [None]:
# Bring the remaining engineered features onto a 0-1 scale.
largest_test_family = test_data['Family_Size'].max()
test_data['Family_Size'] = test_data['Family_Size'] / largest_test_family
# Embarked codes run 0-2 and Initial codes 0-4, so divide by the top code.
test_data['Embarked'] = test_data['Embarked'] / 2
test_data['Initial'] = test_data['Initial'] / 4


In [None]:
# Assemble the test feature matrix with the same column order as X_train.
# Cast to float, as is done for X_train, so that an object-dtype column
# (e.g. from the string-to-code replacement) cannot reach model.predict.
X_test = test_data[
    ['Pclass', 'Sex', 'Age', 'Embarked', 'Initial', 'Fare', 'Family_Size']
].values.astype(float)
X_test.shape

In [None]:
# Run the trained network on the test features; the sigmoid output is
# thresholded into class labels when the submission is written.
preds = model.predict(X_test)

In [None]:
# Turn the predicted probabilities into 0/1 labels (1 when strictly above
# 0.5) and write them next to the passenger ids as the submission file.
survived = np.where(np.ravel(preds) > 0.5, 1, 0)
Y = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': survived,
})
Y.to_csv('subs.csv', index=False)