# Titanic: Machine Learning from Disaster

### Import Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from jupyterthemes import jtplot
import csv
# Apply the jupyterthemes plotting style to matplotlib figures.
jtplot.style()
# Render figures inline in the notebook output.
%matplotlib inline

# Seed NumPy's global random number generator.
np.random.seed(1)

## Exploratory Data Analysis and Data Cleaning

In [None]:
# Load the training set; test.csv is read later, in the evaluation section.
data = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the training data.
data.head()

In [None]:
# Count the missing (null) values in each column.
print(data.isnull().sum())

In [None]:
# Pie chart of the 'Survived' value counts, labelled with percentages.
f, ax = plt.subplots(1,figsize=(10,8))
data['Survived'].value_counts().plot.pie(autopct='%1.1f%%',ax=ax);
# Raw counts behind the chart (displayed as the cell output).
data['Survived'].value_counts()

In [None]:
# Bar chart of the passenger count for every (Sex, Survived) combination.
data.groupby(['Sex','Survived'])['Survived'].count().plot(kind='bar');

In [None]:
# Passenger class against survival; margins=True adds row and column totals.
pd.crosstab(data.Pclass, data.Survived, margins=True)

In [None]:
# (Sex, Survived) combinations against passenger class, with totals.
pd.crosstab([data.Sex, data.Survived], data.Pclass,margins=True)

In [None]:
# Maximum, minimum and mean of the Age column.
print('Oldest Passenger was of:',data['Age'].max(),'Years')
print('Youngest Passenger was of:',data['Age'].min(),'Years')
print('Average Age on the ship:',data['Age'].mean(),'Years')

In [None]:
# Pull the title out of each name: the run of letters directly before a '.'
# (e.g. 'Mr', 'Miss'). The pattern is a raw string because '\.' is not a valid
# escape sequence in a normal string literal and warns on recent Pythons.
data['Initial'] = data.Name.str.extract(r'([A-Za-z]+)\.', expand=True)
data.head()

In [None]:
# Number of passengers per extracted title.
data.groupby('Initial')['Name'].count()

In [None]:
# Collapse rare or foreign-language titles into the five groups used later
# (Mr, Mrs, Miss, Master, Other).
title_map = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}
# Assign the result back instead of using inplace=True on the column selection:
# with pandas Copy-on-Write the selection is a temporary object, so an in-place
# replace would leave `data` unchanged.
data['Initial'] = data['Initial'].replace(title_map)

In [None]:
# Mean age per title group; the basis for the per-title values used to fill
# missing ages.
data.groupby('Initial')['Age'].mean()

In [None]:
## Assigning the NaN Values with the Ceil values of the mean ages
# Each (title, age) pair fills the still-missing ages of passengers carrying
# that title.
for title, fill_age in (('Mr', 33), ('Mrs', 36), ('Master', 5), ('Miss', 22), ('Other', 46)):
    missing_for_title = data.Age.isnull() & (data.Initial == title)
    data.loc[missing_for_title, 'Age'] = fill_age

In [None]:
# Verify the imputation: displays False when no missing ages remain.
data.Age.isnull().any() #check for nan values in age

In [None]:
# Fill missing embarkation ports with 'S'. The result is assigned back rather
# than using inplace=True on the column selection, which does not update
# `data` under pandas Copy-on-Write.
data['Embarked'] = data['Embarked'].fillna('S')

In [None]:
# Bucket ages into five bands: 0 for <=16, then 16-year-wide bands 1..3 up to
# 64, and 4 for anything above 64. Rows whose age is still missing keep band 0.
data['Age_band'] = 0
age = data['Age']
data.loc[age <= 16, 'Age_band'] = 0
for band, lower in enumerate((16, 32, 48), start=1):
    data.loc[(age > lower) & (age <= lower + 16), 'Age_band'] = band
data.loc[age > 64, 'Age_band'] = 4

In [None]:
# Encode the categorical columns as small integers for the network.
# Results are assigned back instead of using inplace=True on a column
# selection, which does not update `data` under pandas Copy-on-Write.
data['Sex'] = data['Sex'].replace({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})
data['Initial'] = data['Initial'].replace({'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Other': 4})

In [None]:
# Number of training passengers in each age band.
data['Age_band'].value_counts().to_frame()

In [None]:
# Show the first two rows to check the engineered and encoded columns.
data.head(2)

## Predictive Modeling

In [None]:
# Hold out 30% of the labelled rows, stratified on the target so both parts
# keep the same survival ratio.
train, test = train_test_split(data, test_size=0.3, random_state=0, stratify=data['Survived'])

# The network works on column-per-example arrays: features are transposed to
# (n_features, m) and labels reshaped to a (1, m) row.
feature_columns = ['Pclass', 'Sex', 'Age_band', 'Embarked', 'Initial']
X_train = train[feature_columns].values.T.astype(float)
X_test = test[feature_columns].values.T.astype(float)

Y_train = train['Survived'].values.reshape(1, -1)
Y_test = test['Survived'].values.reshape(1, -1)


In [None]:
# Sanity-check the layouts: features are (5, m) and labels (1, m) per split.
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

### Deep Neural Network (DNN)

In [None]:
def Initialize_parameters_deep(layer_dims):
    """Create small random weights and zero biases for every layer.

    layer_dims lists the number of units per layer, input layer first.
    Returns a dict holding 'W<l>' with shape (layer_dims[l], layer_dims[l-1])
    and 'b<l>' with shape (layer_dims[l], 1) for l = 1 .. len(layer_dims) - 1.
    NumPy's global RNG is re-seeded with 3 on every call, so the same
    layer_dims always yields the same weights.
    """
    np.random.seed(3)
    parameters = {}
    # Walk consecutive (fan-in, fan-out) pairs; layers are numbered from 1.
    layer_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for idx, (n_in, n_out) in enumerate(layer_pairs, start=1):
        parameters['W' + str(idx)] = np.random.randn(n_out, n_in) * 0.01
        parameters['b' + str(idx)] = np.zeros((n_out, 1))
    return parameters

In [None]:
def sigmoid(Z):
    """Return the element-wise logistic sigmoid 1 / (1 + exp(-Z)).

    Evaluated as exp(-log(1 + exp(-Z))) through np.logaddexp, so large
    negative inputs no longer overflow inside exp (the direct formula emits
    an overflow RuntimeWarning there).
    """
    return np.exp(-np.logaddexp(0, np.negative(Z)))
def relu(Z):
    """Return the element-wise rectified linear unit, max(0, Z)."""
    rectified = np.maximum(0, Z)
    return rectified

In [None]:
def linear_activation_forward(A_prev, W, b, activation):
    """Run one layer forward: Z = W . A_prev + b, then the activation.

    A_prev: activations of the previous layer, one column per example.
    W, b: weight matrix and bias column of this layer.
    activation: 'sigmoid' or 'relu'.

    Returns (A, cache), where cache is ((A_prev, W, b), Z) for the backward
    pass.
    Raises ValueError for any other activation name (previously this
    surfaced as an UnboundLocalError on `A`).
    """
    if activation not in ('sigmoid', 'relu'):
        raise ValueError("Unknown activation: %r (expected 'sigmoid' or 'relu')" % (activation,))

    Z = np.dot(W, A_prev) + b
    linear_cache = (A_prev, W, b)

    if activation == 'sigmoid':
        A = sigmoid(Z)
    else:
        A = relu(Z)

    # Z itself is the activation cache: both backward helpers start from it.
    activation_cache = Z
    cache = (linear_cache, activation_cache)
    return A, cache

In [None]:
def forward_propogation(X, parameters):
    """Run the full forward pass: ReLU on every hidden layer, sigmoid on the last.

    X: input features, one column per example.
    parameters: dict with 'W<l>' / 'b<l>' entries for layers 1..L.

    Returns (AL, caches): the output-layer activations and the per-layer
    caches, in layer order, as produced by linear_activation_forward.
    """
    num_layers = len(parameters) // 2
    caches = []
    activation = X

    # Hidden layers 1 .. L-1.
    for layer in range(1, num_layers):
        suffix = str(layer)
        activation, layer_cache = linear_activation_forward(
            activation, parameters['W' + suffix], parameters['b' + suffix], 'relu')
        caches.append(layer_cache)

    # Output layer L.
    last = str(num_layers)
    AL, layer_cache = linear_activation_forward(
        activation, parameters['W' + last], parameters['b' + last], 'sigmoid')
    caches.append(layer_cache)
    return AL, caches


In [None]:
# NOTE: `*` on ndarrays is element-wise (identical to np.multiply); it is NOT
# the matrix product computed by np.dot.
def compute_cost(AL, Y):
    """Return the mean binary cross-entropy between predictions and labels.

    AL: predicted probabilities, shape (1, m).
    Y: 0/1 labels, shape (1, m).

    AL is clipped to [eps, 1 - eps] before the logarithms are taken, so a
    saturated prediction (exactly 0 or 1) gives a large finite cost instead
    of nan/inf from log(0).
    """
    m = Y.shape[1]
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)
    cost = -1 / m * np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL))
    cost = np.squeeze(cost)
    return cost


In [None]:
def sigmoid_backward(dA, activation_cache):
    """Backpropagate through a sigmoid unit.

    dA: gradient of the cost with respect to the sigmoid output.
    activation_cache: the pre-activation Z saved during the forward pass.

    Returns dZ = dA * s * (1 - s) with s = sigmoid(Z).
    """
    s = sigmoid(activation_cache)
    return dA * s * (1 - s)

In [None]:
def relu_backward(dA, activation_cache):
    """Backpropagate through a ReLU unit.

    dA: gradient of the cost with respect to the ReLU output.
    activation_cache: the pre-activation Z saved during the forward pass.

    Returns a copy of dA with entries zeroed wherever Z <= 0; dA itself is
    left untouched.
    """
    grad = np.array(dA, copy=True)
    inactive = activation_cache <= 0
    grad[inactive] = 0
    return grad

In [None]:
def linear_activation_backward(dA, cache, activation):
    """Backpropagate through one layer (activation followed by the linear step).

    dA: gradient of the cost with respect to this layer's activations.
    cache: ((A_prev, W, b), Z) as stored by linear_activation_forward.
    activation: 'sigmoid' or 'relu'.

    Returns (dA_prev, dW, db); dW and db are averaged over the m examples
    (the columns of A_prev).
    Raises ValueError for any other activation name (previously this
    surfaced as an UnboundLocalError on `dZ`).
    """
    if activation not in ('sigmoid', 'relu'):
        raise ValueError("Unknown activation: %r (expected 'sigmoid' or 'relu')" % (activation,))

    linear_cache, activation_cache = cache

    if activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        dZ = relu_backward(dA, activation_cache)

    A_prev, W, b = linear_cache
    m = A_prev.shape[1]
    dW = 1 / m * np.dot(dZ, A_prev.T)
    db = 1 / m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db


In [None]:
def backward_propogation(AL, Y, caches):
    """Compute the gradients of the cross-entropy cost for every layer.

    AL: output-layer probabilities from the forward pass, shape (1, m).
    Y: 0/1 labels; reshaped to AL's shape so a differently laid-out label
       array cannot silently broadcast against AL.
    caches: per-layer caches from forward_propogation, in layer order.

    Returns a dict with 'dW<l>' and 'db<l>' for every layer l, plus
    'dA<l>', the gradient with respect to the activations fed INTO layer l.
    """
    L = len(caches)
    grads = {}
    Y = np.asarray(Y).reshape(AL.shape)

    # Keep AL away from exactly 0 or 1 so the divisions below stay finite
    # (otherwise 0/0 produces nan gradients once the output saturates).
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)
    dAL = - np.divide(Y, AL_safe) + np.divide(1 - Y, 1 - AL_safe)

    # Output layer (sigmoid).
    grads['dA' + str(L)], grads['dW' + str(L)], grads['db' + str(L)] = linear_activation_backward(dAL, caches[L-1], 'sigmoid')

    # Hidden layers (ReLU), from the last hidden layer back to the first.
    for l in range(L-1, 0, -1):
        cache = caches[l-1]
        dA = grads['dA' + str(l+1)]
        dA_prev, dW, db = linear_activation_backward(dA, cache, 'relu')
        grads['dA' + str(l)] = dA_prev
        grads['dW' + str(l)] = dW
        grads['db' + str(l)] = db

    return grads

In [None]:
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every layer's weights and biases.

    parameters: dict of 'W<l>' / 'b<l>' arrays; the arrays are updated in
        place and the same dict is returned.
    grads: dict holding the matching 'dW<l>' / 'db<l>' gradients.
    learning_rate: step size multiplying each gradient.
    """
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1):
        suffix = str(layer)
        parameters['W' + suffix] -= learning_rate * grads['dW' + suffix]
        parameters['b' + suffix] -= learning_rate * grads['db' + suffix]
    return parameters



In [None]:
def the_model(X, Y, layers_dims, learning_rate, num_iterations, print_cost=True):
    """Train the network with batch gradient descent and return its parameters.

    X: input features, one column per example.
    Y: 0/1 labels in a (1, m) row.
    layers_dims: number of units per layer, input layer first.
    learning_rate: gradient-descent step size.
    num_iterations: index of the last iteration; the loop runs
        num_iterations + 1 times so the final index is logged as well.
    print_cost: when True, print the cost every 50000 iterations
        (this flag was previously ignored).

    Side effects: re-seeds NumPy's global RNG, shows the cost curve with
    matplotlib and saves the learned parameters to 'parameters.npy'.
    """
    print_every = 50000   # iterations between printed cost values
    record_every = 10000  # iterations between points on the cost curve

    np.random.seed(1)
    costs = []

    parameters = Initialize_parameters_deep(layers_dims)
    for i in range(num_iterations+1):
        AL, caches = forward_propogation(X, parameters)
        cost = compute_cost(AL, Y)
        grads = backward_propogation(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)
        if print_cost and i % print_every == 0:
            print('Cost at iteration %s is %s' %(i, cost))
        if i % record_every == 0:
            costs.append(cost)

    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    # One point is recorded every 10000 iterations, so label the axis to match.
    plt.xlabel('iterations (per ten thousands)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    np.save("parameters", parameters)
    return parameters


In [None]:
def predictAccuracy(X, Y, parameters, threshold=0.4):
    """Predict 0/1 labels for X, print the accuracy against Y and return them.

    X: input features, one column per example.
    Y: true 0/1 labels in a (1, m) row.
    parameters: trained 'W<l>' / 'b<l>' dict.
    threshold: a probability strictly above this value is predicted as 1
        (defaults to the 0.4 that was previously hard-coded).

    Returns the predictions with length-1 axes squeezed away.
    """
    m = X.shape[1]
    probas, _ = forward_propogation(X, parameters)

    # Vectorised thresholding of the single output row; kept as float so the
    # result matches the former zeros-initialised array.
    p = (probas[0:1] > threshold).astype(float)

    print("Accuracy: "  + str(np.sum((p == Y)) / m))

    return np.squeeze(p)

In [None]:
%%time
# Architecture: 5 input features -> one hidden layer of 10 units -> 1 output unit.
layers_dims = [5, 10, 1]
parameters = the_model(X_train, Y_train, layers_dims, learning_rate=0.001, num_iterations=200000, print_cost=True)

In [None]:
%%time
# Accuracy on the training split. Despite its name, `prob` holds the
# thresholded 0/1 predictions, not probabilities.
prob = predictAccuracy(X_train, Y_train, parameters)

In [None]:
%%time
# Accuracy on the 30% hold-out split (X_test/Y_test come from train.csv, not test.csv).
prob = predictAccuracy(X_test, Y_test, parameters)

In [None]:
# Keep a versioned copy of the trained parameters (np.save appends '.npy').
# The dict is stored as a pickled object array, so loading it back requires
# np.load(..., allow_pickle=True).
np.save("parameters-v1", parameters)

## Evaluation Time!

### Test Data cleaning

In [None]:
# Load the unlabeled evaluation set used for the submission.
test_data = pd.read_csv('test.csv')

In [None]:
# Count the missing (null) values in each column of the evaluation set.
test_data.isnull().sum()

In [None]:
# Pull the title out of each name: the run of letters directly before a '.'.
# The pattern is a raw string because '\.' is not a valid escape sequence in
# a normal string literal and warns on recent Pythons.
test_data['Initial'] = test_data.Name.str.extract(r'([A-Za-z]+)\.', expand=True)
test_data.head()

In [None]:
# Number of non-null ages per extracted title in the evaluation set.
test_data.groupby('Initial')['Age'].count()

In [None]:
# Collapse the rare titles found in the evaluation set into the same groups
# used for training. The result is assigned back rather than using
# inplace=True on the column selection, which does not update `test_data`
# under pandas Copy-on-Write.
test_title_map = {'Col': 'Other', 'Dona': 'Miss', 'Dr': 'Mr', 'Ms': 'Miss', 'Rev': 'Other'}
test_data['Initial'] = test_data['Initial'].replace(test_title_map)
test_data.groupby('Initial')['Age'].count()

In [None]:
# Mean age per title group in the evaluation set; the basis for the
# per-title values used to fill missing ages.
test_data.groupby('Initial')['Age'].mean()

In [None]:
## Assigning the NaN Values with the Ceil values of the mean ages
# Each (title, age) pair fills the still-missing ages of evaluation-set
# passengers carrying that title.
for title, fill_age in (('Mr', 33), ('Mrs', 39), ('Master', 7), ('Miss', 22), ('Other', 43)):
    missing_for_title = test_data.Age.isnull() & (test_data.Initial == title)
    test_data.loc[missing_for_title, 'Age'] = fill_age

In [None]:
# Bucket ages into the same five bands as the training data: 0 for <=16,
# 16-year-wide bands 1..3 up to 64, and 4 above 64. Rows whose age is still
# missing keep band 0.
test_data['Age_band'] = 0
test_age = test_data['Age']
test_data.loc[test_age <= 16, 'Age_band'] = 0
for band, lower in enumerate((16, 32, 48), start=1):
    test_data.loc[(test_age > lower) & (test_age <= lower + 16), 'Age_band'] = band
test_data.loc[test_age > 64, 'Age_band'] = 4

In [None]:
# Number of evaluation-set passengers in each age band. This previously
# inspected the training frame `data`, which says nothing about the bands
# just built on `test_data`.
test_data['Age_band'].value_counts().to_frame()

In [None]:
# Encode the categorical columns with the same integer codes used for the
# training data. Results are assigned back instead of using inplace=True on a
# column selection, which does not update `test_data` under pandas
# Copy-on-Write.
test_data['Sex'] = test_data['Sex'].replace({'male': 0, 'female': 1})
test_data['Embarked'] = test_data['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})
test_data['Initial'] = test_data['Initial'].replace({'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Other': 4})

### Run Model on Test data

In [None]:
# Build the evaluation feature matrix from the same five columns used for
# training, transposed to one column per passenger: shape (5, m).
X = test_data[['Pclass', 'Sex', 'Age_band', 'Embarked', 'Initial']].values
X = X.T.astype(float)
X.shape

In [None]:
def predict(X, parameters, threshold=0.4):
    """Return 0/1 predictions for every column (example) of X.

    X: input features, one column per example.
    parameters: trained 'W<l>' / 'b<l>' dict.
    threshold: a probability strictly above this value is predicted as 1
        (defaults to the 0.4 that was previously hard-coded).

    Returns the predictions with length-1 axes squeezed away.
    """
    probas, _ = forward_propogation(X, parameters)

    # Vectorised thresholding of the single output row; kept as float so the
    # result matches the former zeros-initialised array.
    p = (probas[0:1] > threshold).astype(float)

    return np.squeeze(p)

In [None]:
# 0/1 survival predictions for the evaluation features X built from test_data.
Y = predict(X, parameters)

#### Generate csv file for submission

In [None]:
# Write one 'PassengerId,Survived' row per evaluation passenger.
# newline='' is what the csv module requires of the file object; without it
# every row is followed by a blank line on Windows.
with open('submission-v1.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['PassengerId', 'Survived'])
    # Pair ids and predictions by position instead of indexing Y with the
    # DataFrame index label, which only works for a default 0..n-1 index.
    for passenger_id, survived in zip(test_data['PassengerId'], np.atleast_1d(Y)):
        writer.writerow([passenger_id, int(survived)])