In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as pt
import seaborn as sns
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit /kaggle/input and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing data

In [None]:
# Read the training CSV and discard the columns that are not used as features
# ('Unnamed: 0' is presumably a saved index column - verify against the file)
input_ads = (
    pd.read_csv('../input/titanic/train_data.csv')
    .drop(columns=['Unnamed: 0','Title_1','Title_2','Title_3','Title_4'])
)
#-----------------------------------------------------------------
# Print the (rows, columns) shape; the preview of the first rows is the cell output
print(input_ads.shape)
input_ads.head()

# Null Check

In [None]:
# Missing-value count per column, laid out as a single wide row
input_ads.isnull().sum().to_frame().T

# Description of the data

In [None]:
# Summary statistics of the training frame as reported by DataFrame.describe()
input_ads.describe()

# Description of target variable

In [None]:
#Number of training rows for each value of the 'Survived' column
#(survived vs not-survived split, i.e. the class balance of the target)
input_ads['Survived'].value_counts()

# Manipulation of data into train-test

In [None]:
target = 'Survived' #Name of the column to predict

#--------------------------------------------------------------------------------
#Splitting into X & Y datasets (supervised training)
#The target is removed by its exact column name; a substring test would also
#throw away any feature whose name merely contains 'Survived'
X = input_ads.drop(columns=[target])
y = input_ads[target]

#--------------------------------------------------------------------------------
#The test data is already placed in the input folder separately, so it is imported rather than split off
test_ads = pd.read_csv('../input/titanic/test_data.csv')
test_ads = test_ads.drop(columns=['Unnamed: 0','Title_1','Title_2','Title_3','Title_4']) #Same un-necessary columns as dropped from train

#Selecting the test features through the training column list guarantees both
#matrices share one column order (and fails loudly if a feature is missing),
#which matters because everything downstream works on positional numpy arrays
X_test = test_ads[list(X.columns)]
y_test = test_ads[target]

print('Train % of total data:',100 * X.shape[0]/(X.shape[0] + X_test.shape[0]))
#--------------------------------------------------------------------------------
#Converting to numpy arrays: features as (rows, features), targets as (rows, 1) column vectors
X_arr = np.array(X)
X_test_arr = np.array(X_test)

y_arr = np.array(y).reshape(X_arr.shape[0],1)
y_test_arr = np.array(y_test).reshape(X_test_arr.shape[0],1)

#--------------------------------------------------------------------------------
#Basic Summary
print(X_arr.shape)
print(X_test_arr.shape)
print(y_arr.shape)

# Standard scaling the x-data

In [None]:
from sklearn.preprocessing import StandardScaler

#----------------------------------------------------------
# The scaler is fitted on the training features only; the statistics learnt
# there are then applied to both the training and the test arrays
scaler = StandardScaler().fit(X_arr)
X_arr = scaler.transform(X_arr)
X_test_arr = scaler.transform(X_test_arr)

#----------------------------------------------------------
# Preview of the first three scaled training rows
X_arr[0:3]

# Artificial Neural Network (ANN) from Scratch

## UDFs for activation, initialization, layer_propagation

In [None]:
#All popular activation functions
def activation_fn(z,type_):
    """Apply the named activation function element-wise to the pre-activations.

    Parameters
    ----------
    z : array-like
        Pre-activation values. The rest of this notebook passes arrays laid
        out as (nodes, examples), so softmax normalises along axis 0, i.e.
        separately for every example (column).
    type_ : str
        One of 'linear', 'sigmoid', 'relu', 'tanh', 'leaky_relu', 'softmax'.

    Returns
    -------
    The activated values, same shape as ``z`` ('linear' returns ``z`` itself).

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported names.
    """
    if type_=='linear':
        return z

    z = np.asarray(z, dtype=float)

    if type_=='sigmoid':
        # exp is only ever taken of non-positive numbers, so a large |z|
        # cannot overflow; both branches equal 1/(1+exp(-z)) mathematically
        decay = np.exp(-np.abs(z))
        activated_arr = np.where(z >= 0, 1/(1+decay), decay/(1+decay))

    elif type_=='relu':
        activated_arr = np.maximum(0.0,z)

    elif type_=='tanh':
        # np.tanh saturates to +/-1 instead of producing inf/inf = nan
        activated_arr = np.tanh(z)

    elif type_=='leaky_relu':
        activated_arr = np.maximum(0.01*z,z)

    elif type_=='softmax':
        # Normalise per column (per example) rather than over the whole batch;
        # a 0-d input has no axis 0, so it is normalised as a whole
        axis = 0 if z.ndim > 0 else None
        # Subtracting the maximum leaves the result unchanged but keeps exp finite
        exp_ = np.exp(z - np.max(z, axis=axis, keepdims=True))
        activated_arr = exp_/np.sum(exp_, axis=axis, keepdims=True)

    else:
        raise ValueError('Unsupported activation type: ' + repr(type_))

    return activated_arr

#----------------------------------------------------------------------------------------------------------------------------
#Initialization of params
def generate_param_grid(a_prev,n_hidden,hidden_size_list,seed=None):
    """Create the initial weights and biases for every layer of the network.

    Parameters
    ----------
    a_prev : np.ndarray
        Input activations; only ``a_prev.shape[0]`` (the number of input
        features) is used.
    n_hidden : int
        Number of layers to create parameters for (layers 1..n_hidden).
    hidden_size_list : list of int
        Node count per layer, read at indices 1..n_hidden (index 0 is a
        placeholder and is never read).
    seed : int, optional
        When given, the weights are drawn from a private generator seeded with
        this value, which makes the initialisation repeatable. When omitted,
        NumPy's global random state is used.

    Returns
    -------
    dict
        ``'w<i>'`` of shape (nodes_i, nodes_{i-1}) and ``'b<i>'`` of shape
        (nodes_i, 1) for every layer i.
    """
    # RandomState exposes the same .rand API as the np.random module, so both
    # the seeded and the unseeded path draw the numbers the same way
    rng = np.random if seed is None else np.random.RandomState(seed)

    parameters = {}
    fan_in = a_prev.shape[0] #Total features feeding the first layer

    for n_hidden_idx in range(1,n_hidden+1):

        n_hidden_nodes = hidden_size_list[n_hidden_idx]

        # Small uniform weights in [0, 0.1). NOTE: this is a plain scaled
        # uniform draw, not Xavier/Glorot initialisation.
        parameters['w' + str(n_hidden_idx)] = rng.rand(n_hidden_nodes,fan_in) * 0.1
        parameters['b' + str(n_hidden_idx)] = np.zeros((n_hidden_nodes,1))

        fan_in = n_hidden_nodes #This layer's width is the next layer's input size

    return parameters
    
#---------------------------------------------------------------------------------------------------------------------------
#Propagation between z and activation
def layer_propagation(a_prev,w,b,activation):
    """Run a single layer: affine transform followed by its activation.

    Parameters
    ----------
    a_prev : activations coming from the previous layer
    w, b : weight matrix and bias of this layer
    activation : name of the activation, forwarded to ``activation_fn``

    Returns
    -------
    tuple
        ``(z, a)`` - the pre-activation ``w . a_prev + b`` and the activated output.
    """
    pre_activation = np.dot(w,a_prev) + b
    return pre_activation, activation_fn(z=pre_activation, type_=activation)

## UDF for forward propagation

In [None]:
def forward_propagation(params_dict,data_x,data_y,n_hidden,hidden_size_list,activation_list):
    """Push a batch through layers 1..n_hidden and record every intermediate value.

    ``data_x`` is transposed before use, so the layers see one column per
    example. ``data_y`` and ``hidden_size_list`` are accepted for signature
    symmetry with the other helpers but are not read here.

    Returns
    -------
    tuple
        ``(cache, a)`` where ``cache`` maps ``'a0'`` to the transposed input
        and ``'z<i>'`` / ``'a<i>'`` to each layer's pre-activation and output,
        and ``a`` is the output of the last layer.
    """
    inputs_t = data_x.T
    cache = {'a0' : inputs_t}
    a = inputs_t.copy()

    for layer_idx in range(1,n_hidden+1):
        tag = str(layer_idx)
        layer_activation = activation_list[layer_idx]

        z,a = layer_propagation(a_prev=a,
                                w=params_dict['w'+tag],
                                b=params_dict['b'+tag],
                                activation=layer_activation)

        cache['z'+tag] = z
        cache['a'+tag] = a

    return cache,a

## UDF for cost calculation, gradient calculation & back-propagation 

In [None]:
#Calculation of the total cost incurred by the model
def cost_calculation(activation_list,y_true,y_pred,eps=1e-12):
    """Average cost of a batch, chosen from the activation of the output layer.

    Parameters
    ----------
    activation_list : list
        Per-layer activation names; only the last entry is inspected.
        'sigmoid' selects binary cross-entropy, 'linear' selects mean squared error.
    y_true, y_pred : np.ndarray
        Targets and predictions with one column per example (the example
        count is taken from ``y_true.shape[1]``).
    eps : float, optional
        Predictions are clipped to [eps, 1-eps] before taking logarithms so a
        saturated sigmoid output (exactly 0 or 1) yields a large finite cost
        instead of inf/nan.

    Returns
    -------
    float
        The batch cost.

    Raises
    ------
    ValueError
        If the output activation has no cost defined here (e.g. softmax,
        which is not implemented yet).
    """
    output_activation = activation_list[-1]
    m = y_true.shape[1]

    if output_activation=='sigmoid':
        y_safe = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
        cost = (-1/m) * np.sum((y_true * np.log(y_safe)) + ((1-y_true) * np.log(1 - y_safe)))

    elif output_activation=='linear':
        cost = (1/m) * np.sum(np.square(y_true-y_pred))

    else:
        ##-------------------->> Softmax to be added <<----------------------
        raise ValueError('No cost function defined for output activation: ' + repr(output_activation))

    return cost

#Gradient of the activation functions wrt corresponding z
#--------------------------------------------------------------------------------------------
#Gradient for each activation type
def grad_fn_dz(activation,a):
    """Derivative of an activation with respect to its pre-activation z.

    The derivative is expressed through the activation OUTPUT ``a`` (the value
    stored in the forward cache), not through z itself.

    Parameters
    ----------
    activation : str
        One of 'linear', 'sigmoid', 'tanh', 'relu', 'leaky_relu'.
    a : np.ndarray
        Output of the activation for the layer.

    Returns
    -------
    np.ndarray
        Element-wise derivative, same shape as ``a``.

    Raises
    ------
    ValueError
        If no derivative is defined for ``activation`` (softmax is not
        implemented yet).
    """
    if activation=='linear':
        grad = np.ones_like(a, dtype=float)

    elif activation=='sigmoid':
        grad = a*(1-a)

    elif activation=='tanh':
        # d/dz tanh(z) = 1 - tanh(z)^2
        grad = 1 - np.square(a)

    elif activation=='relu':
        # The relu output is never negative, so the test must be strict:
        # a == 0 marks the inactive (z <= 0) units whose gradient is 0
        grad = np.where(a>0,1,0)

    elif activation=='leaky_relu':
        # The output keeps the sign of z here, so a < 0 identifies the leaky side
        grad = np.where(a>=0,1,0.01)

    else:
        ##-------------------->> Softmax to be added <<----------------------
        raise ValueError('No gradient defined for activation: ' + repr(activation))

    return grad
        
#--------------------------------------------------------------------------------------------
#UDF for gradient of loss function wrt last layer
def dL_last_layer(activation_list,y_true,y_pred,eps=1e-12):
    """Gradient of the per-example loss with respect to the output-layer activation.

    Parameters
    ----------
    activation_list : list
        Per-layer activation names; only the last entry is inspected.
        'sigmoid' pairs with binary cross-entropy, 'linear' with squared error.
    y_true, y_pred : np.ndarray
        Targets and predictions of matching shape.
    eps : float, optional
        For the sigmoid case predictions are clipped to [eps, 1-eps] so that
        a saturated output (exactly 0 or 1) does not divide by zero.

    Returns
    -------
    np.ndarray
        dL/da for the last layer, same shape as ``y_pred``.

    Raises
    ------
    ValueError
        If no loss gradient is defined for the output activation.
    """
    output_activation = activation_list[-1]

    if output_activation=='sigmoid':
        # d/da of -[y*log(a) + (1-y)*log(1-a)]
        y_safe = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
        grad_final_layer = -((y_true/y_safe) - ((1-y_true)/(1-y_safe)))

    elif output_activation=='linear':
        # d/da of (y - a)^2
        grad_final_layer = - 2 * (y_true-y_pred)

    else:
        raise ValueError('No loss gradient defined for output activation: ' + repr(output_activation))

    return grad_final_layer

#--------------------------------------------------------------------------------------------
#Back-Propagation
def back_propagation(cache,params_dict,data_x,data_y,n_hidden,hidden_size_list,activation_list,y_pred):
    """Compute the weight and bias gradients of every layer for one batch.

    Starting from the loss gradient at the output (``dL_last_layer``), the
    layers are visited from last to first; each step turns the gradient with
    respect to the layer output into gradients for that layer's weights and
    bias and into the gradient handed to the layer below. ``data_x`` and
    ``hidden_size_list`` are accepted but not read.

    Returns
    -------
    dict
        ``'dw<i>'`` and ``'db<i>'`` for every layer, each scaled by
        ``1 / data_y.shape[0]``.
    """
    grads_cache = {}

    # Gradient of the loss wrt the network output; targets are transposed to
    # match the one-column-per-example layout of y_pred
    upstream = dL_last_layer(activation_list=activation_list,
                             y_true=data_y.T,
                             y_pred=y_pred)

    m = data_y.shape[0] #Rows in this batch

    for layer_idx in range(n_hidden,0,-1):
        tag = str(layer_idx)

        layer_activation = activation_list[layer_idx]
        layer_out = cache['a'+tag]
        layer_in = cache['a'+str(layer_idx-1)]
        weights = params_dict['w'+tag]

        # Chain rule through the activation: dL/dz = dL/da * da/dz
        delta = upstream * (grad_fn_dz(activation=layer_activation,a=layer_out))

        grads_cache['dw'+tag] = (1/m) * np.dot(delta, layer_in.T)
        grads_cache['db'+tag] = (1/m) * np.sum(delta, axis=1,keepdims=True)

        # Gradient wrt the previous layer's output, consumed by the next iteration
        upstream = np.dot(weights.T,delta)

    return grads_cache

## UDF for updating weights through gradient descent

In [None]:
def update_weights(params,grads_cache,alpha,n_hidden):
    """Take one gradient-descent step on every layer's weights and biases.

    The arrays stored in ``params`` are modified in place (``-=``) and the
    same dictionary is returned.

    Parameters
    ----------
    params : dict holding ``'w<i>'`` / ``'b<i>'`` arrays
    grads_cache : dict holding the matching ``'dw<i>'`` / ``'db<i>'`` gradients
    alpha : learning rate multiplying each gradient
    n_hidden : number of layers to update (layers n_hidden down to 1)
    """
    for layer_no in range(n_hidden,0,-1):
        tag = str(layer_no)

        grad_w = grads_cache['dw'+tag]
        grad_b = grads_cache['db'+tag]

        params['w'+tag] -= alpha * grad_w
        params['b'+tag] -= alpha * grad_b

    return params

## UDF for predictions

In [None]:
def prediction(params,test_x,n_hidden,hidden_size_list,activation_list,threshold):
    """Score new data with trained parameters and threshold the outputs.

    A forward pass is run on ``test_x`` (no targets are needed, so ``None`` is
    passed for them) and the network outputs are compared with ``threshold``.

    Returns
    -------
    tuple
        ``(cache, scores, labels)``: the forward-pass cache, the raw outputs
        rounded to 4 decimals, and a float array holding 1.0 where the output
        is strictly above ``threshold`` and 0.0 elsewhere.
    """
    #-----------------------------------------------------------------
    #Forward Propagation on trained weights
    forward_args = dict(params_dict=params,
                        data_x=test_x,
                        data_y=None,
                        n_hidden=n_hidden,
                        hidden_size_list=hidden_size_list,
                        activation_list=activation_list)
    cache,y_pred = forward_propagation(**forward_args)

    hard_labels = np.where(y_pred>threshold,1.0,0.0)
    return cache,np.round(y_pred,4),hard_labels

## Mini-batch Stochastic Gradient Descent (SGD) for training the ANN

In [None]:
def ANN_train_sgd(data_x_overall,data_y_overall,batch_size,alpha,n_iters,n_hidden,hidden_size_list,activation_list):
    """Train the from-scratch ANN with mini-batch gradient descent.

    The rows are split once, in their given order, into
    ``max(1, rows // batch_size)`` roughly equal batches; every epoch walks
    those same batches and updates the weights after each one.

    Parameters
    ----------
    data_x_overall : np.ndarray
        Feature matrix with one row per example.
    data_y_overall : np.ndarray
        Targets with one row per example (the notebook passes a column vector).
    batch_size : int
        Approximate number of rows per batch; must be at least 1.
    alpha : float
        Learning rate handed to ``update_weights``.
    n_iters : int
        Number of epochs.
    n_hidden : int
        Number of weight layers (hidden layers plus the output layer).
    hidden_size_list, activation_list : list
        Per-layer node counts / activation names, read from index 1 onwards
        (index 0 is a placeholder).

    Returns
    -------
    tuple
        ``(initial_params, trained_params, last_batch_grads, cost_history,
        last_batch_predictions, cache_tray)``. ``cost_history`` holds one
        value per epoch: the cost of that epoch's final batch. ``cache_tray``
        is always an empty list and is kept for interface compatibility.

    Raises
    ------
    ValueError
        If ``batch_size`` is below 1 or there are no training rows.
    """
    n_rows = data_x_overall.shape[0]
    print('Total training rows :',n_rows)

    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got ' + repr(batch_size))
    if n_rows == 0:
        raise ValueError('data_x_overall contains no rows to train on')

    #----------------------------------------------------------------------------------------
    #Creating x-y batches according to the provided batch_size
    #(at least one batch, so a batch_size larger than the data still trains on all rows)
    n_batches = max(1, n_rows//batch_size)
    print('Total Batches to create in each epoch/iter :',n_batches)

    batches_x = np.array_split(data_x_overall,n_batches)
    print('Total Batches of X:',len(batches_x))

    batches_y = np.array_split(data_y_overall,n_batches)
    print('Total Batches of y:',len(batches_y))
    #-------------------------------------------------------------------------------------------
    cost_history = [] #Record of cost through epochs

    #-------------------------------------------------------------------------------------------
    #Initialization of params
    params_dict = generate_param_grid(a_prev=data_x_overall.T,
                             n_hidden=n_hidden,
                             hidden_size_list=hidden_size_list)
    print('#----------------- Initial params ------------------#')
    print(params_dict)
    # The weights are updated in place, so a shallow dict copy would end up
    # holding the trained values; every array is copied to keep a true snapshot
    initial_params_abcd = {name: values.copy() for name, values in params_dict.items()}

    #-------------------------------------------------------------------------------------------
    cache_tray = []

    # Defined up-front so that n_iters == 0 returns cleanly instead of hitting unbound names
    cost = None
    y_pred = None
    grads_cache_ = {}

    for epoch in range(n_iters):

        if epoch > 0 and epoch%100 == 0:
            print('#----------------------------------- Epoch :',epoch,'--------------------------------------#')
            print('cost :',cost) #Cost of the last batch of the previous epoch

        for data_x, data_y in zip(batches_x, batches_y): #For each batch created for each epoch/iter

            #-------------------------------------------------------------------------
            #Forward Propagation
            cache,y_pred = forward_propagation(params_dict=params_dict,
                                          data_x=data_x,
                                          data_y=data_y,
                                          n_hidden=n_hidden,
                                          hidden_size_list=hidden_size_list,
                                          activation_list=activation_list)

            #-------------------------------------------------------------------------
            #Cost calculation
            cost = cost_calculation(activation_list=activation_list,
                             y_true=data_y.T,
                             y_pred=y_pred)

            #-------------------------------------------------------------------------
            #Back Propagation
            grads_cache_ = back_propagation(cache=cache,
                                           params_dict=params_dict,
                                           data_x=data_x,
                                           data_y=data_y,
                                           n_hidden=n_hidden,
                                           hidden_size_list=hidden_size_list,
                                           activation_list=activation_list,
                                           y_pred=y_pred)

            #------------------------------------------------------------------------
            #Updating weights
            params_dict = update_weights(params=params_dict,
                                         grads_cache=grads_cache_,
                                         alpha=alpha,
                                         n_hidden=n_hidden)

        cost_history.append(cost) #Appending cost after each epoch


    return initial_params_abcd,params_dict,grads_cache_,cost_history,y_pred,cache_tray


# Training the model by invoking the above UDF

In [None]:
#Defining hyper-parameters for ANN
#--------------------------------------------------------------------------------------------------------------------------
n_hidden = 2       #No of weight layers: one hidden layer plus the output layer
alpha = 0.003      #Learning_rate
n_iters = 501      #Total epochs
hidden_size_list = [0,3,1]               #Nodes per layer; first element is a placeholder (0) and not counted as a layer
activation_list = [0,'relu','sigmoid']   #Activation per layer; first element is a placeholder (0) and not counted as a layer
batch_size = 25    #Approximate rows per batch for batch wise gradient descent

#--------------------------------------------------------------------------------------------------------------------------
#The initial weights are drawn from NumPy's global random generator, so it is
#seeded here to make the training run (and every number below) repeatable
np.random.seed(42)

#--------------------------------------------------------------------------------------------------------------------------
initial_params_train,params_dict_train,grads,cost_history_train,y_pred_train,cache_tray = ANN_train_sgd(
    data_x_overall=X_arr,
    data_y_overall=y_arr,
    batch_size=batch_size,
    alpha=alpha,
    n_iters=n_iters,
    n_hidden=n_hidden,
    hidden_size_list=hidden_size_list,
    activation_list=activation_list,
)

# Cost-Epoch plot for the manual ANN training 

In [None]:
# One cost value is stored per epoch (the cost of that epoch's last batch),
# so the x-axis is simply the epoch index
ax = sns.lineplot(
    x=list(range(n_iters)),
    y=cost_history_train,
)
ax.set(
    xlabel='epochs',
    ylabel='cost',
    title='Cost vs epoch plot for Manual ANN',
)

# Predict on the test data

In [None]:
#Scoring the test set with the trained weights; labels are cut at a 0.5 threshold
cache,preds_proba,manual_preds = prediction(params=params_dict_train,
                                            test_x=X_test_arr,
                                            n_hidden=n_hidden,
                                            hidden_size_list=hidden_size_list,
                                            activation_list=activation_list,
                                            threshold=0.5)

#-------------------------------------------------------------------------------------------
print('Shape of prediction array :',preds_proba.shape)
print('Unique predictions :',np.unique(manual_preds))
print('Unique of predict proba :',np.unique(preds_proba),'\n')

print('#--------------------- Evaluation ----------------------#')
#Evaluation of the predictions
print('ROC AUC of test set :',roc_auc_score(y_test_arr.ravel(),manual_preds.ravel()))
#ROC AUC is a ranking metric: computed on hard 0/1 labels it reflects a single
#operating point only, so the score-based value is reported as well
print('ROC AUC of test set (probability scores) :',roc_auc_score(y_test_arr.ravel(),preds_proba.ravel()))
print('Accuracy of test set :',accuracy_score(y_test_arr.ravel(),manual_preds.ravel()))

# Benchmarking with Keras functional API

## Importing necessary libraries

In [None]:
import tensorflow as tf
import keras
import tensorflow.keras.models
import tensorflow.keras.layers as tfl
from tensorflow.keras import Input
from tensorflow.keras import Model
from sklearn.preprocessing import StandardScaler
from keras.layers import BatchNormalization

## Defining the model with the same specifications as the manual ANN

In [None]:
def ANN_keras(x):
    """Build and compile the Keras counterpart of the manual network.

    Architecture: an input as wide as ``x.shape[1]``, a 3-node relu layer and
    a single sigmoid output. The model is compiled with binary cross-entropy
    and an SGD optimizer whose learning rate is the notebook-level ``alpha``.

    Parameters
    ----------
    x : array with the features in its second dimension; only ``x.shape[1]`` is read.

    Returns
    -------
    The compiled Keras model.
    """
    n_features = x.shape[1]
    inputs = tfl.Input(shape=(n_features,))

    hidden = tfl.Dense(3,activation='relu', name = 'Dense_3')(inputs) #Layer 1
    outputs = tfl.Dense(1, activation="sigmoid", name="pred")(hidden) #Output layer

    network = Model(inputs, outputs, name="ANN_keras")

    #Stochastic Gradient Descent with specified alpha
    sgd = tf.keras.optimizers.SGD(learning_rate=alpha)
    network.compile(loss="binary_crossentropy", optimizer=sgd)

    return network
    
# Build the Keras model; its input width is taken from the training feature matrix
model = ANN_keras(X_arr)
# Show the layer-by-layer summary of the model
model.summary()

## Training the model

In [None]:
# Same epoch count and batch size as the manual run; the test split is passed
# as validation data for the fit
history = model.fit(
    X_arr,
    y_arr,
    batch_size=batch_size,
    epochs=n_iters,
    verbose=1,
    validation_data=(X_test_arr, y_test_arr),
)

## Predicting through keras model

In [None]:
#Raw sigmoid outputs are kept under their own name instead of being overwritten by the labels
keras_proba = model.predict(X_test_arr)
keras_pred = np.where(keras_proba>0.5,1,0) #Hard labels at the same 0.5 threshold as the manual ANN

print('#--------------------- Evaluation ----------------------#')
#Evaluation of the predictions
print('ROC AUC of test set :',roc_auc_score(y_test_arr.ravel(),keras_pred.ravel()))
#ROC AUC is a ranking metric: computed on hard 0/1 labels it reflects a single
#operating point only, so the score-based value is reported as well
print('ROC AUC of test set (probability scores) :',roc_auc_score(y_test_arr.ravel(),keras_proba.ravel()))
print('Accuracy of test set :',accuracy_score(y_test_arr.ravel(),keras_pred.ravel()))

# Insights: The manual ANN implementation gives predictions very similar to those of its Keras counterpart, which suggests that the implementation is correct and comparable.

# END