In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# First, I'll read the data and split into training and test sets

In [None]:
#Read Raw data
import pandas as pd
# Load only the columns at positions 3..13 of the CSV; the three leading columns are skipped
churn_csv = '/kaggle/input/churn-modelling/Churn_Modelling.csv'
df = pd.read_csv(churn_csv, usecols=range(3, 14))
df.head()

In [None]:
#Process data

#Replace countries with numbers (integer codes follow the order of first appearance).
#.map is used instead of .replace so the result is a numeric column directly, without
#relying on pandas silently downcasting an object column.
countries = df["Geography"].unique()
countries_map = {country: code for code, country in enumerate(countries)}
df["Geography"] = df["Geography"].map(countries_map)

#Replace gender with numbers
gender = df["Gender"].unique()
gender_map = {label: code for code, label in enumerate(gender)}
df["Gender"] = df["Gender"].map(gender_map)

#Split into test and train: the first 30% of rows are held out for testing and every
#remaining row is used for training (so no row is dropped by integer truncation).
n_test = int(df.shape[0] * 0.3)
test = df.iloc[:n_test].copy()
train = df.iloc[n_test:].copy()

#Standardise every feature column to zero mean / unit variance using statistics from
#the training split only. Without this, large-valued raw columns drive the sigmoid
#units into saturation, where gradients vanish and the log-loss degenerates.
#The 'Exited' target is left untouched.
feature_cols = [col for col in df.columns if col != "Exited"]
feature_mean = train[feature_cols].mean()
feature_std = train[feature_cols].std().replace(0, 1) #Guard against constant columns
train[feature_cols] = (train[feature_cols] - feature_mean) / feature_std
test[feature_cols] = (test[feature_cols] - feature_mean) / feature_std

In [None]:
#Checking correlations to see if there are any especially strong features.
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()
# Pairwise correlations between the columns of the test split, drawn as a heatmap
corr_matrix = test.corr()
ax = sns.heatmap(corr_matrix)

In the heatmap, no feature is strongly correlated with `Exited` (whether a customer churns). All features are therefore retained for now.

# Setting Up a Neural Network

In [None]:
#Network architecture: layer sizes from the input layer to the single output unit
layers = [10,100,25,1]

#Fix the seed so a fresh "Restart & Run All" reproduces the same initial parameters
np.random.seed(42)

#One weight matrix per pair of consecutive layers, shaped (units_out, units_in).
#Entries are scaled by 1/sqrt(units_in) so that, for inputs of order one, the initial
#weighted inputs also stay of order one instead of saturating the sigmoid units.
weights = [np.random.randn(n_out,n_in) / np.sqrt(n_in) for n_in,n_out in zip(layers[:-1],layers[1:])]
#One bias column vector per non-input layer
biases = [np.random.randn(n_out,1) for n_out in layers[1:]]
reg_param = 0 #No regularisation right now
batch_size = 50
epochs = 10
step_size = 0.01

In [None]:
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + exp(-z)), applied element-wise.

    Computed as exp(-log(1 + exp(-z))) via np.logaddexp, which never evaluates
    exp() on a large positive number, so very negative inputs return 0 without
    an overflow warning.

    z: scalar or numpy array.
    Returns: value(s) in [0, 1] with the same shape as z.
    """
    return np.exp(-np.logaddexp(0, -z))

def sigmoid_derivative(z):
    """Derivative of the sigmoid evaluated at z: sigmoid(z) * (1 - sigmoid(z))."""
    activation = sigmoid(z)
    return activation * (1 - activation)

def feedforward(X,weights,biases):
    """Run a forward pass and record every layer's values.

    X: input array; it is transposed before use, so the cells in this notebook
       pass one example per row.
    weights, biases: per-layer parameter lists; one layer is applied per entry
       of `biases`.

    Returns (activations, weighted_inputs): `activations` starts with X.T and
    then holds the sigmoid output of each layer; `weighted_inputs` holds each
    layer's value before the sigmoid is applied.
    """
    layer_outputs = [X.T]
    pre_activations = []

    for idx, bias in enumerate(biases):
        #The most recent activation is the input to the current layer
        z = np.dot(weights[idx], layer_outputs[-1]) + bias
        pre_activations.append(z)
        layer_outputs.append(sigmoid(z))

    return layer_outputs,pre_activations

def cost_function(X,y,reg_param,weights,biases):
    """Compute the logistic (cross-entropy) cost and its gradients for one pass.

    X: inputs, forwarded to feedforward().
    y: 0/1 labels, one per example (1-D or a column vector).
    reg_param: L2 regularisation strength; 0 disables regularisation.
    weights, biases: per-layer parameter lists.

    Returns (cost, weights_gradient, biases_gradient). When reg_param is
    non-zero the L2 penalty is included in BOTH the cost and the weight
    gradients, so the gradients are those of the cost that is reported.
    """
    activations,weighted_inputs = feedforward(X,weights,biases)
    m = y.size

    #Keep predictions strictly inside (0, 1) so a saturated output cannot produce log(0)
    eps = 1e-12
    predictions = np.clip(activations[-1], eps, 1 - eps)

    #Logistic cost function
    cost = np.multiply(y.T, np.log(predictions)) + np.multiply((1-y).T, np.log(1-predictions))
    cost = np.sum(cost) / -m

    weights_gradient,biases_gradient = backpropagation(activations,weighted_inputs,weights,biases,y,m)

    #Regularisation: penalty reg_param/(2m) * sum(W^2) and its derivative reg_param/m * W
    if reg_param != 0:
        cost += reg_param / (2*m) * sum(np.sum(w ** 2) for w in weights)
        weights_gradient = [grad + reg_param / m * w for grad,w in zip(weights_gradient,weights)]

    return cost,weights_gradient,biases_gradient

def backpropagation(activations,weighted_inputs,weights,biases,y,m):
    """Compute weight and bias gradients by propagating the output error backwards.

    activations, weighted_inputs: the lists returned by feedforward().
    weights: per-layer weight matrices.
    biases: only its length is used (number of layers).
    y: labels, flattened to one value per example.
    m: divisor applied to every gradient (the caller passes the number of examples).

    Returns (weights_gradient, biases_gradient), ordered from the first layer
    to the last.
    """
    #Output layer: the error is simply (prediction - label), kept as a single row
    flat_error = activations[-1].reshape(y.size) - y.reshape(y.size)
    delta = flat_error.reshape((1, flat_error.size))

    #Gradients are collected from the last layer backwards and reversed at the end
    weight_grads = [np.dot(delta, activations[-2].T) / m]
    bias_grads = [np.sum(delta, axis=1) / m]

    #Earlier layers: push the error of the layer above back through that layer's
    #weights, then scale by the sigmoid slope at this layer's weighted input.
    for depth in range(1, len(biases)):
        delta = np.dot(weights[-depth].T, delta) * sigmoid_derivative(weighted_inputs[-depth - 1])

        bias_grads.append(np.sum(delta, axis=1) / m) #Sum across examples
        #A layer's weight gradient pairs its error with the activations feeding into it
        weight_grads.append(np.dot(delta, activations[-depth - 2].T) / m)

    weight_grads.reverse()
    bias_grads.reverse()
    return weight_grads,bias_grads

def miniBatchGradientDescent(X,y,batch_size,epochs,step_size,reg_param,weights,biases,shuffle=True):
    """Train with mini-batch gradient descent.

    X: numpy array of inputs, one example per row.
    y: numpy array of labels, one value per example.
    batch_size: number of examples per parameter update.
    epochs: number of passes over the data.
    step_size: learning rate.
    reg_param: L2 regularisation strength, forwarded to cost_function().
    weights, biases: initial per-layer parameters; the lists passed in are not modified.
    shuffle: when True (default) the examples are visited in a new random order
        every epoch. With a fixed order every epoch replays the identical batch
        sequence, which shows up as a cost pattern repeating once per epoch.

    Returns (cost_history, weights, biases) where cost_history holds the cost of
    every mini-batch in the order it was processed.
    """
    cost_history = []
    m = y.size

    for epoch in range(epochs):
        print(f"\n Started Training Epoch: {epoch+1}")
        order = np.random.permutation(m) if shuffle else np.arange(m)
        for start in range(0,m,batch_size):
            print(".", end="")

            #Get cost and gradients for the current batch (the last batch may be shorter)
            batch_idx = order[start:start + batch_size]
            X_batch = X[batch_idx]
            y_batch = y[batch_idx]
            cost,weights_gradient,biases_gradient = cost_function(X_batch,y_batch,reg_param,weights,biases)

            #Updates
            cost_history.append(cost)
            weights = [w - step_size * grad for w,grad in zip(weights,weights_gradient)]
            #Bias gradients come back 1-D; reshape to a column to match the bias vectors
            biases = [b - step_size * grad.reshape((grad.size, 1)) for b,grad in zip(biases,biases_gradient)]

    return cost_history,weights,biases

# Now I'll actually train and test the model

In [None]:
#Feature matrix = every column except the 'Exited' target; labels as a column vector
X_train = train.drop(columns=['Exited']).to_numpy()
y_train = train['Exited'].to_numpy().reshape((-1, 1))
cost_history,weights,biases = miniBatchGradientDescent(X_train,y_train,batch_size,epochs,step_size,reg_param,weights,biases)

import matplotlib.pyplot as plt
#Smooth the per-batch cost by averaging consecutive groups of `average_over` values
average_over = 5
averaged_cost = []
for i in range(0,len(cost_history),average_over):
    chunk = cost_history[i:i+average_over]
    #Divide by the real chunk length so a shorter final chunk is not under-reported
    avg = sum(chunk) / len(chunk)
    averaged_cost.append(avg)

plt.plot(range(len(averaged_cost)), averaged_cost, 'r-')
plt.xlabel("Num Iterations")
plt.ylabel("Cost")

I'm seeing the same cost pattern repeat in every epoch. I think this suggests that the model isn't making good parameter updates for the dataset as a whole and is instead optimising for individual batches.
It could also be that the model is overfitting SEVERELY.

So I'm going to create a regular gradient descent algorithm instead of using mini-batches. And then, I'll create learning curves to check bias vs. variance.

In [None]:
def gradientDescent(X,y,step_size,max_iters,reg_param,weights,biases):
    """Train with full-batch gradient descent for a fixed number of iterations.

    Every iteration evaluates cost_function() on all of X and y, records the
    cost, and then takes one step of size `step_size` against the gradients.

    Returns (cost_history, weights, biases); cost_history[i] is the cost
    measured before the i-th update was applied.
    """
    cost_history = []

    for _ in range(max_iters):
        print(".", end="")
        cost,weights_gradient,biases_gradient = cost_function(X,y,reg_param,weights,biases)
        cost_history.append(cost)

        #Updates
        weights = [w - step_size * grad for w,grad in zip(weights,weights_gradient)]
        #Bias gradients come back 1-D; reshape to a column to match the bias vectors
        biases = [b - step_size * grad.reshape((grad.size, 1)) for b,grad in zip(biases,biases_gradient)]

    return cost_history,weights,biases

Trying gradient descent without mini-batches.

In [None]:
#Feature matrix = every column except the 'Exited' target; labels as a column vector
X_train = train.drop(columns=['Exited']).to_numpy()
y_train = train['Exited'].to_numpy().reshape((-1, 1))
max_iters = 100

#Continue training from the current weights/biases using the whole training set per step
cost_history,weights,biases = gradientDescent(X_train,y_train,step_size,max_iters,reg_param,weights,biases)

#One point per iteration
plt.plot(np.arange(len(cost_history)), cost_history, 'r-')
plt.xlabel("Num Iterations")
plt.ylabel("Cost")

This is REALLY weird :O I've never seen the cost decrease linearly like this before. What am I doing wrong?

# Evaluating Accuracy and Learning Curves

In [None]:
#Misclassification accuracy
X_test = test.drop(columns=["Exited"]).to_numpy()
y_test = test["Exited"].to_numpy()

#Forward pass on the test features; the last activation holds the network's outputs
activations,weighted_inputs = feedforward(X_test,weights,biases)
#Threshold the outputs at 0.5 to obtain hard 0/1 predictions
pred = [int(p > 0.5) for p in activations[-1].reshape(-1)]

#1 where the prediction equals the label, 0 otherwise
correct = [int(p == t) for p,t in zip(pred,y_test)]
accuracy = sum(correct) / len(correct)
print(f"The model is {accuracy * 100}% accurate")

In [None]:
#Learning curves
#For a growing number of training examples m, train a model on the FIRST m examples
#and record (a) its cost on those m examples and (b) its cost on the whole test set.
#Every run restarts from the same freshly drawn parameters, so the points differ only
#in how much data was used, and the model trained in earlier cells is left untouched.
m_step = 50
m_levels = range(m_step,1000 + m_step,m_step) #Training-set sizes: 50, 100, ..., 1000
train_costs = []
test_costs = []

#Fresh random starting point with the same shapes as the current network
start_weights = [np.random.randn(*w.shape) for w in weights]
start_biases = [np.random.randn(*b.shape) for b in biases]

for m_examples in m_levels:
    X_subset = X_train[:m_examples]
    y_subset = y_train[:m_examples]

    _,curve_weights,curve_biases = \
        gradientDescent(X_subset,y_subset,step_size,max_iters,reg_param,start_weights,start_biases)

    #Evaluate both costs with the final parameters and without the regularisation
    #term, so the two curves measure prediction error on the same footing.
    train_cost,_,_ = cost_function(X_subset,y_subset,0,curve_weights,curve_biases)
    test_cost,_,_ = cost_function(X_test,y_test,0,curve_weights,curve_biases)

    train_costs.append(train_cost)
    test_costs.append(test_cost)

In [None]:
#Draw the training and test cost series against the values in m_levels
x_values = list(m_levels)
for series, line_format, series_name in ((train_costs, 'r-', "train_costs"), (test_costs, 'b-', "test_costs")):
    plt.plot(x_values, series, line_format, label=series_name)
plt.xlabel("Num examples")
plt.ylabel("Cost")
plt.legend()
plt.show()