In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the churn-modelling CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/churn-modelling/Churn_Modelling.csv')
df.head(5)

In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

From the output above it is clear that RowNumber, CustomerID and Surname are not required for training the model: they only identify a customer and will not play any role in the prediction.
So we will ignore them and use the rest of the columns as features.

In [None]:
# Positional split: columns 3..12 become the feature frame, column 13 becomes the target.
X, y = df.iloc[:, 3:13], df.iloc[:, 13]

In [None]:
# Sanity check: the feature frame and the target should have the same number of rows.
X.shape, y.shape

We also observed that Geography and Gender are categorical features, so we have to convert them to numerical form using dummy (one-hot) encoding.

In [None]:
# Distinct values of the Geography column (these become the dummy columns).
X.Geography.unique()

In [None]:
# Distinct values of the Gender column.
X['Gender'].unique()

In [None]:
# One-hot encode both categorical columns; drop_first removes one level per column
# so the remaining indicator columns are not redundant.
geo_cat, gender_cat = (
    pd.get_dummies(X[column], drop_first=True)
    for column in ("Geography", "Gender")
)

In [None]:
# Preview the indicator columns generated for Geography.
geo_cat.head()

In [None]:
# Append the Geography and Gender indicator columns to the feature frame.
# NOTE: X is rebuilt from itself, so re-running this cell alone appends the
# indicator columns a second time.
X = pd.concat(objs=[X, geo_cat, gender_cat], axis='columns')

In [None]:
# Preview X with the indicator columns appended (original Geography/Gender still present).
X.head()

Now we have to drop the original categorical columns, since we now have them in numerical form.


In [None]:
# Remove the raw text columns now that their indicator columns are in X.
X = X.drop(columns=['Geography', 'Gender'])

In [None]:
# Preview the feature frame after dropping the raw categorical columns.
X.head()

Now we split the dataset into training and test sets using sklearn.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

In [None]:
# Shapes of the four pieces produced by the train/test split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

Feature Scaling: 

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features with scikit-learn's StandardScaler.
sc = StandardScaler()

# Fit the scaler on the training split ONLY, then apply those same statistics to
# the test split. Calling fit_transform on X_test would re-fit the scaler on test
# data, so train and test would be scaled differently and information about the
# test set would leak into preprocessing.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [None]:
# X itself is not reassigned by the scaling cell; only X_train / X_test hold scaled values.
X.head()

In [None]:
# Display the scaled training features returned by the scaler.
X_train

# Step 2: Working on DL

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
# from keras.layers import LeakyReLU, PReLU, ELU
# from keras.layers import Dropout

In [None]:
# Initialise the ANN as an empty Sequential model; layers are added one by one in the cells below.
clf = Sequential()

While defining a Dense layer we have to pass some parameters / arguments, such as:
* `units` defines how many neurons we want in the layer. Here we are taking it as 6. This is just a random guess; using hyperparameter optimization we may find a better number of neurons.
* `kernel_initializer` - Initializer for the kernel weights matrix. Here we are going with `he_uniform`, as this works well with the relu activation function.
* `activation` - using relu. Generally relu or leaky relu is used for hidden layers, and sigmoid or softmax for the output layer. ReLU helps to mitigate the vanishing gradient problem.
* `input_dim` - the number of features we are passing to the model, which is 11 here. Check X_train.shape

In [None]:
# Adding the input layer and the first hidden layer.
# The input width is taken from the training matrix instead of being hard-coded,
# so the model keeps matching the data if preprocessing adds or removes a column.
clf.add(Dense(units = 6, kernel_initializer = 'he_uniform', activation = 'relu', input_dim = X_train.shape[1]))
# Note: older Keras code spelled these arguments `output_dim` and `init`;
# see https://keras.io/api/layers/core_layers/dense/ for the current names.

As we are defining the second layer we will not require the `input_dim`.

In [None]:
# Second hidden layer: 6 ReLU units with He-uniform weights (input size is inferred from the previous layer).
clf.add(
    Dense(units=6, activation='relu', kernel_initializer='he_uniform')
)

In [None]:
# Output layer: a single sigmoid unit, giving one value between 0 and 1 per sample.
clf.add(
    Dense(units=1, activation='sigmoid', kernel_initializer='glorot_uniform')
)

So far we have built a model with 1 input layer, 2 hidden layers and 1 output layer.
The input layer has 11 neurons (one per feature), the first and second hidden layers have 6 neurons each, and the output layer has just one neuron.

In [None]:
# Print the model summary for the classifier built above.
clf.summary()

Once the model is created, you can config the model with losses and metrics with `model.compile()`, train the model with `model.fit()`, or use the model to do prediction with `model.predict()`.
Refer https://keras.io/api/models/model/

So to compile as well we have to set some parameters such as 
* `optimizer` - adam is one of the most popular one so using it. Other optimizer can be found at https://keras.io/api/optimizers/
* `loss` - The purpose of loss functions is to compute the quantity that a model should seek to minimize during training. Based on the problem we are solving there are various loss functions. Here we are using binary_crossentropy. Use this cross-entropy loss when there are only two label classes (assumed to be 0 and 1). For each example, there should be a single floating-point value per prediction. Refer site for more details https://keras.io/api/losses/ 
* `metrics` - The compile() method takes a metrics argument, which is a list of metrics. A metric is a function that is used to judge the performance of your model. Metric functions are similar to loss functions, except that the results from evaluating a metric are not used when training the model. Note that you may use any loss function as a metric. Various metrics are available, out of which we are using accuracy. Refer https://keras.io/api/metrics/

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy reported as the metric.
clf.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In the `model.fit()` method we have some parameters which we are going to use.
   
* X : our training data. Can be a vector, array or matrix.
* Y : our training labels. Can be a vector, array or matrix.
* validation_split : the fraction of the provided data to hold out for validation; the rest is used for training.
* batch_size : an integer or None; by default it is set to 32. It specifies the number of samples per gradient update.
* epochs : an integer, the number of epochs we want to train our model for.
* verbose : specifies the verbosity mode (0 = silent, 1 = progress bar, 2 = one line per epoch).
* shuffle : whether we want to shuffle our training data before each epoch.
* steps_per_epoch : the total number of steps (batches) taken before one epoch is considered finished and the next epoch starts. By default its value is None.

In [None]:
# Train for 100 epochs in mini-batches of 10, holding out 33% of the training data
# for validation. The returned History object is kept for plotting.
clf_history = clf.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=10,
    validation_split=0.33,
)

From the output above we can see that the model is being trained, and that the validation accuracy `val_accuracy` is displayed after each epoch.
We have `loss` and `accuracy` calculated on the training data, and `val_loss` and `val_accuracy` calculated on the validation data.
Comparing the two accuracies, we can say that the model is doing pretty well: they are close to each other, with no huge difference. A huge gap between them would indicate a problem such as overfitting.

Let's see what our clf_history is holding.

In [None]:
# Show the object returned by clf.fit().
clf_history

for `tf.keras.callbacks.History` refer https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/History 

In [None]:
# List the keys recorded in the training history.
clf_history.history.keys()

This looks familiar: it is a dictionary.
It holds the same values we saw at run time: loss, accuracy, val_loss and val_accuracy.
Let's use it to plot a graph.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot training accuracy first, then validation accuracy, one point per epoch.
for series in ('accuracy', 'val_accuracy'):
    plt.plot(clf_history.history[series])

plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc = 'best')
plt.show()

Other options for loc in legend:
    best
	upper right
	upper left
	lower left
	lower right
	right
	center left
	center right
	lower center
	upper center
	center

From visualization we can see that it was increasing...

Similar to Accuracy.. we can visualize for Loss.

In [None]:
# Plot training loss first, then validation loss, one point per epoch.
for series in ('loss', 'val_loss'):
    plt.plot(clf_history.history[series])

plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc = 'best')
plt.show()

# Using Test Dataset to predict
Now lets test the model with our test dataset.

In [None]:
# Run the trained model on the test features; the sigmoid output layer yields values between 0 and 1.
y_pred = clf.predict(X_test)

In [None]:
# Inspect the raw model outputs before thresholding.
y_pred

In [None]:
# Convert the model outputs to class labels: True when the output exceeds 0.5, otherwise False.
# NOTE: this overwrites the raw outputs held in y_pred.
y_pred = y_pred > 0.5
y_pred

In [None]:
# Build the confusion matrix of true labels vs. thresholded predictions on the test set.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the confusion matrix computed from y_test and y_pred.
cm

Calculate the accuracy on test dataset

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score takes (y_true, y_pred); pass the ground truth first, matching the
# confusion_matrix call, so the arguments are not silently swapped if this line is
# later adapted to an asymmetric metric.
score = accuracy_score(y_test, y_pred)

score

So we have seen that the accuracy on

* Train Dataset was = 0.8690 
* Validation Dataset = 0.8614
* Test Dataset = 0.859

All three are close to each other, which is a good sign that the model generalizes well; a large gap between the training accuracy and the validation/test accuracy would point to an overfitting issue.

We can play around with the parameters in the model.

This time let's use 3 hidden layers and change the number of neurons in each hidden layer.
We also change the kernel_initializer.

In [None]:
# Second experiment: three wider hidden layers with He-normal initialisation.
clf2 = Sequential()

# Adding the input layer and the first hidden layer.
# The input width comes from the training matrix rather than a hard-coded 11,
# so the model stays in sync with the preprocessing.
clf2.add(Dense(units = 10, kernel_initializer = 'he_normal', activation = 'relu', input_dim = X_train.shape[1]))


# Adding the second hidden layer
clf2.add(Dense(units = 20, kernel_initializer = 'he_normal', activation = 'relu' ))

# Adding the third hidden layer
clf2.add(Dense(units = 15, kernel_initializer = 'he_normal', activation = 'relu' ))


# Adding the output layer (single sigmoid unit for the binary target)
clf2.add(Dense(units = 1, kernel_initializer = 'glorot_uniform', activation = 'sigmoid'))

# Compiling the model
clf2.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fit the model with the same training settings as the first model, for comparison.
clf2_history = clf2.fit(X_train, y_train, validation_split = 0.33, batch_size = 10, epochs = 100)



In [None]:
# Evaluate the second model on the test set using the same 0.5 threshold.
y_pred = clf2.predict(X_test)
y_pred = (y_pred > 0.5)
# accuracy_score takes (y_true, y_pred): ground truth first.
score = accuracy_score(y_test, y_pred)
score

With the change of parameters, we observe that the accuracy has changed.

* Train Dataset was = 0.8690 --> 0.8750
* Validation Dataset = 0.8614 --> 0.8504 (it went down, but only by a very small amount.)
* Test Dataset = 0.859 --> 0.845  (it went down, but only by a very small amount.)

Be sure not to add too many hidden layers, as that can lead to overfitting the data.

We can also try adding a dropout layer after each hidden layer, which randomly drops a fraction of the neurons (the dropout rate) during training.

In [None]:
from keras.layers import Dropout

In [None]:
# Third experiment: same architecture as clf2, with a Dropout layer after each hidden layer.
clf3 = Sequential()

# Adding the input layer and the first hidden layer.
# The input width comes from the training matrix rather than a hard-coded 11,
# so the model stays in sync with the preprocessing.
clf3.add(Dense(units = 10, kernel_initializer = 'he_normal', activation = 'relu', input_dim = X_train.shape[1]))

# Add dropout layer
clf3.add(Dropout(0.3)) # dropout rate of 0.3; an untuned starting value

# Adding the second hidden layer
clf3.add(Dense(units = 20, kernel_initializer = 'he_normal', activation = 'relu' ))

# Add dropout layer
clf3.add(Dropout(0.4)) # dropout rate of 0.4; an untuned starting value

# Adding the third hidden layer
clf3.add(Dense(units = 15, kernel_initializer = 'he_normal', activation = 'relu' ))

# Add dropout layer
clf3.add(Dropout(0.2)) # dropout rate of 0.2; an untuned starting value

# Adding the output layer (single sigmoid unit for the binary target)
clf3.add(Dense(units = 1, kernel_initializer = 'glorot_uniform', activation = 'sigmoid'))

# Compiling the model
clf3.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fit the model with the same training settings as the earlier models, for comparison.
clf3_history = clf3.fit(X_train, y_train, validation_split = 0.33, batch_size = 10, epochs = 100)
