In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"

In [None]:
# Load the dataset.
#
# CSV is one of the most common and convenient formats for tabular data,
# and pandas reads it directly with read_csv.
#
# **General trivia**
# The path below is relative: the special double-dot (..) moves one directory
# up, so "../input/..." reaches the input folder without spelling out the
# 'kaggle' prefix every time.
CHURN_CSV = "../input/deep-learning-az-ann/Churn_Modelling.csv"
data = pd.read_csv(CHURN_CSV)

# First look at the dataset: head() displays the first five rows by default.
data.head()

In [None]:
# Returns the dimensions of the DataFrame as a (rows, columns) tuple.
# Gives an idea of how many rows and columns (features plus label)
# are available before any preprocessing or training.

data.shape

In [None]:
# Returns summary statistics (count, mean, std, min, quartiles, max) per column.
# Note: for a frame with mixed dtypes, describe() summarizes only the numeric
# columns by default.
# Common idea - see how the values range in each numeric feature.

data.describe()

In [None]:
# Returns the data type of each column.
# Common idea - decide which columns are categorical (object)
# and which are numeric (int/float),
# which in turn determines the preprocessing each column needs.

data.dtypes

In [None]:
# Returns the number of null (missing) values in each column.
# Common idea - decide which columns need missing-value treatment.

data.isnull().sum()

In [None]:
# Another look at the first five rows before transforming any columns.
data.head()

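The main idea from here onwards is to prepare the features for classification, since we are trying to classify customers as churned or not: numeric values can be grouped into categories (binning), and categorical values can be turned into numeric indicator columns (one-hot encoding).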

Methods that we will try in this dataset -

1. Binning
2. One Hot Encoding

In [None]:
# value_counts() returns how often each value occurs in a column.
# The `bins` parameter is underrated: instead of counting exact values,
# it groups a numeric column into that many intervals and counts the rows
# falling into each one (here CreditScore is split into 3 intervals).

data.CreditScore.value_counts(bins=3)

In [None]:
# Number of distinct values in the Geography column
# (i.e. how many categories an encoding has to represent).

data.Geography.nunique()

In [None]:
# Create dummy (one-hot) variables for Gender and Geography.
# Idea behind this - if 'Geography' were label encoded as 0, 1, 2 the numbers
# would imply an ordering that has no meaning, so the model could not use the
# column properly.
# Label encoding 'Gender' (only two categories) would also make sense and can
# be done as an alternative.
#
# drop_first=True drops the first category of each column, since it is implied
# when all remaining indicators are 0.
# dtype=int keeps the indicators as numeric 0/1 columns. pandas >= 2.0 returns
# bool columns by default, and a frame that mixes bool with int/float columns
# converts to an object array, which the model-fitting step may not accept.

gender_cat = pd.get_dummies(data['Gender'], drop_first=True, dtype=int)
geo_cat = pd.get_dummies(data['Geography'], drop_first=True, dtype=int)

# Append the indicator columns to the original frame, side by side.
data = pd.concat([data, gender_cat, geo_cat], axis=1)

In [None]:
# Good practice: check the result of the update made by the cell above
# (the new dummy columns should appear at the right of the frame).
data.head()

In [None]:
# Drop the columns that are of no further use: the original categorical
# columns (already represented by their dummy columns) and the
# identifier-like columns.
unused_columns = ['Gender', 'Geography', 'RowNumber', 'CustomerId', 'Surname']
data = data.drop(columns=unused_columns)

In [None]:
# Check the frame again after the unused columns have been dropped.
data.head()

In [None]:
# The features to train on go into X and the corresponding labels
# (here 'Exited') go into y.

from sklearn.model_selection import train_test_split

X = data.drop(columns=['Exited'])
y = data['Exited']

# Split into training and testing sets; test_size sets the held-out fraction.
# With shuffle=False the rows keep their original order (the first 70% become
# the training set, the last 30% the test set), and random_state has no effect
# unless shuffling is enabled.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    shuffle=False,
)

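* **Standardization** -   *we center the feature columns at mean 0 with standard deviation 1, which puts all features on a comparable scale and makes it easier to learn the weights. (Standardization only shifts and rescales a feature; it does not change the shape of its distribution, so it does not by itself make a feature normally distributed.)*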

In [None]:
# Scaling is a crucial step to get apt results when the value ranges of the
# columns differ by a large amount.

# StandardScaler and MinMaxScaler are the more common choices when dealing with
# continuous numerical data.
from sklearn.preprocessing import StandardScaler

# Columns to be standardized (the 0/1 indicator columns are left as they are).
NUMERIC_COLS = ['Age', 'Tenure', 'Balance', 'CreditScore', 'EstimatedSalary', 'NumOfProducts']

ss = StandardScaler()

# Work on explicit copies so the assignments below modify frames we own
# (this also avoids pandas' SettingWithCopyWarning on the split outputs).
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the mean / standard deviation from the TRAINING data only ...
X_train[NUMERIC_COLS] = ss.fit_transform(X_train[NUMERIC_COLS])
# ... and reuse those same statistics for the test data. Re-fitting the scaler
# on the test set would scale it differently from the data the model is
# trained on and would leak test-set information into preprocessing.
X_test[NUMERIC_COLS] = ss.transform(X_test[NUMERIC_COLS])

In [None]:
#Importing the libraries we need to build a neural network

from tensorflow import keras
from tensorflow.keras import layers , Sequential 
from keras.layers import Dense

**We can piece it all together by adding each layer:**

* The model expects samples of data with 11 features, as specified by the `input_shape = [11]` argument.
* The first and second hidden layers each comprise 8 nodes and use the relu activation function.
* The output layer has 1 node and uses sigmoid activation function.

Choosing the number of neurons for each hidden layer is largely intuitive.
Using too many neurons in the hidden layers may result in overfitting.

**A few rules of thumb that one can consider for determining an acceptable number of neurons to use in the hidden layers -**

* The number of hidden neurons should be between the size of the input layer and the size of the output layer.
* The number of hidden neurons should be 2/3 the size of the input layer, plus the size of the output layer.
* The number of hidden neurons should be less than twice the size of the input layer.

These three rules can give you a good start, and it eventually comes down to intuitive reasoning and trial and error in selecting the number of neurons.


In [None]:

# Build the network one layer at a time: two hidden layers of 8 relu units
# followed by a single sigmoid output unit. The first hidden layer also
# declares the expected input: 11 features per sample.
model = Sequential()
model.add(layers.Dense(8, activation='relu', input_shape=[11]))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
      

**NOTE** - *The most confusing thing here is that the shape of the input to the model is defined as an argument on the first hidden layer. This means that the line of code that adds the first Dense layer is doing 2 things, defining the input or visible layer and the first hidden layer.*

Once the model gets defined it can now be compiled. 

* **optimizer** - adam (an adaptive variant of the stochastic gradient descent algorithm), because it automatically tunes its step sizes and gives good results on a wide range of problems.

* **loss** - binary crossentropy ( to evaluate the error in current state of the model which will be estimated repeatedly )

* **metrics** = reports the classification accuracy

In [None]:
# Configure training: adam optimizer, binary cross-entropy loss (matching the
# single sigmoid output), and accuracy reported as the metric.
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])


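Once the model is compiled, it's ready to be trained.

An **epoch** is one full pass over the training data. Training can be thought of as a for-loop over the epochs with a nested for-loop inside it that iterates over each batch of samples, where one batch has the specified “batch size” number of samples.

A **batch** is analogous to a for-loop iterating over one or more samples and making predictions; at the end of the batch, the predictions are compared to the expected outputs and the model's weights are updated.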

These configurations can be chosen by trial and error.


In [None]:
# Train for 100 epochs in batches of 25 samples; verbose = 0 suppresses the
# per-epoch progress output.
model.fit(X_train, y_train, batch_size = 25, epochs = 100,verbose = 0)


In [None]:
# Evaluating on the training data.
# evaluate() returns the loss followed by the metrics the model was compiled
# with (here: accuracy). No separate predict() call is needed for this, so the
# unused prediction pass over the training set has been removed.
score, acc = model.evaluate(X_train, y_train, batch_size=10)
print('Train score:', score)
print('Train accuracy:', acc*100)


In [None]:
# Evaluating on the test data.
# evaluate() returns the loss followed by the metrics the model was compiled
# with (here: accuracy). No separate predict() call is needed for this, so the
# unused prediction pass over the test set has been removed.
score, acc = model.evaluate(X_test, y_test, batch_size=10)
print('Test score:', score)
print('Test accuracy:', acc*100)


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error


# Turn the predicted probabilities into hard 0/1 labels with a 0.5 threshold,
# and flatten the single-column model output to a 1-D vector.
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int).ravel()

# Rows are the true classes, columns the predicted classes; labels=[0, 1]
# guarantees a 2x2 matrix even if one class is never predicted.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)

# For binary labels ordered [0, 1] the flattened matrix is (TN, FP, FN, TP).
tn, fp, fn, tp = cm.ravel()
accuracy = (tn + tp) / cm.sum()
print("Confusion Matrix Accuracy: "+ str(accuracy*100)+"%")

#F1 score
# Computed for the positive class (Exited == 1, i.e. churned customers):
#   recall    = TP / (TP + FN)
#   precision = TP / (TP + FP)
# Using cm[0][0] here would instead score the negative (non-churn) class.
# Each ratio falls back to 0.0 when its denominator is zero.
recall = tp / (tp + fn) if (tp + fn) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
F1 = (2*recall*precision)/(precision+recall) if (precision + recall) else 0.0
print("F1 Score:"+str(F1))

#MAE
# On 0/1 labels the mean absolute error equals the misclassification rate.
mae=mean_absolute_error(y_test, y_pred)
print("MAE:"+str(mae))
