# **Import Python Packages**

In [None]:
# Import python packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# **Load Dataset**

In [None]:
# Load the dataset
data = pd.read_csv('../input/predicting-churn-for-bank-customers/Churn_Modelling.csv')

# **Exploratory Data Analysis (EDA)**

## **Basic Statistical and EDA Operations**

In [None]:
# Check info of data
data.info()

**Insights:**
1) There are no null values
2) There are 3 features with object data type

In [None]:
# Check Descriptive information of dataset features
data.describe(include='all').T

**Insights:**

1) There are 2932 unique surnames.

2) At least 25% of the observations in the above dataset have a zero Balance (the value counts further below give the exact share). This needs further investigation.

In [None]:
#Print head of dataset
data.head()

In [None]:
# Print tail (last 5 rows) of dataset
data.tail()

In [None]:
# Print 10 random observation of dataset
data.sample(10)

**Insights:**

1) Based on the 3 unique values in the Geography feature and the 10 random samples above, we can infer that its values are Spain, France and Germany.

2) Looking at the features, CustomerId and Surname do not appear to contribute to predicting the dependent variable 'Exited'.

In [None]:
#Check for duplicate records
data.duplicated().sum()

**Insights:**
1) There are no duplicate records

In [None]:
# Check for null values
data.isnull().sum()

**Insights:**

1) There are no null values.

In [None]:
# Check for number of unique values in features
data.nunique()

**Insights:**

1) Geography, Gender, NumOfProducts, HasCrCard, IsActiveMember and Exited features are categorical.

In [None]:
# Check value count for dataset features
for feature in data.columns:
  print(data[feature].value_counts(normalize=True))

**Insights:**

1) Based on value count of **Geography feature**, around 50% of observation are from France.

2) Based on **Gender** value count, number of observations are distributed equally among Male and Female.

3) Based on **Balance** value count, 36% of observation is having 0 Balance.

4) Based on **NumOfProducts** value count, Majority of accounts, bank account affiliated products owned by Customers are 1 and 2.

5) Based on **HasCrCard** value count, around 71% of observations has credit card.

6) Based on **IsActiveMember** value count, observations are equally distributed.

7) Based on **Exited** value count, 80% of observation have not exited the bank services.

## **Univariate and Bivariate Analysis**

In [None]:
data.columns

In [None]:
# One box plot per continuous feature, each on its own 5x5 figure, to spot outliers visually
continuousFeatures = ['CreditScore', 'Tenure', 'Age', 'Balance', 'EstimatedSalary']
for feature in continuousFeatures:
  fig, ax = plt.subplots(figsize=(5, 5))
  sns.boxplot(x = feature, data = data, ax = ax)

**Insights:**

There are outliers in CreditScore feature.

In [None]:
# Quantify CreditScore outliers with the 1.5 * IQR rule
creditScore = data['CreditScore']
quantile25 = creditScore.quantile(0.25)
quantile75 = creditScore.quantile(0.75)
iqr = quantile75 - quantile25

# Whisker positions: values outside [lowWhisker, upperWhisker] are counted as outliers.
# Both names are reused by the outlier-treatment cell that follows.
whiskerReach = 1.5 * iqr
lowWhisker = quantile25 - whiskerReach
upperWhisker = quantile75 + whiskerReach

lowOutliersCount = int((creditScore < lowWhisker).sum())
upperOutliersCount = int((creditScore > upperWhisker).sum())

percentageOfOutliers = ((lowOutliersCount + upperOutliersCount) / len(data)) * 100

print ( "Percentage of outliers in CreditScore feature : {0}% " .format(percentageOfOutliers))

**Insights:**

The percentage of outliers in the CreditScore feature is quite low. Let's treat these outliers by replacing them with the boundary values instead of removing the observations, because all of these observations have an Exited value of 1 and observations with an Exited value of 1 make up only 20% of the dataset.

In [None]:
# Treat Credit Score outliers by replacing them with the whisker (boundary) values.
# The clipped Series is assigned back to the column on purpose: an in-place clip on
# `data.CreditScore` operates on a temporary Series, which does not reliably update the
# DataFrame (chained-assignment warning, and no effect under pandas Copy-on-Write).
data['CreditScore'] = data['CreditScore'].clip(lower = lowWhisker, upper = upperWhisker)

## **Add new features by classifying Age, Balance and Credit Scores**

In [None]:
# Check descriptive information of Age feature
data.Age.describe()

In [None]:
# Organize Age in 2 groups based on range: Adult (17, 65] and Elderly (65, 93]
ageBinEdges = [17, 65, 93]
ageBinLabels = ['Adult', 'Elderly']
data['AgeGroup'] = pd.cut(data['Age'], bins=ageBinEdges, labels=ageBinLabels)

In [None]:
# Print head of dataframe
data.head()

In [None]:
# Bar plot of Different Age Group Vs Exited feature
sns.countplot(x = 'AgeGroup', data = data[data.Exited == 1])

**Insights:**

The majority of the customers who exited are in the Adult age group (17 - 65).

In [None]:
# Check descriptive information of CreditScore feature
data.CreditScore.describe()

In [None]:
# Classify Credit Score into 5 ordinal groups (codes 0-4) using FICO-style boundaries:
# 0: (300, 579], 1: (579, 669], 2: (669, 739], 3: (739, 799], 4: (799, 900]
ficoBinEdges = [300, 579, 669, 739, 799, 900]
ficoGroupCodes = [0, 1, 2, 3, 4]
# Cast the categorical codes to plain integers so the group is treated as a numeric feature
data['CreditScoreGroup'] = pd.cut(data['CreditScore'], bins=ficoBinEdges, labels=ficoGroupCodes).astype(int)

In [None]:
# Bar plot of Different Credit Score Group Vs Exited feature
sns.countplot(x = 'CreditScoreGroup', data = data[data.Exited == 1])

**Insights:**

The majority of the customers who exited the bank are in the lower credit score groups (Very poor and Fair).

In [None]:
# Bar plot of Different Credit Score Group Vs Exited feature
plt.figure(figsize=(10, 10))
sns.countplot( x = 'CreditScoreGroup', data = data[data.Exited == 1], hue = 'AgeGroup')

**Insights:**

Above plot shows that Adult Age group is in majority when it comes to leaving the bank services across all credit score groups.

In [None]:
# Organize Balance in 2 groups: BalanceGroup is 1 when Balance > 0, otherwise 0
data['BalanceGroup'] = (data['Balance'] > 0).astype('int64')
# Bar chart of the share of zero vs non-zero balance accounts
data['BalanceGroup'].value_counts(normalize=True).plot(kind = 'bar')

**Insights:**

Around 36% of observations are having balance value as 0.

## **Plotting feature with continuous values vs Exited feature**

In [None]:
# Draw box plots to check the relation between Exited and the continuous features in dataset,
# and print the 25th-75th percentile range of each feature separately for each Exited class
for feature in ['CreditScore', 'Tenure', 'Age', 'Balance', 'EstimatedSalary']:
  plt.figure(figsize=(5, 5))
  sns.boxplot(x = 'Exited', y = feature, data = data)

  # Report customers who exited (1) first, then customers who stayed (0)
  for exitedValue in (1, 0):
    featureValues = data.loc[data.Exited == exitedValue, feature]
    print ("50% of observation lies in feature {0} between {1} and {2} based on 25th and 75th percentile when Exited value is {3}" .format(feature, featureValues.quantile(0.25), featureValues.quantile(0.75), exitedValue))


**Insights:**

1) 50% of observation lies in feature CreditScore between 578.0 and 716.0 based on 25th and 75th percentile when Exited value is 1

2) 50% of observation lies in feature CreditScore between 585.0 and 718.0 based on 25th and 75th percentile when Exited value is 0

3) 50% of observation lies in feature Tenure between 2.0 and 8.0 based on 25th and 75th percentile when Exited value is 1

4) 50% of observation lies in feature Tenure between 3.0 and 7.0 based on 25th and 75th percentile when Exited value is 0

5) 50% of observation lies in feature Age between 38.0 and 51.0 based on 25th and 75th percentile when Exited value is 1

6) 50% of observation lies in feature Age between 31.0 and 41.0 based on 25th and 75th percentile when Exited value is 0

7) 50% of observation lies in feature Balance between 38340.02 and 131433.33 based on 25th and 75th percentile when Exited value is 1

8) 50% of observation lies in feature Balance between 0.0 and 126410.28 based on 25th and 75th percentile when Exited value is 0

9) 50% of observation lies in feature EstimatedSalary between 51907.72 and 152422.91 based on 25th and 75th percentile when Exited value is 1

10)50% of observation lies in feature EstimatedSalary between 50783.49 and 148609.95500000002 based on 25th and 75th percentile when Exited value is 0

## **Plotting feature with categorical values vs Exited feature**

In [None]:
# Draw count plots to check the relation between Exited and the categorical features in dataset
categoricalFeatures = ['Geography', 'Gender', 'NumOfProducts', 'HasCrCard',
                       'IsActiveMember', 'AgeGroup', 'CreditScoreGroup']

# Split once by target class; the subsets are reused for every feature below
stayedCustomers = data[data.Exited == 0]
exitedCustomers = data[data.Exited == 1]

for feature in categoricalFeatures:
  fig, ax = plt.subplots(figsize=(5, 5))
  sns.countplot(x = feature, data = data, hue = 'Exited', ax = ax)

  # Share of observations per category of the current feature within each Exited class (0 and 1)
  stayedShare = stayedCustomers[feature].value_counts(normalize = True)
  exitedShare = exitedCustomers[feature].value_counts(normalize = True)
  print("================================================================")
  print("(Exited == 0) =======>>>>\n{0}"  .format(stayedShare))
  print("(Exited == 1) =======>>>>\n{0}"  .format(exitedShare))

**Insights:**

1) Around 80% of the customers who left the bank (Exited == 1) are from France or Germany.

2) Around 69% of the customers who left the bank (Exited == 1) own a single bank product.

3) Around 70% of the customers who left the bank (Exited == 1) own a credit card.

4) Around 64% of the customers who left the bank (Exited == 1) are not active members.

5) Around 98% of the customers who left the bank (Exited == 1) are in the Adult age group.

6) Around 82% of the customers who left the bank (Exited == 1) are in the Very poor, Fair and Good credit score groups.

## **Multivariate Analysis**

In [None]:
# Check Exited feature based on different category of BalanceGroup, Gender and CreditScoreGroup feature.
# catplot(kind='count') places the Exited = 0 / 1 bars side by side in every facet; mapping
# sns.countplot through FacetGrid(hue=...) draws them on top of each other, so the taller bar
# can hide the other one.
g = sns.catplot(data = data, kind = 'count', x = 'CreditScoreGroup', order = [0, 1, 2, 3, 4],
                hue = 'Exited', hue_order = [0, 1],
                row = 'BalanceGroup', row_order = [0, 1],
                col = 'Gender', col_order = ['Male', 'Female'],
                height = 3, aspect = 2)

**Insights:**

1) Customers with a lower CreditScore (between Very poor and Good) are more likely to leave bank services.
2) Customers are more likely to leave bank services when the Balance is non-zero than when the account has a zero balance.
3) The percentage of Female customers leaving bank services is higher than the percentage of Male customers.

In [None]:
# Check Exited feature based on different category of BalanceGroup, Gender and Geography feature.
# catplot(kind='count') places the Exited = 0 / 1 bars side by side in every facet; mapping
# sns.countplot through FacetGrid(hue=...) draws them on top of each other, so the taller bar
# can hide the other one.
g = sns.catplot(data = data, kind = 'count', x = 'Geography', order = ['France', 'Spain', 'Germany'],
                hue = 'Exited', hue_order = [0, 1],
                row = 'BalanceGroup', row_order = [0, 1],
                col = 'Gender', col_order = ['Male', 'Female'],
                height = 3, aspect = 2)

**Insights:**

1) A higher percentage of Female customers from Germany leave bank services when the account balance is non-zero.
2) The percentage of customers leaving bank services is lower in France than in Spain and Germany.

In [None]:
# Check Exited feature based on different category of BalanceGroup, Gender and AgeGroup feature.
# catplot(kind='count') places the Exited = 0 / 1 bars side by side in every facet; mapping
# sns.countplot through FacetGrid(hue=...) draws them on top of each other, so the taller bar
# can hide the other one.
g = sns.catplot(data = data, kind = 'count', x = 'AgeGroup', order = ['Adult', 'Elderly'],
                hue = 'Exited', hue_order = [0, 1],
                row = 'BalanceGroup', row_order = [0, 1],
                col = 'Gender', col_order = ['Male', 'Female'],
                height = 3, aspect = 2)

**Insights:**

1) Percentage of Adult customers leaving bank services is higher in comparison to Elderly customers.

In [None]:
# Check Exited feature based on different category of BalanceGroup, Gender and IsActiveMember feature.
# catplot(kind='count') places the Exited = 0 / 1 bars side by side in every facet; mapping
# sns.countplot through FacetGrid(hue=...) draws them on top of each other, so the taller bar
# can hide the other one.
g = sns.catplot(data = data, kind = 'count', x = 'IsActiveMember', order = [0, 1],
                hue = 'Exited', hue_order = [0, 1],
                row = 'BalanceGroup', row_order = [0, 1],
                col = 'Gender', col_order = ['Male', 'Female'],
                height = 3, aspect = 2)

**Insights:**

1) Percentage of Non Active customers leaving bank services is higher in comparison to active customers.

In [None]:
# Check Exited feature based on different category of NumOfProducts, Gender and HasCrCard feature.
# catplot(kind='count') places the Exited = 0 / 1 bars side by side in every facet; mapping
# sns.countplot through FacetGrid(hue=...) draws them on top of each other, so a taller
# Exited = 1 bar hides the Exited = 0 bar.
g = sns.catplot(data = data, kind = 'count', x = 'HasCrCard', order = [0, 1],
                hue = 'Exited', hue_order = [0, 1],
                row = 'Gender', row_order = ['Male', 'Female'],
                col = 'NumOfProducts', col_order = [1, 2, 3, 4],
                height = 3, aspect = 1)

In [None]:
# Check value count of Exited feature for customer owns 3 or 4 number of products
data[data.NumOfProducts.isin([3, 4])].Exited.value_counts(normalize = True)

**Insights:**

1) Customers having NumOfProducts as 3 OR 4 are more likely to leave bank services.

## **Draw Pairplot**

In [None]:
# Univariate and Bivariate analysis using Pairplot
sns.pairplot(data = data, corner=True, diag_kind='kde')

**Insights:**

1) In Diagonal plots, there are no outliers 

## **Draw Heatmap to check for multicollinearity and relationship between two variables**

In [None]:
# Check Heatmap to check for collinearity.
# Correlation is only defined for numeric columns; the frame still contains object/category
# columns at this point (Surname, Geography, Gender, AgeGroup), and DataFrame.corr() raises on
# them in pandas >= 2.0, so restrict the frame to numeric columns explicitly.
plt.figure(figsize=(10, 10))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap='YlGnBu')

**Insights:**

1) CreditScoreGroup vs CreditScore and BalanceGroup vs Balance are highly correlated. Drop one feature from each pair.

## **Eliminating features based on collinearity and large unique values in some of the features**

**1) Drop CustomerId, Surname and RowNumber features because they have a large number of unique values and do not impact the target variable (Exited)**

**2) Drop CreditScore and Balance feature to address multicollinearity**

In [None]:
# Identifier-like features (CustomerId, Surname and RowNumber) to remove
identifierColumns = ['CustomerId', 'Surname', 'RowNumber']

# Raw features that are highly correlated with their derived group features as per Heatmap
# (CreditScore vs CreditScoreGroup, Balance vs BalanceGroup)
correlatedColumns = ['CreditScore', 'Balance']

data = data.drop(columns = identifierColumns + correlatedColumns)

In [None]:
# Print 10 random sample from dataset
data.sample(10)

In [None]:
# Check shape of dataset
data.shape

In [None]:
# Check datatype of features in dataset
data.info()

## **Converting object data type to Categorical data type**

In [None]:
# Convert the features with object data type (Geography and Gender) to categorical data type
for feature in ['Geography', 'Gender']:
  data[feature] = pd.Categorical(data[feature])

In [None]:
# Check datatype of features in dataset
data.info()

# **Preparing data for model**

## **Splitting dataset into dependent and independent features (X and Y)**

In [None]:
# Split dataset feature into Dependent and Independent variables
X = data.drop(columns=['Exited'])
y = data['Exited']

## **One Hot Encoding on Categorical features**

In [None]:
# Apply One Hot Encoding to Geography, Gender and AgeGroup categorical variables
categorical_cols = ['Geography', 'Gender', 'AgeGroup']
X = pd.get_dummies(X, columns = categorical_cols, drop_first = True)

## **Splitting dataset into training and testing dataset**

In [None]:
# Import python package to import train_test_split
from sklearn.model_selection import train_test_split

In [None]:
# Split dataset into train and test dataset in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = True, random_state = 42)

In [None]:
# Check shape of training dataset (indepenedent feature)
X_train.shape

In [None]:
# Check shape of test dataset (indepenedent feature)
X_test.shape

In [None]:
# Check value count of dependent feature and distribution of training dataset based on target variable (Exited)
y_train.value_counts(normalize=True)

In [None]:
# Check value count of dependent feature and distribution of testing dataset based on target variable (Exited)
y_test.value_counts(normalize=True)

**Insights:**

Around 80% of the observations are for customers who did not leave the bank services (Exited == 0), so the dataset is imbalanced. We can train the model on the existing imbalanced data first and then try to rebalance it (for example with over/under sampling or class weights) and train the model again to see if recall can be improved without underfitting or overfitting the model.

## **Normalizing training and testing dataset using MinMaxScaler**

In [None]:
# Check head of independent training dataset 
X_train.head()

In [None]:
# Print feature name of independent training dataset
X_train.columns

In [None]:
# Import required package to normalize training and testing dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Fit the scaler on the training split only and reuse the fitted scaler for the test split.
# Fitting a second scaler on the test split would map the two splits with different min/max
# values, so the same raw value would be encoded differently at training and prediction time.
# Test values outside the training range can therefore fall slightly outside [0, 1].
featureScaler = MinMaxScaler()

# Keep the original row index so the scaled features stay aligned with y_train / y_test
X_train = pd.DataFrame(featureScaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(featureScaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Check head of training dataset to make sure features are normalized
X_train.head()

In [None]:
# Check head of testing dataset to make sure features are normalized
X_test.head()

In [None]:
# Print columns of independent training dataset
X_train.columns

In [None]:
# Print shape of independent training dataset
X_train.shape

## **Utility class**

In [None]:
# Import tensorflow and metrics package
import tensorflow as tf
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report

In [None]:
# Create list of metrics
# One entry is appended per model run; the lists are combined into the metrics table later.
# NOTE: LearningRateList / epochsList / TechniqueList / batchSizeList grow in ANNModel.__init__,
# TrainingAccuracyList / TestAccuracyList grow in evaluateANNModel and recallList / precisionList
# grow in printConfusionMatrix, so each of those must run exactly once per ANNModel instance
# for the lists to stay the same length.
recallList = []
precisionList = []
TestAccuracyList = []
TrainingAccuracyList = []
epochsList = []
TechniqueList = []
LearningRateList = []
batchSizeList = []

# Defining ANN Model class
class ANNModel:
    '''
    Wraps a Keras model for binary classification: compiles and trains it, plots the loss
    curves, evaluates it on the train/test data and reports classification metrics.
    Run metadata and metrics are appended to the module-level metric lists.
    '''
    def __init__(self, techniqueName, annModel, X_train, y_train, X_test, y_test, learning_rate, epochs, batchSize = 32):
        '''
        Store the model, the datasets and the training hyper-parameters.

        Parameters:
            techniqueName : label of this run, appended to TechniqueList
            annModel      : model object providing compile / fit / evaluate / predict
            X_train, y_train, X_test, y_test : training and testing features and targets
            learning_rate : learning rate passed to the Adam optimizer
            epochs        : number of training epochs
            batchSize     : batch size used for training (default 32)

        Side effect: appends learning_rate, epochs, techniqueName and batchSize to the
        module-level metric lists.
        '''
        self.modelMetrics = None    # History object returned by fit()
        self.y_pred = None          # predicted probabilities, rounded to 2 decimals
        self.y_prediction = []      # predicted class labels (0 / 1)

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

        self.learning_rate = learning_rate
        self.epochs = epochs
        self.model = annModel
        self.batchSize = batchSize

        LearningRateList.append(self.learning_rate)
        epochsList.append(self.epochs)
        TechniqueList.append(techniqueName)
        batchSizeList.append(self.batchSize)

    def trainANNModel(self, classWeight = None):
        '''
        Compile the model (binary cross-entropy loss, Adam optimizer, accuracy metric) and
        train it on the training data, holding out 20% of it for validation.

        Parameters:
            classWeight : optional dict mapping class label to loss weight; None (the default)
                          trains without class weighting
        '''
        self.model.compile(loss='binary_crossentropy',
                           optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
                           metrics=['accuracy'])
        # class_weight=None is the unweighted case, so a single fit call covers both situations
        self.modelMetrics = self.model.fit(self.X_train, self.y_train,
                                           epochs=self.epochs,
                                           validation_split=0.2,
                                           verbose=0,
                                           batch_size=self.batchSize,
                                           class_weight=classWeight)

    def plotLossFunction(self):
        '''
        Plot the training and validation loss over the epochs.
        Requires trainANNModel to have been called first.
        '''
        hist = pd.DataFrame(self.modelMetrics.history)
        hist['epoch'] = self.modelMetrics.epoch

        plt.plot(hist['loss'])
        plt.plot(hist['val_loss'])
        plt.legend(("train" , "valid") , loc =0)
        plt.xlabel('Epoch---->')
        plt.ylabel('Loss---->')

    def evaluateANNModel(self):
        '''
        Print loss/accuracy on the training and testing data, append both accuracies to the
        module-level lists and predict class labels for the testing data using 0.5 as threshold.
        '''
        loss, acc = self.model.evaluate(self.X_train, self.y_train, verbose=0)
        print('Training Accuracy: %.3f'  % acc)
        print('Training Loss: %.3f' % loss)
        TrainingAccuracyList.append(acc)

        loss, acc = self.model.evaluate(self.X_test, self.y_test, verbose=0)
        print('Test Accuracy: %.3f'  % acc)
        print('Test Loss: %.3f' % loss)
        TestAccuracyList.append(acc)

        probabilities = np.asarray(self.model.predict(self.X_test))

        # Rounded probabilities are kept for inspection only
        self.y_pred = np.round(probabilities, 2)

        # Threshold the raw probabilities: rounding first would turn e.g. 0.504 into 0.50 and
        # classify it as 0. Rebuilding the list (instead of appending) keeps it the same length
        # as y_test when this method is called more than once.
        self.y_prediction = (probabilities.ravel() > 0.5).astype(int).tolist()

    def printConfusionMatrix(self):
        '''
        Print recall, precision and the classification report for the testing data, append
        recall and precision to the module-level lists and draw the confusion matrix as a heatmap.
        Requires evaluateANNModel to have been called first.
        '''
        recallScore = np.round(recall_score(self.y_test.values, self.y_prediction, zero_division = 0), 2)
        recallList.append(recallScore)

        precisionScore = np.round(precision_score(self.y_test.values, self.y_prediction, zero_division = 0), 2)
        precisionList.append(precisionScore)

        print('Recall Score : {0}' .format(recallScore))
        print('Precision Score : {0}' .format(precisionScore))

        print(classification_report(self.y_test, self.y_prediction))

        cm = tf.math.confusion_matrix(labels=self.y_test,predictions=self.y_prediction)

        plt.figure(figsize = (5,3))
        sns.heatmap(cm, annot=True, fmt='d')
        plt.xlabel('Predicted')
        plt.ylabel('Truth')

# **Train ANN Model on Unbalance training dataset and Predict the result**

In [None]:
# Create keras Sequential model:
# Dense(256, relu) -> Dropout(0.2) -> Dense(32, tanh) -> Dropout(0.2) -> Dense(16, relu) -> Dense(1, sigmoid).
# The single sigmoid output unit yields one probability per observation.
model = tf.keras.models.Sequential ([tf.keras.layers.Dense(256, input_dim=X_train.shape[1],activation='relu'),
                                     tf.keras.layers.Dropout(0.2),   # Dropout (rate 0.2) as regularization to reduce overfitting
                                     tf.keras.layers.Dense(32, activation='tanh'),
                                     tf.keras.layers.Dropout(0.2), # Dropout (rate 0.2) as regularization to reduce overfitting
                                     tf.keras.layers.Dense(16, activation='relu'),
                                     tf.keras.layers.Dense(1, activation='sigmoid')])

# Creating ANNModel object (this also records technique name, learning rate, epochs and batch size in the metric lists)
trainANNModel_1 = ANNModel('ANN Unbalanced', model, X_train, y_train, X_test, y_test, learning_rate = 0.0001, epochs = 90, batchSize=30)

# Train ANN Model on training dataset (no class weights, i.e. on the imbalanced data as-is)
trainANNModel_1.trainANNModel()

# Plot training and validation loss over the epochs
trainANNModel_1.plotLossFunction()

# Evaluate model on training and testing dataset and predict the test results using 0.5 as threshold
trainANNModel_1.evaluateANNModel()

# Print recall, precision, classification report and Confusion Matrix
trainANNModel_1.printConfusionMatrix()

**Improvements/Insights:**

The recall of the minority class is very low. This suggests that the model is biased towards the majority class, so it is not the best model.
Now, let's try class weights to counter the class imbalance and compare the accuracy and recall as part of the improvement.

# **Improvement in Models by balancing dataset using Cost sensitive neural network**

## **Train ANN Model on by applying class weight to balance training dataset and Predict the result**

In [None]:
# Create keras Sequential model (same architecture as the unbalanced run):
# Dense(256, relu) -> Dropout(0.2) -> Dense(32, tanh) -> Dropout(0.2) -> Dense(16, relu) -> Dense(1, sigmoid)
model = tf.keras.models.Sequential ([tf.keras.layers.Dense(256, input_dim=X_train.shape[1],activation='relu'),
                                     tf.keras.layers.Dropout(0.2),   # Dropout (rate 0.2) as regularization to reduce overfitting
                                     tf.keras.layers.Dense(32, activation='tanh'),
                                     tf.keras.layers.Dropout(0.2), # Dropout (rate 0.2) as regularization to reduce overfitting
                                     tf.keras.layers.Dense(16, activation='relu'),
                                     tf.keras.layers.Dense(1, activation='sigmoid')])

# Creating ANNModel object (this also records technique name, learning rate, epochs and batch size in the metric lists)
trainANNModel_2 = ANNModel('ANN Balanced', model, X_train, y_train, X_test, y_test, learning_rate = 0.0001, epochs = 50, batchSize=30)

# Define class weight: observations with Exited value 1 are the minority (about 20%, versus about 80% with Exited value 0).
# Weight the minority class (Exited value 1) 3 times as heavily as the majority class in the loss.
classWeight = {0 : 1, 1: 3}

# Train ANN Model on training dataset using the class weights defined above
trainANNModel_2.trainANNModel(classWeight = classWeight)

# Plot training and validation loss over the epochs
trainANNModel_2.plotLossFunction()

# Evaluate model on training and testing dataset and predict the test results using 0.5 as threshold
trainANNModel_2.evaluateANNModel()

# Print recall, precision, classification report and Confusion Matrix
trainANNModel_2.printConfusionMatrix()

# **Model Metrics**

In [None]:
# Assemble the per-run metric lists into one summary table (one row per trained model);
# the dict keys define the column order
metrics = pd.DataFrame({
    'Model Technique': TechniqueList,
    'Learning Rate': LearningRateList,
    'Epochs': epochsList,
    'Training Accuracy': TrainingAccuracyList,
    'Testing Accuracy': TestAccuracyList,
    'Batch Size': batchSizeList,
    'Recall': recallList,
    'Precision': precisionList,
})
metrics

# **Analysis based on Metrics:**
1) The model should be selected based on recall along with precision and accuracy, because the goal of this analysis is to predict whether a customer will leave the bank services in the future.
Based on the above metrics, we should use the cost-sensitive ANN model, where recall is better along with precision and accuracy.

# **Suggestion to Bank to reduce number of customers leaving bank services:**

In [None]:
# Determine the parameters the Bank needs to work on to reduce customers leaving bank services:
# share of Exited values among adult female customers owning 3 or 4 products
isFemale = data.Gender == 'Female'
isAdult = data.AgeGroup == 'Adult'
ownsThreeOrFourProducts = data.NumOfProducts.isin([3, 4])

data[isFemale & isAdult & ownsThreeOrFourProducts].Exited.value_counts(normalize = True)

**Insights:**

The bank should focus its retention efforts on the segment identified above, i.e. Female customers in the Adult age group (17 - 65) having NumOfProducts 3 or 4, to reduce the number of customers leaving bank services.