# Predicting Customer Churn at a Bank

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

### Importing libraries 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Disable pandas' display truncation: with both limits set to None, every row
# and every column of a DataFrame/Series is rendered when it is displayed or printed.
# NOTE(review): this makes any cell that prints a large object emit all of its rows.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# For the predictive models
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBSklearn
from xgboost import XGBClassifier as XGB
import lightgbm as lgb

# Modules intended for silencing warnings.
# NOTE(review): no warning filter is installed in the visible notebook, so these
# imports currently have no effect on what is shown.
import sys
import warnings

In [None]:
#from sklearn.cross_validation import train_test_split # For splitting the data into training and testing
from sklearn.neighbors import KNeighborsClassifier # K neighbors classification model
from sklearn.naive_bayes import GaussianNB # Gaussian Naive bayes classification model
from sklearn.svm import SVC # Support Vector Classifier model
from sklearn.tree import DecisionTreeClassifier # Decision Tree Classifier model
from sklearn.linear_model import LogisticRegression # Logistic Regression model
from sklearn.ensemble import RandomForestClassifier # Random Forest Classifier model
from sklearn.metrics import accuracy_score # For checking the accuracy of the model

In [None]:
# Shell escape: list the contents of the Kaggle input directory.
!ls /kaggle/input

### Reading the dataset

Reading the Churn Modelling dataset to get more insights on the data provided

In [None]:
# Load the bank churn CSV into the DataFrame used by the rest of the notebook.
data = pd.read_csv(
    "/kaggle/input/predicting-churn-for-bank-customers/Churn_Modelling.csv"
)

In [None]:
# Preview the first rows in order to understand the data provided.
data.head()

## Understanding the data provided and visualization.

### Some of the key questions from the data view
1. Some customers exited while still holding a balance in their account. What does this mean?
2. What does it mean for a customer to be an active member?
3. Does "exited" mean that the customer closed a single product, or that they left the bank entirely?

In [None]:
# Summarise the columns and their data types.
data.info()

#### From the Exited column, I want to understand the customers that exited and the ones that didn't exit.

In [None]:
# Count churned (Exited == 1) and retained (Exited == 0) customers.
churn_flag = data['Exited']
total_customers = len(data)
exited = int((churn_flag == 1).sum())
not_exited = int((churn_flag == 0).sum())

# Express both counts as a percentage of all customers, to one decimal place.
exited_perc = round(exited / total_customers * 100, 1)
not_exited_perc = round(not_exited / total_customers * 100, 1)

print('Number of clients that have churned: {} ({}%)'.format(exited, exited_perc))
print('Number of clients that haven\'t churned: {} ({}%)'.format(not_exited, not_exited_perc))

# Pie chart of the two groups; the "Retained" wedge is pulled out slightly.
labels = 'Exited', 'Retained'
sizes = [exited, not_exited]
explode = (0, 0.1)
fig1, ax1 = plt.subplots(figsize=(8, 6))
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax1.set_title("Proportion of customer churned and retained", size=10)
plt.show()

#### We need to understand the gender and location of those who exited and if they had a credit card or they were active members compared to those who didn't exit by visualizing the data in a bar graph


In [None]:
    # 2x2 grid of count plots, one per categorical feature, split by Exited.
    # Panels are filled row by row in the order the columns are listed.
    fig, axarr = plt.subplots(2, 2, figsize=(20, 12))
    categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
    for column, panel in zip(categorical_columns, axarr.flatten()):
        sns.countplot(x=column, hue='Exited', data=data, ax=panel)

#### From the visuals, most customers are from France, though most churned customers are from Germany. Most customers are male, but most churned customers are female. Customers who exit are mostly inactive members, although some active members exit too.

### Age count for males and females
#### The data covers different age groups, so I counted the customers at each age to see the age range into which most of them fall.

In [None]:
# Number of customers for every (Age, Gender) pair, counted via the RowNumber column.
counts = data.groupby(['Age', 'Gender'])['RowNumber'].count()
print(counts)

### Histogram for the Age for both customers who exited or didn't

In [None]:
#Stacked histogram: Age
figure = plt.figure(figsize=(15,8))
plt.hist([
        data[(data.Exited==0)]['Age'],
        data[(data.Exited==1)]['Age']
        ], 
         stacked=True, color = ['blue','r'],
         bins = 'auto',label = ['Stayed','Exited'],
         edgecolor='black', linewidth=1.2)
plt.xlabel('Age (years)')
plt.ylabel('Number of customers')
plt.legend()

#### From the histogram above, the largest number of customers are between their late 20s and 40, and most of the customers who exit are between their late 30s and 50. Knowing which age group leaves most often can help in investigating why they exit.

### Visuals for the remaining features in the dataset.

In [None]:
# 2x2 grid of stacked histograms, one per remaining numeric feature, each
# splitting customers into those who stayed (blue) and those who exited (red).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
# NOTE(review): tight_layout() below recomputes the subplot spacing, so this
# manual adjustment probably has no visible effect — confirm before removing.
fig.subplots_adjust(left=0.2, wspace=0.6)
ax0, ax1, ax2, ax3 = axes.flatten()

# (axis, DataFrame column, panel title) for each panel, in row-major order.
histogram_panels = [
    (ax0, 'CreditScore', 'Credit Score'),
    (ax1, 'Tenure', 'Tenure'),
    (ax2, 'Balance', 'Balance'),
    (ax3, 'EstimatedSalary', 'Estimated Salary'),
]

# One loop replaces four copy-pasted stanzas that differed only in axis,
# column and title.
for panel, column, title in histogram_panels:
    panel.hist(
        [
            data.loc[data.Exited == 0, column],
            data.loc[data.Exited == 1, column],
        ],
        stacked=True,
        color=['blue', 'r'],
        bins='auto',
        label=['Stayed', 'Exited'],
        edgecolor='black',
        linewidth=1.2,
    )
    panel.legend()
    panel.set_title(title)

fig.tight_layout()
plt.show()

Estimated Salary does not seem to affect the churn rate, and the customers who churn the most have credit scores between 600 and 700.

### Dropping the irrelevant data

In [None]:
# Remove the identifier-like columns that the analysis treats as irrelevant.
# errors='ignore' makes the cell safe to re-run: once the columns are gone, a
# second run is a no-op instead of raising KeyError.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')
data.head()

### Finding correlation 
Finding how well each feature correlates with the target variable, Exited.

In [None]:
# Correlate only the numeric columns. Geography and Gender have not been
# one-hot encoded yet at this point, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older versions silently skipped them).
# The correlation matrix is symmetric, so no transpose is needed.
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cbar=True)

### Conclusion:
Age, Balance, NumOfProducts, IsActiveMember and CreditScore are the features with the most noticeable correlation with the target. Every feature has at least a weak correlation with Exited, so we will use all of them to build the model.

## Building a Machine learning model

### Preparing the data

One way to convert categorical columns to numeric columns is one-hot encoding: we take our categories (France, Germany, Spain, Male, Female) and represent each one with its own column. In each column, a 1 indicates that the category applies to the current row, and a 0 indicates that it does not.

In [None]:
# One-hot encode the categorical attributes: each listed column is replaced by
# indicator columns named "<column>_<category>".
list_cat = ['Geography', 'Gender']
data = pd.get_dummies(
    data,
    prefix=list_cat,
    columns=list_cat,
)
data.head()


### Data Preprocessing
Before we predict, we need to separate the target column from the rest of the dataset: `features` holds the names of all the other columns, and `target` is the Exited column.

In [None]:
# The label to predict, and every other column name as the model inputs.
target = 'Exited'
features = [column for column in data.columns if column != target]

### Splitting the dataset into the Training set and Test set
Dividing the data into a training set and a test set. The training set will be used to train our machine learning model, and the test set will evaluate how good the model is. I will use 20% of the data for the test set and the remaining 80% for the training set.

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs. Both halves still contain the Exited column.
train, test = train_test_split(data, test_size = 0.2, random_state = 1)

In [None]:
# Show how many test customers fall into each class instead of printing every
# test row (display truncation is disabled earlier in the notebook, so printing
# the raw Series would emit all of its rows).
test[target].value_counts()

### Choosing the best predictive model:

To decide on the best predictive model, I will try Gaussian Naive Bayes, K-nearest neighbors, Support vector classifier, Decision tree classifier, Random Forest and Logistic Regression, and compare their accuracy. The reasons for choosing these models are:
1. Gaussian Naive bayes is a classification algorithm for binary and multi-class classification problems. The technique is easiest to understand when described using binary or categorical input values.
2. K-nearest neighbors is a simple, easy-to-implement supervised machine learning algorithm that can be used to solve both classification and regression problems.
3. Support vector classifier can also be used for both classification or regression challenges.
4. Decision tree classifier is a supervised machine learning method in which the data is repeatedly split according to a certain parameter.
5. Random Forest consists of a large number of individual decision trees that operate as an ensemble
6. Logistic Regression is as simple as plugging in numbers into the logistic regression equation and calculating a result when making a prediction.


In [None]:
# Candidate classifiers, all with default hyper-parameters except where noted.
# `model` and `model_names` are parallel lists: model_names[k] labels model[k].
model = [
    GaussianNB(),
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(random_state=0),  # seeded so reruns give the same tree
    RandomForestClassifier(n_estimators=5, random_state=0),
    LogisticRegression(),
]
model_names = [
    "Gaussian Naive bayes",
    "K-nearest neighbors",
    "Support vector classifier",
    "Decision tree classifier",
    "Random Forest",
    "Logistic Regression",
]
# Guard against the two lists drifting apart when a model is added or removed.
assert len(model) == len(model_names)

# Fit each classifier on the training split and report its test accuracy.
# Iterating over the pairs avoids a hard-coded model count.
for name, classifier in zip(model_names, model):
    y_pred = classifier.fit(train[features], train[target]).predict(test[features])
    # scikit-learn's convention is accuracy_score(y_true, y_pred).
    accuracy = accuracy_score(test[target], y_pred) * 100
    print(name, ":", accuracy, "%")

The ML algorithm with the highest accuracy is Random Forest, with 83.85%, so we will use this algorithm to make predictions.
To train this algorithm, we call the fit method and pass in the feature set and the corresponding target set.

In [None]:
# Final model: a random forest with 100 trees and a fixed seed, trained on the
# training split and scored on the held-out test split.
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(train[features], train[target])
y_pred = model.predict(test[features])
test_accuracy_pct = accuracy_score(test[target], y_pred) * 100
print("The accuracy is:", test_accuracy_pct, "%")

#### How well it works?
Evaluating how well the above model works using F1-score, precision, recall, and accuracy

Accuracy is the fraction of all predictions that are correct (true positives plus true negatives, divided by the total number of predictions). Precision is true positives divided by the sum of true positives and false positives. Recall is true positives divided by the sum of true positives and false negatives. F1-score is the harmonic mean of precision and recall, and is useful when you want a balance between the two.

In [None]:
# Per-class precision, recall and F1-score for the random forest's test predictions.
print(classification_report(test[target],y_pred ))  
# Overall accuracy as a fraction (not a percentage) of correctly classified test rows.
print(accuracy_score(test[target], y_pred ))

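This shows that our model correctly classifies 86.3% of the customers in the test set. Accuracy counts retained and churned customers together, so read it alongside the precision and recall reported for the churned class (label 1) above.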

#### Feature Evaluation:
To conclude, we want to see which features play the biggest role in identifying customer exit.

In [None]:
# Pair each importance score from the fitted forest with its feature name,
# then draw the (up to) 20 largest as a horizontal bar chart.
feat_importances = pd.Series(model.feature_importances_, index=features)
feat_importances.nlargest(20).plot.barh()

Age plays the biggest role in a customer's exit, followed by estimated salary, credit score, balance and number of products.

### Conclusion:
In conclusion, the model created above predicts customer churn with 86.3% accuracy, and the feature that plays the biggest role in churn is age. I believe the accuracy of this model could be improved by adding more features or by collecting relevant data from more customers.