# Import Packages, Functions & libraries 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

#import libraries for visualization, processing and modeling
# %clear
# %reset

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.

import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

 # Import Datasets

In [None]:
# Load the customer churn data; the first CSV column is used as the row index.
bk_customers = pd.read_csv(r"../input/predicting-churn-for-bank-customers/Churn_Modelling.csv",index_col=0)

# Keep an untouched copy of the raw data: later cells overwrite bk_customers
# with engineered, encoded and scaled columns.
bk_orginal = bk_customers.copy()

# Get an overview of the data. A notebook only renders the value of the LAST
# expression in a cell, so the preview has to come after the assignment above.
bk_customers.head()


In [None]:
# Quick structural overview of the loaded frame: one labelled line per check.
for caption, details in (
    ("Size of dataset: ", bk_customers.shape),                         # (rows, columns)
    ("Features of the dataset: ", bk_customers.columns.tolist()),      # column names
    ("Variables with missing values: ", bk_customers.isnull().sum()),  # null count per column
    ("Unique values for each variable: ", bk_customers.nunique()),
    ("Check data type of the variables: ", bk_customers.dtypes),
):
    print(caption, details)

# Data Manipulation

In [None]:
#Carry out some data manipulation

#Create categorical variables out of tenure, credit score and age variables

# Distinct values of the three columns that are binned below.
# The sorted lists are neither stored nor printed.
sorted(bk_customers["CreditScore"].unique())
sorted(bk_customers["Tenure"].unique())
sorted(bk_customers["Age"].unique())

def cust_tenure(bk_customers):
    """Return the tenure bucket label for a single customer row.

    Args:
        bk_customers: a row-like object (e.g. one DataFrame row) that can be
            indexed with "Tenure" to obtain a number.

    Returns:
        "Tenure_0-2", "Tenure_2-4", "Tenure_4-6" or "Tenure_gt_6"; None when
        no comparison holds (e.g. the value is NaN).
    """
    years = bk_customers["Tenure"]

    # Guard clauses, checked from the lowest bucket upwards; upper bounds are inclusive.
    if years <= 2:
        return "Tenure_0-2"   # new customers
    if years <= 4:
        return "Tenure_2-4"   # medium tenure with the bank
    if years <= 6:
        return "Tenure_4-6"   # long tenure with the bank
    if years > 6:
        return "Tenure_gt_6"  # very long tenure with the bank
    return None
    
#Age categories

def cust_age(bk_customers):
    """Return the age bucket label for a single customer row.

    Args:
        bk_customers: a row-like object (e.g. one DataFrame row) that can be
            indexed with "Age" to obtain a number.

    Returns:
        "Age_18-30", "Age_30-40", "Age_40-60" or "Age_gt_60"; None when no
        comparison holds (e.g. the value is NaN).
    """
    age = bk_customers["Age"]

    # Guard clauses, checked from the youngest bucket upwards; upper bounds are inclusive.
    if age <= 30:
        return "Age_18-30"  # youthful customers
    if age <= 40:
        return "Age_30-40"  # mid-age customers
    if age <= 60:
        return "Age_40-60"  # older customers
    if age > 60:
        return "Age_gt_60"  # pensioners
    return None
    
# Credit score categories (based on the FICO score bands, which range from 300 to 850)

def cred_score(bk_customers):
    """Return the credit-score band label for a single customer row.

    The bands cover the 300-850 range with inclusive bounds:
    300-579 "Very Poor", 580-669 "Fair", 670-739 "Good",
    740-799 "Very Good", 800-850 "Exceptional".

    Args:
        bk_customers: a row-like object (e.g. one DataFrame row) that can be
            indexed with "CreditScore" to obtain a number.

    Returns:
        The band label, or None when the score is outside 300-850 or is NaN.
    """
    score = bk_customers["CreditScore"]

    # Out-of-range and NaN scores get no label. The lower bound is inclusive so
    # that the minimum score of 300 lands in "Very Poor" instead of falling
    # through every branch.
    if not (300 <= score <= 850):
        return None

    if score <= 579:
        return "Very Poor"
    if score <= 669:
        return "Fair"
    if score <= 739:
        return "Good"
    if score <= 799:
        return "Very Good"
    return "Exceptional"

# Derive the three categorical columns by running each binning helper on every
# row (axis=1 passes one row at a time to the helper).
bk_customers["tenure_cat"] = bk_customers.apply(cust_tenure, axis=1)
bk_customers["age_cat"] = bk_customers.apply(cust_age, axis=1)
bk_customers["credit_cat"] = bk_customers.apply(cred_score, axis=1)


In [None]:
# Sanity-check the labels produced by the binning step.
# Only the last expression's value is the cell result.
sorted(bk_customers["credit_cat"].unique())
sorted(bk_customers["tenure_cat"].unique())
sorted(bk_customers["age_cat"].unique())

# Exploratory Data Analysis


In [None]:
#Exploratory Data Analysis

#1. Explore data types

# Partition the columns by role: identifier, target, categorical, numerical.
Id_var = ['CustomerId']
target_var = ["Exited"]

# Any column with fewer than 6 distinct values counts as categorical
# (the target itself is excluded).
distinct_counts = bk_customers.nunique()
cat_vars = [
    name
    for name in distinct_counts[distinct_counts < 6].keys().tolist()
    if name not in target_var
]

# Whatever is left after removing categorical, target and identifier columns
# is treated as numerical; the free-text surname is dropped from that list.
num_vars = [
    name
    for name in bk_customers.columns
    if name not in cat_vars + target_var + Id_var
]
num_vars.remove('Surname')

# Two row subsets: customers that left (Exited == 1) and customers that stayed.
churn = bk_customers.loc[bk_customers["Exited"] == 1]
not_churn = bk_customers.loc[bk_customers["Exited"] == 0]

#2. Descriptive statistics (Illustrations)

# Slice names, in the same order as the counts below (churned first).
labels = "Churned", "Retained"

# Number of customers per churn flag: Exited == 1, then Exited == 0.
exited_flag = bk_customers['Exited']
size = [exited_flag[exited_flag == flag].count() for flag in (1, 0)]

# Pie chart showing each group's share as a whole-number percentage.
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(size, labels=labels, colors=['#098be8', '#06c739'], autopct='%1.0f%%', startangle=90)
ax.axis('equal')  # equal aspect ratio keeps the pie circular
ax.set_title("Churned Vs Non-churned Customers", size=20)
plt.show()

In [None]:
#Take a look at the distribution of variables on churn status

#Illustrations of distribution of categorical

#Bar graph function

def plot_bars(column, df):
    """Draw a count plot of `column`, split by the 'Exited' flag, on a new figure.

    Args:
        column: name of the column in `df` to put on the x axis.
        df: data frame holding `column` and an 'Exited' column.

    Returns:
        The object returned by `sns.countplot` (the caller sets its title).
    """
    # One fresh 10x8 figure per call so consecutive plots do not overlap.
    _, axes = plt.subplots(1, 1, figsize=(10, 8))
    return sns.countplot(data=df, x=column, hue='Exited', ax=axes)

# One count plot per categorical variable, titled with the variable name.
for cat_col in cat_vars:
    bars_ax = plot_bars(cat_col, bk_customers)
    bars_ax.set_title(f"{cat_col} distribution in customer attrition")


In [None]:
#Illustrations of distribution of numerical variables

#Boxplot function

def plot_box(column, df):
    """Draw a box plot of `column` for each value of 'Exited' on a new figure.

    Args:
        column: name of the column in `df` to put on the y axis.
        df: data frame holding `column` and an 'Exited' column.

    Returns:
        The object returned by `sns.boxplot` (the caller sets its title).
    """
    # One fresh 10x8 figure per call so consecutive plots do not overlap.
    _, axes = plt.subplots(1, 1, figsize=(10, 8))
    return sns.boxplot(data=df, x='Exited', y=column, hue='Exited', ax=axes)

# One box plot per numerical variable, titled with the variable name.
for num_col in num_vars:
    box_ax = plot_box(num_col, bk_customers)
    box_ax.set_title(f"{num_col} distribution in customer attrition")

In [None]:
# Mean number of products for every (tenure bucket, churn flag) pair,
# returned as a flat frame with the group keys as ordinary columns.
avg_num_pdts = (
    bk_customers
    .groupby(["tenure_cat", "Exited"], as_index=False)[["NumOfProducts"]]
    .mean()
)

plot = sns.barplot(data=avg_num_pdts, x="tenure_cat", y="NumOfProducts", hue='Exited')

# Customer counts per credit band, split by churn flag, one panel per
# IsActiveMember value. Note that `plot` is rebound to the catplot result.
plot = sns.catplot(
    data=bk_customers,
    kind="count",
    x="credit_cat",
    hue='Exited',
    col="IsActiveMember",
)

In [None]:
## Engineered proxy variables

# Account balance divided by estimated salary (stored as 'salary_bal_ratio').
bk_customers['salary_bal_ratio'] = bk_customers['Balance'].div(bk_customers['EstimatedSalary'])

# Compare the ratio between churned and retained customers; the y axis is
# limited to 0-5, so larger values fall outside the visible range.
sns.boxplot(data=bk_customers, x='Exited', y='salary_bal_ratio', hue='Exited')
plt.ylim(0, 5)
plt.show()


In [None]:
# Tenure normalised by the customer's age (stored as 'age_tenure').
bk_customers['age_tenure'] = bk_customers['Tenure'].div(bk_customers['Age'])

# Distribution of the ratio per churn flag, y axis limited to 0-1.
sns.boxplot(data=bk_customers, x='Exited', y='age_tenure', hue='Exited')
plt.ylim(0, 1)
plt.show()


In [None]:
# Credit score divided by the customer's age (stored as 'age_credit_score'),
# intended to capture credit habits relative to age.
bk_customers['age_credit_score'] = bk_customers['CreditScore'].div(bk_customers['Age'])

# Distribution of the ratio per churn flag.
sns.boxplot(data=bk_customers, x='Exited', y='age_credit_score', hue='Exited')
plt.show()

In [None]:
#Insights from explanatory data analysis:

#1. Customers who churned had a median balance slightly higher than that of the customers the bank retained.
#2. Customers with very low credit scores churned.
#3. The median age of customers who churned is around 45 years, compared to 35 for those the bank retained - an interesting trend.
#4. Tenure and estimated salary show no significant difference between customers who churned and those who didn't.
#5. Germany has a higher customer attrition ratio than France and Spain.
#6. There is no significant difference between retained and churned customers with regard to gender.
#7. Customers using 3 or more products churned.
#8. Customers with credit cards are more likely to churn.
#9. Inactive members are more likely to churn.
#10. The average number of products is almost the same across all tenure groups.
#11. Customers with a higher balance-to-estimated-salary ratio are more likely to churn.

#Questions/Assumptions
#1. isActiveMember (No) - we assume this implies mere inactivity but still a customer of the bank that could resume activity at a later point in time.

# Data Pre-processing


In [None]:
#Data preprocessing

# Drop the variables that are not used for prediction: the identifier, the
# surname and the binned helper columns created for the exploratory plots.
list_vars = ['CustomerId','Surname','tenure_cat', 'age_cat', 'credit_cat']

bk_customers = bk_customers.drop(columns = list_vars)

# Recompute the variable groups, since columns were dropped and new ratio
# columns were added after the groups were first built.
n_unique   = bk_customers.nunique()
cat_vars   = n_unique[n_unique < 6].keys().tolist()
cat_vars   = [x for x in cat_vars if x not in target_var]
num_vars   = [x for x in bk_customers.columns if x not in cat_vars + target_var + Id_var]

# Binary columns (exactly 2 distinct values); this includes the target column.
bin_vars = n_unique[n_unique == 2].keys().tolist()

#Columns more than 2 values
# multi_vars = [i for i in cat_vars if i not in bin_vars]

# Encode every binary column as integers 0/1.
le = LabelEncoder()
for i in bin_vars:
    bk_customers[i] = le.fit_transform(bk_customers[i])

# One-hot encode the only multi-valued string column.
bk_customers = pd.get_dummies(data = bk_customers,columns = ['Geography'] )

# Standardise the numerical columns.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into training; consider fitting on the
# training rows only.
std = StandardScaler()
scaled = std.fit_transform(bk_customers[num_vars])
# fit_transform returns a bare array, so the row labels must be re-attached.
# Without index=..., the new frame gets a fresh 0..n-1 index that does not
# match bk_customers (indexed by the first CSV column), and the index-based
# merge below pairs scaled features with the wrong rows and produces NaNs.
scaled = pd.DataFrame(scaled,columns=num_vars,index=bk_customers.index)

# Replace the raw numerical columns with their scaled versions; the scaled
# columns end up after the categorical ones.
bk_customers = bk_customers.drop(columns = num_vars)
bk_customers = bk_customers.merge(scaled,left_index=True,right_index=True,how = "left")

# Safety net: fill any remaining NaNs with the column mean.
bk_customers = bk_customers.fillna(bk_customers.mean())


In [None]:
## Variable summary
# Descriptive statistics for every non-identifier column, one row per variable
# (the variable name becomes the 'index' column after reset_index).
summary = (
    bk_customers.loc[:, [col for col in bk_customers.columns if col not in Id_var]]
    .describe()
    .T
    .reset_index()
)

summary

In [None]:
# Pairwise correlation between all columns of the prepared frame.
corr = bk_customers.corr()

# Heatmap of the correlation matrix, both axes labelled with the column names.
tick_names = corr.columns
heatmap = sns.heatmap(data=corr, xticklabels=tick_names, yticklabels=tick_names)


# Review Dataset for prediction

In [None]:
#visualize dataset on a few variables

# Split the untouched raw data by churn flag, using the raw frame's own
# 'Exited' column. Building the mask from the transformed bk_customers frame
# only works while both frames share exactly the same index.
df0 = bk_orginal[bk_orginal["Exited"] == 0]  # retained customers
df1 = bk_orginal[bk_orginal["Exited"] == 1]  # churned customers

##Observations
#1. The dataset is imbalanced (~4:1), which is not good for prediction work.
#Possible solutions: 1) Resample: include all data points of 'churned' and randomly sample an equal number of 'not churned'
#2) Use ROC/AUC to evaluate performance
#3) Use cross-validation

# Plot age against estimated salary for both groups to help pick suitable
# algorithms: retained customers as green '+', churned customers as red '.'.
plt.scatter(x=df0['Age'], y=df0['EstimatedSalary'], color='green', marker='+')
plt.scatter(x=df1['Age'], y=df1['EstimatedSalary'], color='red', marker='.')
plt.xlabel('Age')
plt.ylabel('EstimatedSalary')

#There is no clear boundary between churned and not churned, so SVM may not be a good candidate.
#It's a fairly small dataset, so logistic regression is a good candidate.
#Other candidates: KNN / decision tree
#Random forest classifier

# Build model


In [None]:
#Build models

#import libraries required/needed

import plotly.graph_objs as go#visualization
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score 
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
#from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import tree
from graphviz import Source
from IPython.display import SVG,display
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree


# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
train, test = train_test_split(bk_customers, test_size=.25, random_state=142)

# Target column and the predictors (every other column).
tgr_var = ['Exited']
ind_var = [col for col in bk_customers.columns if col not in tgr_var]

# Feature matrices and single-column target frames for both partitions.
train_X, train_Y = train[ind_var], train[tgr_var]
test_X, test_Y = test[ind_var], test[tgr_var]

In [None]:
# Collect one untrained instance of every candidate model.
classifiers = []

# Logistic regression model
logit = LogisticRegression(solver='liblinear', random_state = 12)
classifiers.append(logit)

# K-Nearest Neighbor
knn = KNeighborsClassifier()
classifiers.append(knn)

# Support vector machine.
# The estimator gets its own name: assigning it to `svm` would overwrite the
# imported sklearn `svm` module, and re-running this cell would then fail
# because an SVC instance has no `SVC` attribute.
svc = svm.SVC()
classifiers.append(svc)

# Decision tree classifier
dc_tree = tree.DecisionTreeClassifier()
classifiers.append(dc_tree)

# Random forest classifier
rfrorest = RandomForestClassifier()
classifiers.append(rfrorest)


In [None]:
# Train every candidate on the same split, then report accuracy, precision,
# recall and the confusion matrix on the held-out test rows.
for clf in classifiers:
    # ravel() flattens the single-column target frame to the 1-D array passed to fit.
    clf.fit(train_X, train_Y.values.ravel())
    y_pred = clf.predict(test_X)

    accuracy = accuracy_score(test_Y, y_pred)
    print(f"Accuracy of {clf!s} is {accuracy!s}")

    precision = precision_score(test_Y, y_pred)
    print(f"Precision of {clf!s} is {precision!s}")

    recall = recall_score(test_Y, y_pred)
    print(f"Recall of {clf!s} is {recall!s}")

    cm = confusion_matrix(test_Y, y_pred)
    print(f"Confusion Matrix of {clf!s} is {cm!s}")

#TODO:Add a visualization plot of the scores.

In [None]:
# Baseline logistic regression model

# Create the estimator and train it in one step (fit returns the estimator).
logit = LogisticRegression(solver='liblinear', random_state=12).fit(
    train_X, train_Y.to_numpy().ravel()
)

# Inspect the fitted attributes. The values quoted below are the outputs
# recorded in the original notebook run.

logit.classes_  # the class labels seen during fit; confirms a binary problem
#array([0, 1])

logit.intercept_  # the intercept (bias) term of the model
#array([-0.17604356])

logit.coef_  # the coefficients (slopes), one per feature
#array([[-0.57983499, -0.24361904, -0.04307553, -0.79347376, -0.37318144,
#         0.52263995, -0.32550208,  0.0569691 ,  0.01272032,  0.01889203,
#         0.02319488,  0.02325823]])

# Predict the class of every test row for the evaluation cells that follow.
y_pred_logit = logit.predict(test_X)

In [None]:
## Model Performance

# Confusion matrix of the logistic regression predictions
# (rows = actual class, columns = predicted class).
conf_matrix = confusion_matrix(test_Y,y_pred_logit)

ax = plt.subplot()
# fmt="d" prints the cell counts as plain integers; the default annotation
# format would render them in scientific notation (e.g. 2e+03).
sns.heatmap(conf_matrix, annot=True, fmt="d", ax = ax)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('Actual labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(["Predicted Not churn","Predicted Churn"])
ax.yaxis.set_ticklabels(["Actual Not churn","Actual Churn"])


#Reading the matrix with churn (Exited = 1) as the positive class:
#2000 customers that didn't churn and were correctly identified by the algorithm #TN
#36 customers that churned and were correctly identified by the algorithm #TP
#420 customers that churned but the algorithm said they didn't. #FN
#39 customers that didn't churn but the algorithm said they did. #FP

In [None]:
# Per-class precision / recall / F1 for the logistic regression predictions.
print(f"\n Classification report :\n {classification_report(test_Y, y_pred_logit)!s}")

# Figures recorded in the original notebook run:
#Weighted avg Precision is 0.76/ recall is 0.82 and f1 score 0.76

#F1 Score: 0.1348314606741573 - not super impressive

# Share of test rows that were classified correctly.
print(f"Accuracy   Score :  {accuracy_score(test_Y, y_pred_logit)!s}")

#Accuracy   Score :  0.8152

In [None]:
### KNN classifier

# Create a default K-nearest-neighbours model and train it in one step
# (fit returns the estimator).
knn = KNeighborsClassifier().fit(train_X, train_Y.to_numpy().ravel())

# Predict the class of every test row.
y_pred_knn = knn.predict(test_X)

In [None]:
## Model Performance

# Confusion matrix of the KNN predictions
# (rows = actual class, columns = predicted class).
conf_matrix_knn = confusion_matrix(test_Y,y_pred_knn)

ax = plt.subplot()
# fmt="d" prints the cell counts as plain integers; the default annotation
# format would render them in scientific notation (e.g. 1.9e+03).
sns.heatmap(conf_matrix_knn, annot=True, fmt="d", ax = ax)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('Actual labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(["Predicted Not churn","Predicted Churn"])
ax.yaxis.set_ticklabels(["Actual Not churn","Actual Churn"])


#Reading the matrix with churn (Exited = 1) as the positive class:
#1900 customers that didn't churn and were correctly identified by the algorithm #TN
#110 customers that churned and were correctly identified by the algorithm #TP
#340 customers that churned but the algorithm said they didn't. #FN
#150 customers that didn't churn but the algorithm said they did. #FP

In [None]:
# Per-class precision / recall / F1 for the KNN predictions.
print(f"\n Classification report :\n {classification_report(test_Y, y_pred_knn)!s}")

# Figures recorded in the original notebook run:
#Weighted avg Precision is 0.77/ recall is 0.80 and f1 score 0.78

# Share of test rows that were classified correctly.
print(f"Accuracy   Score :  {accuracy_score(test_Y, y_pred_knn)!s}")

#Accuracy   Score :  0.8024


# Advanced Model performance comparisions

In [None]:
# ROC AUC - logistic regression
# ROC AUC is a ranking metric and needs a continuous score per row, so use the
# predicted probability of the positive class (column 1 = Exited == 1) rather
# than the hard 0/1 predictions, which collapse the curve to a single point.
y_score_logit = logit.predict_proba(test_X)[:, 1]
model_roc_auc = roc_auc_score(test_Y,y_score_logit)
print ("Area under curve : ",model_roc_auc,"\n")

# The original notebook recorded 0.5317859671333294 here, but that figure was
# computed from hard class labels and is not comparable with this score.
#TODO: Visualize

In [None]:
# ROC AUC - KNN
# Score with the predicted probability of the positive class (column 1 =
# Exited == 1); hard 0/1 predictions would collapse the ROC curve to a single
# point.
y_score_knn = knn.predict_proba(test_X)[:, 1]
model_roc_auc_knn = roc_auc_score(test_Y,y_score_knn)
print ("Area under curve : ",model_roc_auc_knn,"\n")

# The original notebook recorded 0.6447216358146673 here, but that figure was
# computed from hard class labels and is not comparable with this score.


In [None]:
#plot roc curve
# NOTE: this cell is disabled. Two continuation lines of the add_trace call had
# been left uncommented, which made the cell fail with an IndentationError;
# they are commented out here like the rest of the snippet.
# TODO: `fpr` and `tpr` are not computed in the visible notebook; they must be
# obtained (e.g. from roc_curve) before this plot is re-enabled.
#fig2 = go.Figure(data = go.Scatter(x = fpr,y = tpr,
 #                       name = "Roc : " + str(model_roc_auc),
 #                       line = dict(color = ('rgb(22, 96, 167)'),width = 2)))
#
#fig2.add_trace(go.Scatter(x = [0,1],y=[0,1],
#                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
#                        dash = 'dot')))
#fig2["layout"]["xaxis"].update(dict(title = "false positive rate"))
#fig2["layout"]["yaxis"].update(dict(title = "true positive rate"))
#fig2["layout"]["title"].update(dict(text = "Confusion Matrix"))


#fig2.show()

In [None]:
#Evaluate model on precision and recall

#print ("\n Classification report :\n",classification_report(test_Y,y_pred_dt))

#Precision on 1's is 0.44 and recall is 0.51

#print ("Accuracy   Score : ",accuracy_score(test_Y,y_pred_dt))

#Accuracy   Score :  0.7916

#confusion matrix
#conf_matrix1 = confusion_matrix(test_Y,y_pred_dt)

#visualize confusion matrix

#fig4 = go.Figure(data = go.Heatmap(z = conf_matrix1 ,
    #                    x = ["Not churn","Churn"],
   #                     y = ["Not churn","Churn"],
  #                      showscale  = False,
 #                       name = "matrix1"))
                
#fig4.show()
#roc_auc_score
#model_roc_auc1 = roc_auc_score(test_Y,predictions) 
#print ("Area under curve : ",model_roc_auc1,"\n")



In [None]:
#Comparing all models on accuray roc/auc

#plt.barh(y_pos, performance, align='center', alpha=0.5)
#plt.yticks(y_pos, objects)
#plt.xlabel('Usage')
#plt.title('Model Performance')

#plt.show()

In [None]:
# Pair each logistic-regression coefficient with its feature name to get a
# rough view of feature influence.

coefficients = pd.DataFrame(logit.coef_.ravel())  # one row per coefficient
column_df = pd.DataFrame(ind_var)                 # feature names, same order

# Both frames carry the same default 0..n-1 index, so placing them side by
# side lines each coefficient up with its feature.
coef_sumry = pd.concat([coefficients, column_df], axis=1)
coef_sumry.columns = ["coefficients", "features"]

# Largest positive coefficient first.
coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

# Bar chart of the coefficients, coloured by their value.
coef_bars = go.Bar(
    x=coef_sumry["features"],
    y=coef_sumry["coefficients"],
    name="coefficients",
    marker=dict(
        color=coef_sumry["coefficients"],
        colorscale="Picnic",
        line=dict(width=.6, color="black"),
    ),
)
fig3 = go.Figure([coef_bars])
fig3.update_layout(title_text="Feature Importance")

fig3.show()

In [None]:

##The precision is not very impressive; however, it could be improved by retraining the model with additional data.
#Note: I only used baseline models; different variations of the models could have been used for better results.
##Note: I used a lot of online resources while attempting this exercise.


#Additional steps that could have been done:
#Cross-validation
#Using a validation set to fine-tune the models