In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input tree and echo the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the churn CSV inside the Kaggle input mount.
CHURN_CSV_PATH = '/kaggle/input/churn-modelling/Churn_Modelling.csv'
data = pd.read_csv(CHURN_CSV_PATH)

In [None]:
# Display the first two rows of the loaded frame as a quick sanity check.
data.head(2)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

### Let's first remove features that provide no predictive power, e.g. names, IDs, unique row numbers, etc.

In [None]:
# Drop the identifier-like columns (customer id, surname, row number) before modelling.
# errors='ignore' keeps this cell idempotent: re-running it after the columns are
# already gone is a no-op instead of raising KeyError.
data = data.drop(columns=['CustomerId', 'Surname', 'RowNumber'], errors='ignore')

In [None]:
# Apply the seaborn theme / font scale BEFORE creating the figure. Matplotlib reads
# rcParams (background, grid, title and label font sizes) when the Axes are created,
# so calling sns.set afterwards leaves this figure partly in the previous style.
sns.set(font_scale=1.5)
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Exit counts split by each categorical feature, one panel per feature.
sns.countplot(data=data, x='Geography', hue='Exited', ax=ax[0])
sns.countplot(data=data, x='Gender', hue='Exited', ax=ax[1])
plt.show()

In [None]:
# One-hot encode the categorical columns. get_dummies(columns=...) replaces each listed
# column with its '<column>_<value>' indicator columns appended after the remaining
# columns, which is what the previous concat + drop produced.
# The guard makes the cell safe to re-run: once the source columns have been encoded
# they no longer exist, and indexing them again would raise KeyError.
categorical_columns = [col for col in ('Geography', 'Gender') if col in data.columns]
if categorical_columns:
    data = pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Heatmap of the pairwise correlations between every column of `data`.
plt.figure(figsize=(8, 6))
sns.set(font_scale=1)
correlations = data.corr()
heatmap_ax = sns.heatmap(correlations, annot=True, fmt='0.1f', cmap='icefire')
heatmap_ax.set_title('Correlation Matrix - Finding any Collinearity')
plt.show()

#### There is some collinearity here; we'll make a note to drop one of the gender dummy columns, since gender is a binary feature in this dataset anyway

In [None]:
# Bar chart of how many rows fall into each value of the target column.
sns.set(font_scale=1.5)
sns.countplot(data=data, x='Exited')
plt.show()

#### The dataset is imbalanced: roughly four times as many customers stayed as exited (churned). We will need to account for this when building models, especially if we want to prioritise capturing the customers who will churn. However, doing so will also produce a higher number of false positives. Weighing that trade-off would usually be a decision for the business/user: the cost of running promos etc. to retain a false positive (i.e. a customer who wasn't going to exit/churn) vs. the cost of lost revenue if a customer does leave

In [None]:
# Set the font scale before the Axes exist: Matplotlib resolves rcParams at Axes
# creation, so the old order only looked right because an earlier cell happened to
# leave the same scale behind (hidden state).
sns.set(font_scale=1.5)
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Left: age histogram coloured by exit status. Right: age spread per exit status.
sns.histplot(data=data, x='Age', hue='Exited', ax=ax[0])
sns.boxplot(data=data, x='Exited', y='Age', ax=ax[1])
plt.show()

### Customers who end up exiting (churning) tend to be older

In [None]:
# Exit counts for each number of products held.
sns.set(font_scale=1.5)
sns.countplot(data=data, x='NumOfProducts', hue='Exited')
plt.show()

In [None]:
# Spread of account balance for each exit status.
sns.set(font_scale=1.5)
sns.boxplot(x=data['Exited'], y=data['Balance'])
plt.show()

In [None]:
sns.set(font_scale=1)
fig, ax = plt.subplots(1, 3, figsize=(20, 5))

# (column to plot, panel heading) for each of the three density panels, left to right.
kde_panels = [
    ('Tenure', 'Tenure'),
    ('EstimatedSalary', 'Estimated Salary'),
    ('CreditScore', 'Credit Score'),
]
for panel, (column, heading) in zip(ax, kde_panels):
    sns.kdeplot(data=data, x=column, hue='Exited', ax=panel)
    panel.title.set_text(heading)

### Modelling - I will compare the performance of Logistic Regression, Random Forest and XGBoost

In [None]:
from sklearn.model_selection import train_test_split

# Separate predictors from the target, then hold out 20% of the rows for testing.
# The fixed random_state makes the split repeatable.
predictors = data.drop(columns='Exited')
labels = data['Exited']
xtrain, xtest, ytrain, ytest = train_test_split(
    predictors,
    labels,
    test_size=0.2,
    random_state=1,
)

#### Because the targets are imbalanced (exited=0 occurs about 4 times as often as exited=1), I will balance the classes when instantiating the models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

features = xtrain
target = ytrain

# Logistic regression is regularised and solved iteratively, so it is sensitive to
# feature scale; the raw columns mix very different magnitudes (balances and salaries
# next to 0/1 indicators). Standardising inside a pipeline fixes that, and because
# the scaler is refitted on each training fold no information leaks from the
# validation fold. The pipeline exposes the same fit / predict_proba API as the
# bare estimator, so later cells can keep using `lr` unchanged.
lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=1, class_weight='balanced'),
)
# Tree-based models are insensitive to monotonic feature scaling, so they take the raw columns.
rfc = RandomForestClassifier(random_state=1, class_weight='balanced')
xgbc = XGBClassifier(random_state=1, use_label_encoder=False, verbosity=0, tree_method='hist')

# 5-fold cross-validated F1 on the training split for each untuned baseline.
lrscores = cross_val_score(lr, features, target, cv=5, scoring='f1')
rfcscores = cross_val_score(rfc, features, target, cv=5, scoring='f1')
xgbcscores = cross_val_score(xgbc, features, target, cv=5, scoring='f1')

print('Logistic Regression f1 Score: {}\nRandom Forest f1 Score:{}\nXGBoost Classifier f1 Score:{}'.format(round(np.mean(lrscores),2),round(np.mean(rfcscores),2),round(np.mean(xgbcscores),2)))

### It looks like Random Forest performs better than Logistic Regression on this data. Before moving on to tuning our Random Forest, let's first choose the best features for predicting whether a customer will exit (churn)

In [None]:
# Hand-picked subset of columns used for tuning and evaluation from here on.
featurecolumns = [
    'CreditScore',
    'Age',
    'Balance',
    'NumOfProducts',
    'IsActiveMember',
    'EstimatedSalary',
    'Geography_Spain',
    'Geography_Germany',
    'Gender_Female',
]
# Narrow the training features to that subset (all rows, selected columns only).
features = features.loc[:, featurecolumns]

### On to model tuning. I will use RandomizedSearchCV to search for a good combination of hyperparameters from the selection provided, using the F1 score as the evaluation metric. The RandomizedSearchCV class in scikit-learn returns the best set of hyperparameters found, the best score and the corresponding estimator (model) with these hyperparameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import timeit


def _timed_random_search(estimator, param_distributions, timing_message):
    """Fit a randomized hyperparameter search and report how long it took.

    The search uses 5-fold cross-validation scored on F1 with a fixed
    random_state, and is fitted on the notebook-level `features` / `target`.

    Parameters
    ----------
    estimator : the model whose hyperparameters are searched.
    param_distributions : dict mapping parameter names to candidate values.
    timing_message : format string with one `{}` placeholder for the elapsed seconds.

    Returns
    -------
    The fitted RandomizedSearchCV object.
    """
    started = timeit.default_timer()
    search = RandomizedSearchCV(
        estimator,
        param_distributions=param_distributions,
        cv=5,
        scoring='f1',
        random_state=1,
    )
    search.fit(features, target)
    elapsed = timeit.default_timer() - started
    print(timing_message.format(elapsed))
    return search


# Random Forest Tuning
paramsrfc = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt'],
    'max_depth': [4, 6, 8, 10],
    'criterion': ['gini'],
    'class_weight': ['balanced'],
}
rsearchcv_rfc = _timed_random_search(rfc, paramsrfc, 'Time taken to tune RFC: {}')

best_paramsrfc = rsearchcv_rfc.best_params_
best_scorerfc = rsearchcv_rfc.best_score_
modelrfc = rsearchcv_rfc.best_estimator_

# XGBoost Classifier Tuning
paramsxgbc = {
    "learning_rate": [0.001, 0.01, 0.1, 0.3],
    "max_depth": [5, 8, 10],
    "subsample": [0.5, 0.75, 1.0],
    "n_estimators": [50, 150, 500],
    'scale_pos_weight': [1, 2, 3, 4],
    'objective': ['binary:logistic', 'reg:logistic'],
}
randomsearchcv_xgbc = _timed_random_search(xgbc, paramsxgbc, 'Time taken to tune XGBoost: {}')

best_paramsxgbc = randomsearchcv_xgbc.best_params_
best_scorexgbc = randomsearchcv_xgbc.best_score_
modelxgbc = randomsearchcv_xgbc.best_estimator_

In [None]:
# (score heading, parameter heading, best CV score, best parameters) per tuned model.
tuning_summaries = [
    ('Best rfc model f1 score:', 'Best rfc parameters', best_scorerfc, best_paramsrfc),
    ('Best xgbc model f1 score:', 'Best xgbc parameters', best_scorexgbc, best_paramsxgbc),
]

for position, (score_heading, params_heading, score, params) in enumerate(tuning_summaries):
    if position > 0:
        # Blank separator between the two model summaries.
        print('\n')
    print(score_heading, round(score, 3), "\n")
    print(params_heading)
    for param_name, param_value in params.items():
        print(param_name, ":", param_value)

#### By tuning the model, the f1 score has improved from 0.55 to 0.62!

In [None]:
def _ranked_importances(model):
    """Return a frame of feature names and `model` importances, largest first."""
    table = pd.DataFrame({
        'Features': features.columns,
        'Importances': model.feature_importances_,
    })
    return table.sort_values('Importances', ascending=False)


rfcimportances = _ranked_importances(modelrfc)
xgbcimportances = _ranked_importances(modelxgbc)

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
importance_panels = [
    (rfcimportances, 'Random Forest Feature Importances'),
    (xgbcimportances, 'XGBoost Feature Importances'),
]
# One horizontal bar chart per tuned model, side by side.
for panel, (ranking, heading) in zip(ax, importance_panels):
    sns.barplot(y=ranking['Features'], x=ranking['Importances'], ax=panel)
    panel.title.set_text(heading)
sns.set(font_scale=1)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

sns.set(font_scale=1)

# Column-restricted views shared by every model below.
xtest_selected = xtest[featurecolumns]
train_selected = features[featurecolumns]

# Class probabilities from the two tuned estimators produced by the searches.
predictionsproba = modelrfc.predict_proba(xtest_selected)
predictionsproba_2 = modelxgbc.predict_proba(xtest_selected)

# Fit the untuned baselines on the same selected columns before scoring them.
for baseline in (lr, rfc, xgbc):
    baseline.fit(train_selected, target)
predictionsproba_lr = lr.predict_proba(xtest_selected)
predictionsproba_rfc = rfc.predict_proba(xtest_selected)
predictionsproba_xgbc = xgbc.predict_proba(xtest_selected)

# Precision/recall pairs over all thresholds, using the probability of class 1.
precision, recall, thresholds = precision_recall_curve(ytest, predictionsproba[:, 1])
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(ytest, predictionsproba_lr[:, 1])
precision_rfc, recall_rfc, thresholds_rfc = precision_recall_curve(ytest, predictionsproba_rfc[:, 1])
precision_xgbc, recall_xgbc, thresholds_xgbc = precision_recall_curve(ytest, predictionsproba_xgbc[:, 1])
precision_2, recall_2, thresholds_2 = precision_recall_curve(ytest, predictionsproba_2[:, 1])

plt.figure(figsize=(12, 6))
# Drawn in this order so the colour cycle stays the same: tuned models first, then untuned.
pr_curves = [
    (recall, precision, 'Random Forest - tuned'),
    (recall_2, precision_2, 'XGBoostClassifier - tuned'),
    (recall_lr, precision_lr, 'Logistic Regression - untuned'),
    (recall_rfc, precision_rfc, 'Random Forest - untuned'),
    (recall_xgbc, precision_xgbc, 'XGBoostClassifier - untuned'),
]
for curve_recall, curve_precision, curve_label in pr_curves:
    sns.lineplot(x=curve_recall, y=curve_precision, label=curve_label)

plt.xlabel('Recall - Of actual posi, how many were correctly predicted')
plt.ylabel('Precision - Of predicted posi, how many were actual?')
plt.title('Precision Recall Curve')
plt.show()

In [None]:
from sklearn.model_selection import learning_curve


def _plot_learning_curve(axis, curve, heading):
    """Draw mean +/- one std of the train and validation F1 scores on `axis`.

    `curve` is the (train_sizes, train_scores, test_scores) tuple returned by
    learning_curve; each score array has one row per training size and one
    column per CV fold, so statistics are taken across axis=1.
    """
    sizes, train_scores, valid_scores = curve
    train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
    valid_mean, valid_std = valid_scores.mean(axis=1), valid_scores.std(axis=1)

    axis.title.set_text(heading)
    sns.lineplot(x=sizes, y=train_mean, ax=axis, label='Training Score')
    axis.fill_between(sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='b')
    sns.lineplot(x=sizes, y=valid_mean, ax=axis, color='g', label='Testing Score')
    axis.fill_between(sizes, valid_mean - valid_std, valid_mean + valid_std, alpha=0.2, color='g')
    axis.set_xlabel('Training Size')
    axis.set_ylabel('F1 Score')


# Set the font scale before the Axes exist; Matplotlib reads rcParams at Axes
# creation, so setting it afterwards would not apply to titles and axis labels.
sns.set(font_scale=1.5)
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# The tuned models were selected on `featurecolumns` only. learning_curve refits the
# estimator on whatever X it is given, so it must receive the same columns;
# passing the full xtrain would chart a different model than the one evaluated elsewhere.
curve_features = xtrain[featurecolumns]
learningcurverfc = learning_curve(modelrfc, curve_features, ytrain, cv=5, scoring='f1')
learningcurvexgbc = learning_curve(modelxgbc, curve_features, ytrain, cv=5, scoring='f1')

_plot_learning_curve(ax[0], learningcurverfc, 'Learning Curve - Tuned Random Forest')
_plot_learning_curve(ax[1], learningcurvexgbc, 'Learning Curve - Tuned XGBoost')
plt.tight_layout()
plt.show()

#### Looking at the Random Forest learning curve, it would be beneficial to obtain more data to improve model performance

In [None]:
# Hard class predictions from both tuned models on the held-out test rows.
predictions = modelrfc.predict(xtest[featurecolumns])
predictions_2 = modelxgbc.predict(xtest[featurecolumns])

from sklearn.metrics import confusion_matrix
# Rows are the actual class, columns the predicted class.
confusionmatrix = pd.crosstab(ytest,predictions,rownames=['Actual'],colnames=['Predicted'])
confusionmatrix2 = pd.crosstab(ytest,predictions_2,rownames=['Actual'],colnames=['Predicted'])

# Set the font scale before the Axes exist; Matplotlib reads rcParams at Axes
# creation, so the previous order depended on the scale left behind by an earlier cell.
sns.set(font_scale=1.5)
fig,ax = plt.subplots(1,2,figsize=(16,6))
sns.heatmap(confusionmatrix,annot=True,fmt='g',cmap='magma',cbar=False,ax=ax[0])
sns.heatmap(confusionmatrix2,annot=True,fmt='g',cmap='magma',cbar=False,ax=ax[1])
ax[0].title.set_text('Confusion Matrix - Tuned RFC')
ax[1].title.set_text('Confusion Matrix - Tuned XGBoost')
plt.show()

### Interpreting the confusion matrix
#### **The model predicted 430 customers churning; of those, 265 did and 165 stayed.** Thought experiment: if we used this model to predict churn and entice customers before they leave, what would be the cost of applying product discounts, offers etc. to those 165 customers who didn't churn?
#### **Conversely, of the 415 customers that actually churned, 150 were predicted to stay.** What would be the cost of losing these customers?

In [None]:
# Read the four cells explicitly as .loc[actual, predicted] from the crosstab
# (rows = actual class, columns = predicted class).
truepositive = confusionmatrix.loc[1, 1]
truenegative = confusionmatrix.loc[0, 0]
falsepositive = confusionmatrix.loc[0, 1]   # actually stayed, predicted to exit
falsenegative = confusionmatrix.loc[1, 0]   # actually exited, predicted to stay

# precision = tp/(tp+fp)   <-- share of predicted positives that were truly positive
# recall    = tp/(tp+fn)   <-- share of actual positives that were predicted positive
predicted_positive = truepositive + falsepositive
actual_positive = truepositive + falsenegative
precision = truepositive / predicted_positive
recall = truepositive / actual_positive

print(f'Precision:{round(precision,3)} Recall:{round(recall,3)}')

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the tuned Random Forest predictions, to 3 decimals.
report_text = classification_report(ytest, predictions, digits=3)
print(report_text)