In [None]:
import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the churn dataset from the Kaggle input folder and preview the first rows.
csv_path = '../input/predicting-churn-for-bank-customers/Churn_Modelling.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# List the distinct labels of each categorical column so they can be encoded.
for column in ('Geography', 'Gender'):
    print(data[column].unique())

In [None]:
# Find blanks in data: info() prints each column's non-null count and dtype,
# so a column with missing values shows a count below the total number of rows.
data.info()

In [None]:
# Convert the string Gender column into a numerical one via a lookup table.
gender = {'Male': 0, 'Female': 1}

# Only encode while the column still holds labels. Once it is numeric the
# mapping has already been applied, and looking the integers up again would
# raise KeyError -- the guard makes this cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['Gender']):
    unmapped = set(data['Gender']) - set(gender)
    if unmapped:
        # Fail loudly on unexpected labels, as a plain dict lookup would.
        raise KeyError('Unmapped Gender values: {}'.format(sorted(unmapped, key=str)))
    data['Gender'] = data['Gender'].map(gender)
data.head()

In [None]:
# Convert the string Geography (country) column into a numerical one via a
# lookup table.
# NOTE(review): integer codes impose an ordering (France < Spain < Germany)
# that linear models will treat as meaningful -- consider one-hot encoding.
geo = {'France': 1, 'Spain': 2, 'Germany': 3}

# Only encode while the column still holds labels. Once it is numeric the
# mapping has already been applied, and looking the integers up again would
# raise KeyError -- the guard makes this cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['Geography']):
    unmapped = set(data['Geography']) - set(geo)
    if unmapped:
        # Fail loudly on unexpected labels, as a plain dict lookup would.
        raise KeyError('Unmapped Geography values: {}'.format(sorted(unmapped, key=str)))
    data['Geography'] = data['Geography'].map(geo)
data.head()

In [None]:
# Delete the unnecessary identifier features from the dataset.
# errors='ignore' keeps the cell re-runnable: columns that were already
# removed by a previous run are skipped instead of raising KeyError.
data = data.drop(columns=['CustomerId', 'Surname', 'RowNumber'], errors='ignore')
data.head()

### Correlations between customer data features and customer churn ###

To decide which features of the data to include in our predictive churn model, we’ll examine the correlation between churn and each customer feature

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heat map.
corr = data.corr()
axis_labels = corr.columns.values

# Create the 20x15 inch figure up front and draw onto its axes explicitly.
heat_map, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(
    corr,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    annot=True,
    annot_kws={'size': 12},
    ax=ax,
)
plt.setp(ax.get_xticklabels(), fontsize=10)
plt.setp(ax.get_yticklabels(), fontsize=10)
plt.show()

We can see that there is no high correlation between features. Therefore, there is no multicollinearity problem.

### Predictive modelling ###


We will consider several different models to predict customer churn. To ensure we are not over-fitting to our data, we will split the 10,000 customer records into a training and test set, with the test set being 25% of the total records.

In [None]:
from sklearn.model_selection import train_test_split 
# Hold out 25% of the records as a test set.
# random_state pins the shuffle so the scores reported below are reproducible;
# stratify keeps the churn / no-churn proportion the same in both splits.
train, test = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
    stratify=data['Exited'],
)

# Target vectors.
train_y = train['Exited']
test_y = test['Exited']

# Feature matrices: drop() returns new frames, so `train` / `test` are not
# mutated through an alias and this cell can be re-run safely.
train_x = train.drop(columns='Exited')
test_x = test.drop(columns='Exited')

### Logistic regression ####

Logistic regression is one of the more basic classification algorithms in a data scientist’s toolkit. It is used to predict a category or group based on an observation. Logistic regression is usually used for binary classification (1 or 0, win or lose, true or false). The output of logistic regression is a probability, which will always be a value between 0 and 1. While the output value does not give a classification directly, we can choose a cutoff value so that inputs with a probability greater than the cutoff belong to one class, and those with a probability less than the cutoff belong to the other.

For example, if the classifier predicts a probability of customer attrition being 70%, and our cutoff value is 50%, then we predict that the customer will churn. Similarly, if the model outputs a 30% chance of attrition for a customer, then we predict that the customer won’t churn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
 
# Fit a baseline logistic regression on the training split and evaluate it on
# the held-out test split.
# NOTE(review): the features are not scaled, so the default solver may emit a
# ConvergenceWarning -- consider standardising the inputs.
logisticRegr = LogisticRegression()
logisticRegr.fit(X=train_x, y=train_y)

test_y_pred = logisticRegr.predict(test_x)
# Keep the result under its own name: binding it to `confusion_matrix` would
# shadow the imported function and make this cell fail when re-run.
conf_matrix = confusion_matrix(test_y, test_y_pred)
print('Intercept: ' + str(logisticRegr.intercept_))
print('Regression: ' + str(logisticRegr.coef_))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logisticRegr.score(test_x, test_y)))
print(classification_report(test_y, test_y_pred))

# Rows are the true labels and columns the predicted labels.
class_labels = ['No churn', 'Churn']
confusion_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)
heatmap = sns.heatmap(confusion_matrix_df, annot=True, annot_kws={"size": 20}, fmt="d")
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=14)
plt.ylabel('True label', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)

We got 78% classification accuracy from our logistic regression classifier. But the precision and recall for predictions in the positive class (churn) are relatively low, which suggests our data set may be imbalanced.

### handling imbalanced classes

It is also important to look at the distribution of how many customers churn. If 95% of customers don’t churn, we can achieve 95% accuracy by building a model that simply predicts that all customers won’t churn. But this isn’t a very useful model, because it will never tell us when a customer will churn, which is what we are really interested in.

In [None]:
# Class balance: number of customers for each value of Exited.
churn_counts = data['Exited'].value_counts()
churn_counts

### Up-sampling the minority class

In [None]:
from sklearn.utils import resample
 
# Split the records by class.
data_majority = data[data['Exited'] == 0]
data_minority = data[data['Exited'] == 1]

# Draw minority rows with replacement until both classes have the same size.
# The target size is taken from the majority frame rather than hard-coded, so
# the classes stay balanced if the dataset changes.
data_minority_upsampled = resample(
    data_minority,
    replace=True,
    n_samples=len(data_majority),  # same number of samples as the majority class
    random_state=1,                # set the seed for random resampling
)

# Combine resampled results
data_upsampled = pd.concat([data_majority, data_minority_upsampled])

data_upsampled['Exited'].value_counts()

Now that we have a 1:1 ratio for our classes, let’s train another logistic regression model:

In [None]:
# Train and evaluate a second logistic regression on the up-sampled data.
# NOTE(review): data_upsampled was over-sampled with replacement *before* this
# split, so copies of the same minority-class row can land in both train and
# test; the test metrics below are likely optimistic. Consider splitting
# first and up-sampling only the training portion.
# random_state pins the shuffle so the reported scores are reproducible.
train, test = train_test_split(data_upsampled, test_size=0.25, random_state=42)

# Target vectors.
train_y_upsampled = train['Exited']
test_y_upsampled = test['Exited']

# Feature matrices: drop() returns new frames, so `train` / `test` are not
# mutated through an alias and this cell can be re-run safely.
train_x_upsampled = train.drop(columns='Exited')
test_x_upsampled = test.drop(columns='Exited')

logisticRegr_balanced = LogisticRegression()
logisticRegr_balanced.fit(X=train_x_upsampled, y=train_y_upsampled)

test_y_pred_balanced = logisticRegr_balanced.predict(test_x_upsampled)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logisticRegr_balanced.score(test_x_upsampled, test_y_upsampled)))
print(classification_report(test_y_upsampled, test_y_pred_balanced))


The overall accuracy of the model has decreased, but the precision and recall scores for predicting a churn have improved.

### Using a different performance metric

In [None]:
from sklearn.metrics import roc_auc_score
 
# Score both models by AUROC on their respective test sets.
# predict_proba returns one column per class; only the second column (the
# positive class) is needed, so slice it out directly.
test_y_prob = list(logisticRegr.predict_proba(test_x)[:, 1])
test_y_prob_balanced = list(logisticRegr_balanced.predict_proba(test_x_upsampled)[:, 1])

auroc_unbalanced = roc_auc_score(test_y, test_y_prob)
auroc_balanced = roc_auc_score(test_y_upsampled, test_y_prob_balanced)

print('Unbalanced model AUROC: ', auroc_unbalanced, sep='')
print('Balanced model AUROC: ', auroc_balanced, sep='')

Interestingly, the AUROC scores are very similar between the two models. Both are above 0.5, however, suggesting that both models have the ability to distinguish between observations from each class.

# Decision Trees

In [None]:
from sklearn import tree

# Create each decision tree (pruned and unpruned). The "pruned" tree is
# limited to depth 4. random_state makes the fitted trees, and therefore the
# scores printed below, reproducible between runs.
decisionTree_unpruned = tree.DecisionTreeClassifier(random_state=42)
decisionTree = tree.DecisionTreeClassifier(max_depth=4, random_state=42)

# Fit each tree to our training data
decisionTree_unpruned = decisionTree_unpruned.fit(X=train_x, y=train_y)
decisionTree = decisionTree.fit(X=train_x, y=train_y)

# Test-set predictions, one variable per tree, so neither result is
# overwritten and the names match what they hold.
test_y_pred_dt = decisionTree.predict(test_x)
test_y_pred_dt_unpruned = decisionTree_unpruned.predict(test_x)

print('Accuracy of unpruned decision tree classifier on train set: {:.2f}'.format(decisionTree_unpruned.score(train_x, train_y)))
print('Accuracy of unpruned decision tree classifier on test set: {:.2f}'.format(decisionTree_unpruned.score(test_x, test_y)))
print('Accuracy of decision tree classifier on train set: {:.2f}'.format(decisionTree.score(train_x, train_y)))
print('Accuracy of decision tree classifier on test set: {:.2f}'.format(decisionTree.score(test_x, test_y)))

Exactly as we suspected! The unpruned tree gets a perfect score on the training set, but a relatively lower score (81%) on the test set. Our pruned tree is less accurate on the training set, but performs better when presented with the out-of-sample test data.

# Cross validation (k_Fold)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 10-fold splitter; rows are shuffled with a fixed seed before being split.
k_fold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# KNN

In [None]:
# Estimate KNN accuracy with 10-fold cross-validation.
# The folds are drawn from the training split so the held-out test set is not
# used for model assessment here, and the estimate uses 75% of the data
# instead of the 25% test portion.
# NOTE(review): KNN is distance-based and the features are not scaled --
# consider standardising the inputs.
clf = KNeighborsClassifier(n_neighbors=13)
scoring = 'accuracy'
score = cross_val_score(clf, train_x, train_y, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
# A single summary number is easier to compare with the other models.
print('Mean cross-validated accuracy: {:.2f}'.format(score.mean()))

We can notice that even KNN is doing better for this classification problem

# Conclusion

The decision tree outperforms the other models considered in this exercise. Therefore, the decision tree model could be a better choice.