## Load the processed data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the already-processed churn dataset from a CSV located next to the notebook
# (relative path, so the notebook must be run from its own directory).
data = pd.read_csv("./Churn_modelling_processed_data.csv")

In [3]:
data.head()

Unnamed: 0,CreditScoreScaled,Germany,Spain,GenderCoded,AgeScaled,Tenure,BalanceScaled,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalaryScaled,Exited
0,0.538,0,0,0,0.324324,2,0.0,1,1,1,0.506735,1
1,0.516,0,1,0,0.310811,1,0.334031,1,0,1,0.562709,0
2,0.304,0,0,0,0.324324,8,0.636357,3,1,0,0.569654,1
3,0.698,0,0,0,0.283784,1,0.0,2,0,0,0.46912,0
4,1.0,0,1,0,0.337838,2,0.500246,1,1,1,0.3954,0


## Predictors and Target

In [4]:
columns = data.columns.to_list()

In [5]:
target = 'Exited'

In [6]:
# Keep every column except the target as a predictor.
# A new list is built instead of calling columns.remove(target): the in-place
# remove raises ValueError when this cell is executed a second time.
columns = [col for col in columns if col != target]

In [7]:
X = data[columns].values

In [8]:
y = data[target].values

In [9]:
X.shape

(10000, 11)

In [10]:
y.shape

(10000,)

## Split the data into train and test

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 25% of the rows as a test set; random_state is fixed so the same split is reused on every run.
# NOTE(review): the split is not stratified although the target is imbalanced
# (7963 zeros vs 2037 ones per data['Exited'].value_counts()) — consider stratify=y; confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2022)

In [13]:
X_train.shape

(7500, 11)

In [14]:
X_test.shape

(2500, 11)

In [15]:
y_train.shape

(7500,)

In [16]:
y_test.shape

(2500,)

## Building ML model

### Logistic Regression Model

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
logistic_regression_clf = LogisticRegression()  # defining the LR classifier

In [19]:
logistic_regression_clf.fit(X_train, y_train)   # training the LR classifier

LogisticRegression()

**Check performance on test dataset**

In [21]:
y_pred = logistic_regression_clf.predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

**Accuracy Score**

In [23]:
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8164

**f1-Score**

In [24]:
f1_score(y_test, y_pred)

0.31594634873323396

**Classification report on the test dataset**

In [25]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1988
           1       0.67      0.21      0.32       512

    accuracy                           0.82      2500
   macro avg       0.75      0.59      0.60      2500
weighted avg       0.79      0.82      0.78      2500



**Confusion Matrix**

In [43]:
def create_confusion_matrix(y_true, y_pred):
    """Build a 2x2 confusion matrix for a binary classifier as a labelled DataFrame.

    Rows are the predicted class and columns are the actual class. A label
    equal to 0 is counted as class 0; every other label is counted as class 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, paired with ``y_true`` by position.

    Returns
    -------
    pandas.DataFrame
        Integer counts with index ``Predicted-0``/``Predicted-1`` and columns
        ``Actual-0``/``Actual-1``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Convert to plain arrays so samples are paired by position: indexing a
    # pandas Series with y_true[i] is label-based and fails (or mispairs the
    # samples) when the Series does not carry a default 0..n-1 index.
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{actual.shape} and {predicted.shape}"
        )

    # Boolean masks: True where the label is treated as class 1 (anything != 0).
    actual_is_one = np.asarray(actual != 0, dtype=bool)
    predicted_is_one = np.asarray(predicted != 0, dtype=bool)

    # cm[row, col] -> row = predicted class, col = actual class.
    cm = np.array(
        [
            [
                np.count_nonzero(~predicted_is_one & ~actual_is_one),
                np.count_nonzero(~predicted_is_one & actual_is_one),
            ],
            [
                np.count_nonzero(predicted_is_one & ~actual_is_one),
                np.count_nonzero(predicted_is_one & actual_is_one),
            ],
        ],
        dtype='int',
    )
    cm_df = pd.DataFrame(cm, index=['Predicted-0','Predicted-1'], columns=['Actual-0', 'Actual-1'])
    return cm_df

In [44]:
cf_mat = create_confusion_matrix(y_true=y_test, y_pred=y_pred)

In [45]:
cf_mat

Unnamed: 0,Actual-0,Actual-1
Predicted-0,1935,406
Predicted-1,53,106


In [46]:
total = cf_mat.values.sum()

In [47]:
# Correct predictions are the diagonal entries of the confusion matrix
# (predicted class == actual class); add them up.
correct = sum(cf_mat.values[k, k] for k in range(len(cf_mat)))

In [48]:
correct

2041

In [49]:
accuracy = correct / total

In [50]:
accuracy

0.8164

In [51]:
from sklearn.metrics import confusion_matrix

In [52]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1935,   53],
       [ 406,  106]], dtype=int64)

In [54]:
# Label scikit-learn's matrix for display. Its rows are the actual classes and its
# columns the predicted classes, i.e. the transpose of the hand-built cf_mat shown earlier.
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_pred), index=['Actual-0', 'Actual-1'], columns=['Predicted-0','Predicted-1'])

Unnamed: 0,Predicted-0,Predicted-1
Actual-0,1935,53
Actual-1,406,106


### k-NN Classifier

In [55]:
from sklearn.neighbors import KNeighborsClassifier

In [56]:
# NOTE(review): k-NN is distance-based. In data.head() the Tenure and NumOfProducts
# columns appear unscaled (values such as 8 and 3) while the *Scaled columns lie in [0, 1],
# so they may dominate the distance — confirm whether they should be scaled as well.
kNNclf = KNeighborsClassifier(n_neighbors=5)  # specifying kNN classifier with 5 neighbors

In [57]:
kNNclf.fit(X_train, y_train)  # training the kNN classifier

KNeighborsClassifier()

**Check Performance**

In [58]:
y_pred = kNNclf.predict(X_test)

In [59]:
accuracy_score(y_test, y_pred)

0.8036

In [60]:
f1_score(y_test, y_pred)

0.347941567065073

In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      1988
           1       0.54      0.26      0.35       512

    accuracy                           0.80      2500
   macro avg       0.69      0.60      0.62      2500
weighted avg       0.77      0.80      0.77      2500



In [62]:
data['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [63]:
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_pred), index=['Actual-0', 'Actual-1'], columns=['Predicted-0','Predicted-1'])

Unnamed: 0,Predicted-0,Predicted-1
Actual-0,1878,110
Actual-1,381,131


### Decision Tree Classifier

In [64]:
from sklearn.tree import DecisionTreeClassifier

In [65]:
# Decision tree in which every leaf must contain at least 20 training samples
# (presumably to limit over-fitting — confirm the choice of 20).
# NOTE(review): no random_state is passed here, unlike train_test_split; scikit-learn
# documents that ties between equally good splits are broken randomly, so the metrics
# may shift slightly between runs — consider fixing a seed.
dtclf = DecisionTreeClassifier(min_samples_leaf=20)   # defining the Decision Tree classifier

In [66]:
dtclf.fit(X_train, y_train)  # training decision tree model

DecisionTreeClassifier(min_samples_leaf=20)

**Check the performance**

In [67]:
y_pred = dtclf.predict(X_test)

In [68]:
accuracy_score(y_test, y_pred)

0.8556

In [69]:
f1_score(y_test, y_pred)

0.5930101465614431

In [70]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1988
           1       0.70      0.51      0.59       512

    accuracy                           0.86      2500
   macro avg       0.79      0.73      0.75      2500
weighted avg       0.85      0.86      0.85      2500



In [71]:
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_pred), index=['Actual-0', 'Actual-1'], columns=['Predicted-0','Predicted-1'])

Unnamed: 0,Predicted-0,Predicted-1
Actual-0,1876,112
Actual-1,249,263


### SVM classifier

In [72]:
from sklearn.svm import SVC

In [73]:
# Support-vector classifier with all hyperparameters left at their defaults.
# NOTE(review): the report below shows recall of only 0.03 for class 1, i.e. the model
# predicts almost everything as class 0. Possible causes to check: the class imbalance
# (no class_weight is set) and the apparently unscaled Tenure / NumOfProducts features.
SVCclf = SVC()

In [74]:
SVCclf.fit(X_train, y_train)   # training SVC model

SVC()

**Check the performance**

In [75]:
y_pred = SVCclf.predict(X_test)

In [76]:
accuracy_score(y_test, y_pred)

0.8004

In [77]:
f1_score(y_test, y_pred)

0.049523809523809526

In [78]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89      1988
           1       1.00      0.03      0.05       512

    accuracy                           0.80      2500
   macro avg       0.90      0.51      0.47      2500
weighted avg       0.84      0.80      0.72      2500



In [79]:
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_pred), index=['Actual-0', 'Actual-1'], columns=['Predicted-0','Predicted-1'])

Unnamed: 0,Predicted-0,Predicted-1
Actual-0,1988,0
Actual-1,499,13


## Conclusion

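Of the four classifiers trained on this dataset, the Decision Tree classifier performs best on the held-out test set: it has the highest accuracy (0.86) and by far the highest F1-score for the churn class (0.59, versus 0.35 for k-NN, 0.32 for Logistic Regression and 0.05 for SVM).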