In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the customer churn records from the comma-separated file that sits
# next to the notebook.
df = pd.read_csv(
    'Customer-Churn-Records.csv',
    sep=',',
)

In [3]:
# Per-column flag: True if the column holds at least one missing value.
df.isna().any()

RowNumber             False
CustomerId            False
Surname               False
CreditScore           False
Geography             False
Gender                False
Age                   False
Tenure                False
Balance               False
NumOfProducts         False
HasCrCard             False
IsActiveMember        False
EstimatedSalary       False
Exited                False
Complain              False
Satisfaction Score    False
Card Type             False
Point Earned          False
dtype: bool

In [4]:
# Remove the 'Complain' column from the feature table.
# NOTE(review): the reason for excluding it is not stated in the notebook —
# presumably it is too closely tied to the target; confirm.
df = df.drop(columns=['Complain'])

In [5]:
# Remove the row / customer identifier columns and the surname.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [6]:
# One-hot encode the categorical columns and swap them for their indicator
# columns (appended at the end of the frame, in the order listed here).
categorical_cols = ['Geography', 'Gender', 'Card Type']
hot = pd.get_dummies(df[categorical_cols])
df = pd.concat([df.drop(columns=categorical_cols), hot], axis=1)

In [7]:
# Display the column names of the frame after the encoding step.
df.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Satisfaction Score',
       'Point Earned', 'Geography_France', 'Geography_Germany',
       'Geography_Spain', 'Gender_Female', 'Gender_Male', 'Card Type_DIAMOND',
       'Card Type_GOLD', 'Card Type_PLATINUM', 'Card Type_SILVER'],
      dtype='object')

In [8]:
# Target: the 'Exited' column (kept as a pandas Series).
y = df['Exited']
# Features: every remaining column, as a plain NumPy array.
X = df.drop(columns=['Exited']).values

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column of X.
# NOTE(review): the scaler is fit on every row of X, including rows that are
# later held out for testing, so anything trained on X_standard has already
# seen test-set statistics.
scaler = StandardScaler()
scaler.fit(X)
X_standard = scaler.transform(X)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw feature matrix X (not the pre-scaled X_standard): X_standard
# was standardised with statistics computed over *all* rows, which leaks
# information about the held-out rows into preprocessing.
# The seed and the number of rows are unchanged, so the same rows end up in
# the train and test partitions as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Estimate mean / variance on the training rows only, then apply that same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:


from imblearn.over_sampling import RandomOverSampler
# Resample the training split only (seeded for repeatability); X_test and
# y_test are not passed to the sampler and stay as they were.
ros = RandomOverSampler(random_state=0)
# Rebinds X_train / y_train to the resampled data.
X_train, y_train = ros.fit_resample(X_train, y_train)



In [12]:
from sklearn.metrics import classification_report
from termcolor import colored

def report(model, x, y, text = "training"):
    """Print a classification report and a confusion matrix for ``model``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    x : array-like
        Features handed to ``model.predict``.
    y : array-like
        True labels for ``x``.
    text : str, default "training"
        Name of the data split; only used in the printed headings. Pass
        e.g. ``"test"`` when evaluating held-out data.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    y_pred = model.predict(x)
    model_name = type(model).__name__
    rule = "---------------------------------------------------------------------------------"

    print(colored("Classification report for model {} on {} data".format(model_name, text), "green"))
    print(rule)
    print(classification_report(y, y_pred))
    print(rule)

    # Label the matrix with the classes actually present instead of a fixed
    # pair of names, so the table matches the target and also works when
    # there are more than two classes.
    labels = np.unique(np.concatenate([np.asarray(y).ravel(), np.asarray(y_pred).ravel()]))
    matrix = pd.DataFrame(
        confusion_matrix(y, y_pred, labels=labels),
        index=pd.Index(labels, name="actual"),        # rows: true class
        columns=pd.Index(labels, name="predicted"),   # columns: predicted class
    )

    print(colored("Confusion matrix for model {} on {} data ".format(model_name, text), "green"))
    print(rule)
    print(matrix)
    print(rule)
    

In [13]:

# Entropy-based decision tree, capped at depth 11 and seeded so the fit is
# repeatable; trained on the (resampled) training split.
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=11,
    min_samples_split=2,
    random_state=0,
)
decision_tree.fit(X_train, y_train)

In [14]:
# Evaluate on the held-out split; pass the split name explicitly so the
# printed headings do not fall back to the default "training" label.
report(decision_tree, X_test, y_test, text = "test")

[32mClassification report for model DecisionTreeClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.79      0.84      2378
           1       0.46      0.69      0.55       622

    accuracy                           0.77      3000
   macro avg       0.68      0.74      0.70      3000
weighted avg       0.81      0.77      0.78      3000

---------------------------------------------------------------------------------
[32mConfusion matrix for model DecisionTreeClassifier on training data [0m
---------------------------------------------------------------------------------
      B    M
B  1869  509
M   193  429
---------------------------------------------------------------------------------


In [15]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter space sampled by the randomized search.
parameters = {'max_depth': [3, 4, 5, 6, 7, 9, 11],
              'min_samples_split': [2, 3, 4, 5, 6, 7],
              'criterion': ['entropy', 'gini']
             }

# Seed both the forest and the search: without a random_state the sampled
# parameter combinations and the fitted trees change on every run, so the
# reported scores cannot be reproduced.
model = RandomForestClassifier(random_state = 0)
gridRandomForest = RandomizedSearchCV(model, parameters, cv = 5, n_jobs = -1, random_state = 0)
# NOTE(review): X_train was over-sampled before this cross-validation, so
# repeated rows may land in both the fitting and validation folds and make
# the CV scores optimistic — confirm whether resampling should happen per fold.
gridRandomForest.fit(X_train, y_train)
# Held-out evaluation: name the split so the headings are not labelled "training".
report(gridRandomForest, X_test, y_test, text = "test")

[32mClassification report for model RandomizedSearchCV on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      2378
           1       0.59      0.68      0.63       622

    accuracy                           0.84      3000
   macro avg       0.75      0.78      0.76      3000
weighted avg       0.85      0.84      0.84      3000

---------------------------------------------------------------------------------
[32mConfusion matrix for model RandomizedSearchCV on training data [0m
---------------------------------------------------------------------------------
      B    M
B  2089  289
M   201  421
---------------------------------------------------------------------------------
