In [2]:
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE

In [10]:
df=pd.read_csv('../data/processed_data.csv',usecols=lambda column:not column.startswith('Unnamed'))
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,614,Spain,Female,40,3,113348.5,1,1,1,77789.01,0
1,758,France,Female,34,1,154139.45,1,1,1,60728.89,0
2,541,Germany,Female,39,9,100116.67,1,1,1,199808.1,1
3,481,France,NotSpecified,37,8,152303.66,2,1,1,175082.2,0
4,711,France,Female,37,8,113899.92,1,0,0,80215.2,0


## Models Training
### Let's start by splitting our dataframe into separate objects:
#### y for the target varibale
#### X for the input features

In [13]:
x=df.drop('Exited',axis=1)
y=df['Exited']

In [15]:
print(x.shape,y.shape)

(10000, 10) (10000,)


In [19]:
num_col=x.select_dtypes(include='number').columns.to_list()
num_col

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [21]:
cal_col=x.select_dtypes(include='object').columns.to_list()
cal_col

['Geography', 'Gender']

In [25]:
for column in num_col:
    x[column]=MinMaxScaler().fit_transform(x[[column]])

In [31]:
for column in cal_col:
    x[column]=x[[column]].apply(LabelEncoder().fit_transform)

In [64]:
def class_count(a):
    counter=Counter(a)
    kv=[list(counter.keys()),list(counter.values())]
    abt2 = pd.DataFrame(np.array(kv).T, columns=['Exited','Count'])
    abt2['Count'] = abt2['Count'].astype('int64')
    abt2['%'] = round(abt2['Count'] / a.shape[0] * 100, 2)
    return abt2.sort_values('Count',ascending=False)
    

In [66]:
class_count(y)

Unnamed: 0,Exited,Count,%
0,0,7963,79.63
1,1,2037,20.37


## Create a Train Test Split
#### We will continue with splitting our data into separate training and test sets.
#### 30% of observations will be set aside for the test set
#### the rest, 70%, will be used as the training set

In [79]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=10)

In [81]:
print(len(x_train),len(x_test),len(y_train),len(y_test))

7000 3000 7000 3000


In [83]:
x_smote,y_smote=SMOTE().fit_resample(x_train,y_train)

In [85]:
print(len(x_smote),len(y_smote))

11218 11218


### Fit Model

In [88]:
from sklearn.tree import DecisionTreeClassifier
DT=DecisionTreeClassifier()

In [94]:
from sklearn.model_selection import GridSearchCV
dt_param_grid = {
            'max_leaf_nodes': [2, 10, 20, 30], 
            'min_samples_split': [2, 3, 4],
            'criterion': ['gini', 'entropy']
            }

dt_grid = GridSearchCV(DT, dt_param_grid, verbose=3, cv= 5, scoring='accuracy')

In [96]:
dt_grid.fit(x_smote,y_smote)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END criterion=gini, max_leaf_nodes=2, min_samples_split=2;, score=0.686 total time=   0.0s
[CV 2/5] END criterion=gini, max_leaf_nodes=2, min_samples_split=2;, score=0.696 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=2, min_samples_split=2;, score=0.713 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=2, min_samples_split=2;, score=0.699 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=2, min_samples_split=2;, score=0.696 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=2, min_samples_split=3;, score=0.686 total time=   0.0s
[CV 2/5] END criterion=gini, max_leaf_nodes=2, min_samples_split=3;, score=0.696 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=2, min_samples_split=3;, score=0.713 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=2, min_samples_split=3;, score=0.699 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_

In [98]:
print(dt_grid.best_params_)

{'criterion': 'gini', 'max_leaf_nodes': 30, 'min_samples_split': 2}


In [100]:
print(dt_grid.best_score_)

0.8308120411055031


In [104]:
print(f'Training Data Score : {dt_grid.score(x_train,y_train)}')
print(f'Training Data Score : {dt_grid.score(x_test,y_test)}')

Training Data Score : 0.8314285714285714
Training Data Score : 0.829


In [106]:
prediction=dt_grid.predict(x_test)

In [108]:
pd.DataFrame({'Prediction':prediction,'Actual':y_test})

Unnamed: 0,Prediction,Actual
937,1,1
9355,0,0
2293,0,0
192,0,0
8675,0,0
...,...,...
4964,0,0
8978,0,0
7540,0,0
5275,0,1


In [110]:
print(classification_report(prediction,y_test))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      2349
           1       0.61      0.60      0.60       651

    accuracy                           0.83      3000
   macro avg       0.75      0.75      0.75      3000
weighted avg       0.83      0.83      0.83      3000



In [112]:
import joblib
filename = '../Models/decision_tree.sav'
joblib.dump(dt_grid, filename)

['../Models/decision_tree.sav']

In [114]:
dt_model=joblib.load(filename)
print(dt_model.score(x_test,y_test))

0.829
