In [54]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix,roc_curve,roc_auc_score
from sklearn.tree import DecisionTreeClassifier,plot_tree

from imblearn.over_sampling import SMOTE
import pickle

In [28]:
df = pd.read_csv('ccp.csv')

In [29]:
x = df.drop('Exited', axis=1)
y = df['Exited']

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=20, stratify=y)

In [31]:
y_train.value_counts()

0    5972
1    1528
Name: Exited, dtype: int64

## Oversampling

In [32]:
sm = SMOTE()
x_train_new, y_train_new = sm.fit_resample(x_train, y_train)
y_train_new.value_counts()

0    5972
1    5972
Name: Exited, dtype: int64

## Model Training

In [33]:
dt_cl =DecisionTreeClassifier()
dt_cl.fit(x_train_new,y_train_new)

## Evaluation

In [34]:
y_pred=dt_cl.predict(x_test)
cmat =confusion_matrix(y_test,y_pred)
print(cmat)
clf = classification_report(y_test,y_pred)
print(clf)

[[1687  304]
 [ 144  365]]
              precision    recall  f1-score   support

           0       0.92      0.85      0.88      1991
           1       0.55      0.72      0.62       509

    accuracy                           0.82      2500
   macro avg       0.73      0.78      0.75      2500
weighted avg       0.84      0.82      0.83      2500



In [47]:
y_pred_train=dt_cl.predict(x_train_new)

cmat =confusion_matrix(y_train_new,y_pred_train)
print(cmat)
clf = classification_report(y_train_new,y_pred_train)
print(clf)

[[5715  257]
 [2367 3605]]
              precision    recall  f1-score   support

           0       0.71      0.96      0.81      5972
           1       0.93      0.60      0.73      5972

    accuracy                           0.78     11944
   macro avg       0.82      0.78      0.77     11944
weighted avg       0.82      0.78      0.77     11944



## Hyperparameter Tunning

In [57]:
hyper={'criterion':['gini','entropy'],
    'splitter':['best','random'],
    'max_depth':np.arange(4,8),
    'min_samples_split':np.arange(5,12),
    'min_samples_leaf':np.arange(2,15)}
gscv_dt=GridSearchCV(dt_cl,hyper,cv=5)
gscv_dt.fit(x_train_new,y_train_new)

In [58]:
gscv_dt.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'splitter': 'best'}

In [59]:
dt_cl =DecisionTreeClassifier(criterion='gini',
 max_depth=7,
 min_samples_leaf=4,
 min_samples_split=5,
 splitter= 'best')
dt_cl.fit(x_train_new,y_train_new)

In [60]:
y_pred=dt_cl.predict(x_test)
cmat =confusion_matrix(y_test,y_pred)
print(cmat)
clf = classification_report(y_test,y_pred)
print(clf)


[[1681  310]
 [ 120  389]]
              precision    recall  f1-score   support

           0       0.93      0.84      0.89      1991
           1       0.56      0.76      0.64       509

    accuracy                           0.83      2500
   macro avg       0.74      0.80      0.77      2500
weighted avg       0.86      0.83      0.84      2500



In [61]:
y_pred_train=dt_cl.predict(x_train_new)

cmat =confusion_matrix(y_train_new,y_pred_train)
print(cmat)
clf = classification_report(y_train_new,y_pred_train)
print(clf)

[[5114  858]
 [ 700 5272]]
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      5972
           1       0.86      0.88      0.87      5972

    accuracy                           0.87     11944
   macro avg       0.87      0.87      0.87     11944
weighted avg       0.87      0.87      0.87     11944



In [62]:
file = open("ccp.pkl",'wb')
pickle.dump(dt_cl, file)
file.close()

In [63]:
file = open("columns_list.obj",'wb')
pickle.dump(x.columns, file)
file.close()