In [18]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN
import pickle

# Loading the Cleaned Dataset

In [2]:
df = pd.read_csv('cleaned_data.csv')
df = df.drop('Unnamed: 0', axis = 1)
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_groups_1 - 12,tenure_groups_13 - 24,tenure_groups_25 - 36,tenure_groups_37 - 48,tenure_groups_49 - 60,tenure_groups_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0


### Splitting the Target and Features

In [3]:
x = df.drop('Churn', axis = 1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_groups_1 - 12,tenure_groups_13 - 24,tenure_groups_25 - 36,tenure_groups_37 - 48,tenure_groups_49 - 60,tenure_groups_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [4]:
y = df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

### Splitting 80-20 train test splits

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Decision Tree Classifier

In [6]:
dt_model = DecisionTreeClassifier(criterion = 'gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)
dt_model.fit(x_train, y_train)
dt_y_pred = dt_model.predict(x_test)

In [7]:
print(classification_report(y_test, dt_y_pred, labels = [0, 1]))
print()
print('Confusion Matrix: \n', confusion_matrix(y_test, dt_y_pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.62      0.52      0.57       372

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407


Confusion Matrix: 
 [[919 116]
 [179 193]]


### Resampling using SMOTEENN

In [8]:
sm = SMOTEENN()
x_resampled, y_resampled = sm.fit_resample(x, y)

In [9]:
xr_train, xr_test, yr_train, yr_test = train_test_split(x_resampled, y_resampled, test_size = 0.2)

In [10]:
smote_dt_model = DecisionTreeClassifier(criterion = 'gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)
smote_dt_model.fit(xr_train, yr_train)
smote_y_pred_dt = smote_dt_model.predict(xr_test)

In [11]:
print(classification_report(yr_test, smote_y_pred_dt, labels = [0, 1]))
print()
print('Confusion Matrix: \n', confusion_matrix(yr_test, smote_y_pred_dt))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       506
           1       0.95      0.97      0.96       663

    accuracy                           0.95      1169
   macro avg       0.96      0.95      0.95      1169
weighted avg       0.95      0.95      0.95      1169


Confusion Matrix: 
 [[474  32]
 [ 21 642]]


# Random Forest

In [12]:
rf_model = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)
rf_model.fit(x_train, y_train)
rf_y_pred = rf_model.predict(x_test)

In [13]:
print(classification_report(y_test, rf_y_pred, labels = [0, 1]))
print()
print('Confusion Matrix: \n', confusion_matrix(y_test, rf_y_pred))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1035
           1       0.68      0.49      0.57       372

    accuracy                           0.80      1407
   macro avg       0.76      0.70      0.72      1407
weighted avg       0.79      0.80      0.79      1407


Confusion Matrix: 
 [[951  84]
 [191 181]]


### Resampling using SMOTEENN

In [14]:
sm = SMOTEENN()
x_resampled, y_resampled = sm.fit_resample(x, y)

In [15]:
xr_train, xr_test, yr_train, yr_test = train_test_split(x_resampled, y_resampled, test_size = 0.2)

In [16]:
smote_rf_model = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)
smote_rf_model.fit(xr_train, yr_train)
smote_y_pred_rf = smote_dt_model.predict(xr_test)

In [17]:
print(classification_report(yr_test, smote_y_pred_rf, labels = [0, 1]))
print()
print('Confusion Matrix: \n', confusion_matrix(yr_test, smote_y_pred_rf))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94       519
           1       0.95      0.96      0.96       649

    accuracy                           0.95      1168
   macro avg       0.95      0.95      0.95      1168
weighted avg       0.95      0.95      0.95      1168


Confusion Matrix: 
 [[487  32]
 [ 26 623]]


# Saving the most accurate model!

In [19]:
pickle.dump(smote_rf_model, open('model.sav', 'wb'))

In [20]:
load_model = pickle.load(open('model.sav', 'rb'))

In [24]:
model_acc = 100 * load_model.score(xr_test, yr_test)
print('Model accuracy:', round(model_acc, 2))

Model accuracy: 94.86
