### == Goal : To predict if an existing customer (not churned yet) is likely to churn or not ==

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN
import pickle as pk

In [6]:
df = pd.read_csv('tel_churn.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [7]:
# df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


Creating X and y variables

In [8]:
X = df.drop('Churn', axis=1)
y = df['Churn']


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

======= Use decision tree classifier =======

In [10]:
# Define the model
model_dt = DecisionTreeClassifier(criterion='gini', random_state=13, max_depth=6, min_samples_leaf=8)

In [11]:
# Fit model to data
model_dt.fit(X_train, y_train)

In [12]:
# Predictions
y_pred = model_dt.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 1])

In [13]:
# Evaluate the model
model_dt.score(X_test, y_test)  # ~78%

0.7569296375266524

In [14]:
print(classification_report(y_test, y_pred, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84      1014
           1       0.60      0.40      0.48       393

    accuracy                           0.76      1407
   macro avg       0.70      0.65      0.66      1407
weighted avg       0.74      0.76      0.74      1407



Always look at the minority class, here 1 (churners).
You can see that precision(0.61), recall(0.50) and f1-score(0.55) are low. That means the model isn't properly created. Why? -> Because the dataset was highly imbalanced!

In [15]:
print(confusion_matrix(y_test, y_pred))

[[909 105]
 [237 156]]


#### ==== Use SMOTE analysis, since data is highly imbalanced ====

SMOTEEN is a class for oversampling technique using SMOTE and cleaning using ENN

In [16]:
something = SMOTEENN()
X_resampled, y_resampled = something.fit_resample(X, y)

After upsampling, create your model again

In [17]:
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

In [18]:
model_smote_dt = DecisionTreeClassifier(criterion='gini', random_state=13, max_depth=6, min_samples_leaf=8)

In [19]:
model_smote_dt.fit(Xr_train, yr_train)

In [21]:
y_pred_smote_dt = model_smote_dt.predict(Xr_test)

In [22]:
print(classification_report(yr_test, y_pred_smote_dt, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87       428
           1       0.89      0.91      0.90       568

    accuracy                           0.89       996
   macro avg       0.89      0.89      0.89       996
weighted avg       0.89      0.89      0.89       996



After upsamping, the metrics for minority class have increased!

======= Use Random Forest Classifier =======

In [23]:
model_smote_rf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=13, max_depth=6, min_samples_leaf=8)
model_smote_rf.fit(Xr_train, yr_train)
y_pred_smote_rf = model_smote_rf.predict(Xr_test)

In [24]:
print(classification_report(yr_test, y_pred_smote_rf, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.91      0.84      0.88       428
           1       0.89      0.94      0.91       568

    accuracy                           0.90       996
   macro avg       0.90      0.89      0.89       996
weighted avg       0.90      0.90      0.90       996



Save model

In [30]:
filename = 'Customer_Churn_Analysis_RF_Model.sav'  # here, saving random forest model, not the decision tree one

In [31]:
pk.dump(model_smote_rf, open(filename, 'wb'))

In [32]:
load_model = pk.load(open(filename, 'rb'))

In [33]:
load_model.score(Xr_test, yr_test)

0.8975903614457831