### Importing Libraries

In [3]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

#### Reading csv

In [7]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29,29,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56,1889,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53,108,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42,1840,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70,151,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [4]:
df=df.drop('Unnamed: 0',axis=1)

In [5]:
# Creating X & Y variables
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29,29,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56,1889,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53,108,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42,1840,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70,151,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84,1990,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103,7362,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29,346,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74,306,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [6]:
# Creating X & Y variables
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

##### Train Test Split

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

#### Decision Tree Classifier

In [9]:
model_dt = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth = 6, min_samples_leaf = 8)

In [10]:
model_dt.fit(x_train,y_train)

In [13]:
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

In [14]:
model_dt.score(x_test,y_test)

0.7882018479033405

In [17]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1039
           1       0.63      0.47      0.54       368

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.78      0.79      0.78      1407



###### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

###### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

###### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [19]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x, y)

In [20]:
xr_train, xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size = 0.2)

In [21]:
model_dt_smote = DecisionTreeClassifier(criterion = "gini",random_state = 100, max_depth = 6, min_samples_leaf = 8)

In [22]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.924190800681431
              precision    recall  f1-score   support

           0       0.94      0.89      0.92       554
           1       0.91      0.95      0.93       620

    accuracy                           0.92      1174
   macro avg       0.93      0.92      0.92      1174
weighted avg       0.93      0.92      0.92      1174



In [23]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[494  60]
 [ 29 591]]


###### Now we can see quite better results, i.e. Accuracy: 92 %, and a very good recall, precision & f1 score for minority class.

###### Let's try with some other classifier.

#### Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
model_rf = RandomForestClassifier(n_estimators = 100, criterion='gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)

In [35]:
model_rf.fit(x_train,y_train)

In [36]:
y_pred = model_rf.predict(x_test)

In [37]:
model_rf.score(x_test,y_test)

0.7974413646055437

In [38]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1039
           1       0.67      0.45      0.54       368

    accuracy                           0.80      1407
   macro avg       0.75      0.68      0.70      1407
weighted avg       0.78      0.80      0.78      1407



In [39]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [40]:
xr_train1, xr_test1,  yr_train1, yr_test1 = train_test_split(X_resampled1, y_resampled1, test_size = 0.2)

In [41]:
model_rf_smote = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [43]:
model_rf_smote.fit(xr_train1, yr_train1)

In [44]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [45]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [48]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9273204903677759
              precision    recall  f1-score   support

           0       0.96      0.88      0.92       547
           1       0.90      0.97      0.93       595

    accuracy                           0.93      1142
   macro avg       0.93      0.93      0.93      1142
weighted avg       0.93      0.93      0.93      1142



In [30]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[478  40]
 [ 27 625]]


###### RF classifier seems to perform better than Decision Tree on this dataset.

#### Performing PCA

In [51]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [52]:
model = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)

In [53]:
model.fit(xr_train_pca, yr_train1)

In [54]:
yr_predict_pca = model.predict(xr_test_pca)

In [55]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [36]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.7239316239316239
              precision    recall  f1-score   support

           0       0.72      0.61      0.66       518
           1       0.72      0.81      0.77       652

    accuracy                           0.72      1170
   macro avg       0.72      0.71      0.71      1170
weighted avg       0.72      0.72      0.72      1170



##### PCA doesn't yield better results. Therefore, finalizing the model which was created by RF Classifier.

#### Pickling the model

In [56]:
import pickle

In [57]:
filename = 'model.sav'

In [58]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [59]:
load_model = pickle.load(open(filename, 'rb'))

In [60]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [61]:
model_score_r1

0.9273204903677759

##### Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.