In [1]:
pip install "numpy<2"

Note: you may need to restart the kernel to use updated packages.


In [2]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
import pandas as pd
import numpy as np



In [3]:
df=pd.read_csv('tel_churn.csv')

In [4]:
df=df.drop('Unnamed: 0',axis=1)
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,True,False,False,True,True,False,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.5,0,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,1,False,True,True,False,True,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.3,1840.75,0,False,True,True,False,True,False,...,True,False,False,False,False,False,False,True,False,False
4,0,70.7,151.65,1,True,False,True,False,True,False,...,False,False,True,False,True,False,False,False,False,False


In [5]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,True,False,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.50,False,True,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,False,True,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.30,1840.75,False,True,True,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,70.70,151.65,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,False,True,False,True,False,True,False,...,False,False,False,True,False,True,False,False,False,False
7028,0,103.20,7362.90,True,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,False,True
7029,0,29.60,346.45,True,False,False,True,False,True,True,...,False,False,True,False,True,False,False,False,False,False
7030,1,74.40,306.60,False,True,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,False


In [6]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

**Train Test Split**

In [6]:

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

## **Decision Tree Classifier**

In [7]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [8]:
model_dt.fit(x_train,y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,100
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [9]:
y_pred=model_dt.predict(x_test)
y_pred

array([1, 0, 0, ..., 0, 0, 0])

In [10]:
model_dt.score(x_test,y_test)

0.7768301350390903

In [11]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      1018
           1       0.62      0.50      0.55       389

    accuracy                           0.78      1407
   macro avg       0.72      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [12]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Apply SMOTE first
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(x_train, y_train)

# Then apply Tomek Links
tomek = TomekLinks()
X_resampled, y_resampled = tomek.fit_resample(X_smote, y_smote)

# Train your model
model_dt = DecisionTreeClassifier(
    criterion="gini",
    random_state=100,
    max_depth=6,
    min_samples_leaf=8
)
model_dt.fit(X_resampled, y_resampled)

# Predictions and evaluation
y_pred = model_dt.predict(x_test)
print("Model Accuracy:", model_dt.score(x_test, y_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=[0, 1]))

Model Accuracy: 0.7526652452025586

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      1018
           1       0.55      0.59      0.57       389

    accuracy                           0.75      1407
   macro avg       0.69      0.70      0.70      1407
weighted avg       0.76      0.75      0.76      1407



Now we can see quite better results, i.e. Accuracy: 92 %, and a very good recall, precision & f1 score for minority class.

## **Random Forest Classifier**

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [15]:
model_rf.fit(x_train,y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
y_pred=model_rf.predict(x_test)

In [17]:

model_rf.score(x_test,y_test)

0.7924662402274343

In [18]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87      1018
           1       0.70      0.44      0.54       389

    accuracy                           0.79      1407
   macro avg       0.76      0.68      0.70      1407
weighted avg       0.78      0.79      0.78      1407



In [7]:
from sklearn.ensemble import RandomForestClassifier

sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x, y)  # Changed to fit_resample

xr_train1, xr_test1, yr_train1, yr_test1 = train_test_split(X_resampled1, y_resampled1, test_size=0.2, random_state=42)

model_rf_smote = RandomForestClassifier(
    n_estimators=100, 
    criterion='gini', 
    random_state=100,
    max_depth=6, 
    min_samples_leaf=8
)

model_rf_smote.fit(xr_train1, yr_train1)

yr_predict1 = model_rf_smote.predict(xr_test1)
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

print("Model Accuracy:", model_score_r1)
print("\nClassification Report:")
print(metrics.classification_report(yr_test1, yr_predict1))

Model Accuracy: 0.9345794392523364

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       536
           1       0.93      0.96      0.94       641

    accuracy                           0.93      1177
   macro avg       0.94      0.93      0.93      1177
weighted avg       0.93      0.93      0.93      1177




**With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.**