In [2]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4


In [4]:
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [5]:
print("pandas", pd.__version__)
print("sklearn", sklearn.__version__)

pandas 2.2.3
sklearn 1.6.1


In [6]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [7]:
df=df.drop('Unnamed: 0',axis=1)

In [8]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [9]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

TRAIN TEST SPLIT

In [10]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

Using Decision Tree Classifier

In [11]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [12]:
model_dt.fit(x_train,y_train)

In [13]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
model_dt.score(x_test,y_test)

0.7853589196872779

In [15]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1028
           1       0.65      0.45      0.53       379

    accuracy                           0.79      1407
   macro avg       0.73      0.68      0.70      1407
weighted avg       0.77      0.79      0.77      1407



We can can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [16]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x, y)



In [17]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [18]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [19]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9378774805867127
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       504
           1       0.95      0.94      0.94       655

    accuracy                           0.94      1159
   macro avg       0.94      0.94      0.94      1159
weighted avg       0.94      0.94      0.94      1159



In [20]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[471  33]
 [ 39 616]]


Trying other classifiers

Using Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [23]:
model_rf.fit(x_train,y_train)


In [24]:
y_pred=model_rf.predict(x_test)

In [25]:
model_rf.score(x_test,y_test)

0.7910447761194029

In [26]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1028
           1       0.67      0.44      0.53       379

    accuracy                           0.79      1407
   macro avg       0.74      0.68      0.70      1407
weighted avg       0.78      0.79      0.78      1407



In [27]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x, y)



In [28]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [29]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [30]:
model_rf_smote.fit(xr_train1,yr_train1)

In [31]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [32]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [33]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9444444444444444
              precision    recall  f1-score   support

           0       0.96      0.91      0.94       536
           1       0.93      0.97      0.95       634

    accuracy                           0.94      1170
   macro avg       0.95      0.94      0.94      1170
weighted avg       0.95      0.94      0.94      1170



In [34]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[490  46]
 [ 19 615]]


PCA: 

In [35]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [36]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [37]:
model.fit(xr_train_pca,yr_train1)

In [38]:
yr_predict_pca = model.predict(xr_test_pca)

In [39]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [40]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.7273504273504273
              precision    recall  f1-score   support

           0       0.74      0.63      0.68       536
           1       0.72      0.81      0.76       634

    accuracy                           0.73      1170
   macro avg       0.73      0.72      0.72      1170
weighted avg       0.73      0.73      0.72      1170



 finalise the model which was created by RF Classifier

In [41]:
import pickle

In [42]:
filename = 'model.sav'

In [43]:
pickle.dump(model_rf_smote, open(filename, 'wb'))


In [44]:
expected_columns = xr_test1.columns.tolist()  # Or X_train.columns
pickle.dump(expected_columns, open('expected_columns.sav', 'wb'))

In [45]:
load_model = pickle.load(open(filename, 'rb'))
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [46]:
model_score_r1

0.9444444444444444