In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [2]:
df=pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df = pd.read_csv('telco_data_dummies.csv')

In [4]:
df.head()


Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29,29,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56,1889,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53,108,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42,1840,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,70,151,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0


In [5]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29,29,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56,1889,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53,108,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42,1840,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70,151,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84,1990,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103,7362,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29,346,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74,306,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [6]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [8]:
# Decision Tree Classifier


In [9]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [10]:
model_dt.fit(x_train,y_train)

In [11]:
y_pred=model_dt.predict(x_test)
y_pred

array([1, 1, 0, ..., 1, 0, 0], dtype=int64)

In [12]:
model_dt.score(x_test,y_test)

0.7938877043354655

In [13]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1053
           1       0.62      0.46      0.53       354

    accuracy                           0.79      1407
   macro avg       0.73      0.68      0.70      1407
weighted avg       0.78      0.79      0.78      1407



In [14]:
print(confusion_matrix(y_test, y_pred))

[[953 100]
 [190 164]]


In [15]:
# SMOTE -> (Synthetic Minority Over-sampling Technique) – Upsampling 

In [16]:
#  ENN (Edited Nearest Neighbors) – Downsampling (Noise Removal)

In [17]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [18]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [19]:
model_dt_smotee=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [20]:
model_dt_smotee.fit(xr_train1 , yr_train1)

In [21]:
print("xr_test1 shape:", xr_test1.shape)

xr_test1 shape: (1161, 50)


In [22]:
print("xr_test1 shape:", y_test.shape)

xr_test1 shape: (1407,)


In [23]:
y_pred_smotee=model_dt_smotee.predict(xr_test1)

In [24]:
print(classification_report(yr_test1, y_pred_smotee, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       508
           1       0.94      0.95      0.95       653

    accuracy                           0.94      1161
   macro avg       0.94      0.94      0.94      1161
weighted avg       0.94      0.94      0.94      1161



In [25]:
print(confusion_matrix(yr_test1, y_pred_smotee))

[[469  39]
 [ 32 621]]


In [26]:
# Random Forest Classifier

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
model_rf=RandomForestClassifier(n_estimators=100,criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [29]:
model_rf.fit(x_train,y_train)

In [30]:
y_pred_rf=model_rf.predict(x_test)


In [31]:
print(classification_report(y_test, y_pred_rf, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1053
           1       0.63      0.43      0.51       354

    accuracy                           0.79      1407
   macro avg       0.73      0.67      0.69      1407
weighted avg       0.78      0.79      0.78      1407



In [32]:
model_rf_smotee=RandomForestClassifier(n_estimators=100,criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [33]:
model_rf_smotee.fit(xr_train1 , yr_train1)

In [34]:
# If y_test is a pandas Series, reset its index and align it with xr_test1
y_test = y_test.reset_index(drop=True)  # Reset index of y_test

# Align y_test with xr_test1 (based on number of rows in xr_test1)
y_test_filtered = y_test.iloc[:xr_test1.shape[0]]  # Slice y_test to match the number of rows in xr_test1

# Now that y_test_filtered is aligned with xr_test1, we can proceed with predictions
y_pred_smotee_rf = model_rf_smotee.predict(xr_test1)  # Predict using the model

# Generate the classification report using the aligned y_test
from sklearn.metrics import classification_report
print(classification_report(y_test_filtered, y_pred_smotee_rf, labels=[0, 1]))


              precision    recall  f1-score   support

           0       0.75      0.42      0.54       857
           1       0.27      0.61      0.38       304

    accuracy                           0.47      1161
   macro avg       0.51      0.51      0.46      1161
weighted avg       0.63      0.47      0.50      1161



In [35]:
y_pred_smotee_rf=model_rf_smotee.predict(xr_test1)

In [36]:
#print(classification_report(yr_test1, y_pred_smotee_rf, labels=[0,1]))

In [37]:
print("y_test shape:", y_test.shape)
print("y_pred_smotee_rf shape:", y_pred_smotee_rf.shape)


y_test shape: (1407,)
y_pred_smotee_rf shape: (1161,)


In [38]:
print(set(y_pred_smotee_rf))
print(set(y_test))


{0, 1}
{0, 1}


In [39]:
print("Unique values in y_test:", set(y_test))
print("Unique values in y_pred_smotee_rf:", set(y_pred_smotee_rf))


Unique values in y_test: {0, 1}
Unique values in y_pred_smotee_rf: {0, 1}


In [40]:
print("xr_test1 shape:", xr_test1.shape)  # Should be (1407, ...)


xr_test1 shape: (1161, 50)


In [41]:
print("y_test shape:", y_test.shape)

y_test shape: (1407,)


In [43]:
##saving model

In [44]:
import pickle

In [45]:
filename= 'model.sav'

In [46]:
pickle.dump(model_dt_smotee , open(filename , 'wb'))

In [47]:
load_model= pickle.load(open(filename,'rb'))

In [50]:
load_model.score(xr_test1,yr_test1)

0.9388458225667528