In [2]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split


from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [3]:
df=pd.read_csv("../data/cleaned/final_cleaned_file.csv")

In [3]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [4]:
df["Churn"]=df['Churn'].map({'No':0, 'Yes':1})

In [5]:
categorial_columns=df.select_dtypes('object').columns

In [6]:
df=pd.get_dummies(df,columns=categorial_columns,drop_first=True)

In [7]:
features = df.drop(columns ='Churn')
target = df['Churn']

In [29]:
features.shape

(7043, 30)

In [30]:
target.shape

(7043,)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20,random_state=0, stratify=target
)

In [9]:
features.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [9]:
import pickle

In [10]:
with open("model_features.pkl", "wb") as f:
    pickle.dump(X_train.columns.tolist(), f)

In [32]:
scaler = StandardScaler()

scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

KNN

In [33]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)

In [34]:
knn.fit(X_train_scaled, y_train)

In [35]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
pred_knn = knn.predict(X_test_scaled)

In [36]:
knn.score(X_test_scaled ,y_test)

0.7466288147622427

In [37]:
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix, f1_score

In [38]:
print(classification_report(y_pred = pred_knn, y_true = y_test))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1035
           1       0.53      0.48      0.50       374

    accuracy                           0.75      1409
   macro avg       0.67      0.66      0.66      1409
weighted avg       0.74      0.75      0.74      1409



In [39]:
from sklearn.model_selection import train_test_split, GridSearchCV


In [40]:
param_grid = {
    "n_neighbors": [3, 5, 7, 11, 15],
    "weights": ["uniform", "distance"],
    "metric": ["minkowski", "manhattan"],
    "p": [1, 2]  # p=1 → Manhattan, p=2 → Euclidean
}

knn_model = KNeighborsClassifier()

grid_search_knn = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=3,
    scoring="f1",
    n_jobs=-1
)

grid_search_knn.fit(X_train_scaled, y_train)
print("Best Parameters:", grid_search_knn.best_params_)

best_knn = grid_search_knn.best_estimator_
pred_best_knn = best_knn.predict(X_test_scaled)
print("Best KNN Report:\n", classification_report(y_test, pred_best_knn))

Best Parameters: {'metric': 'minkowski', 'n_neighbors': 15, 'p': 1, 'weights': 'uniform'}
Best KNN Report:
               precision    recall  f1-score   support

           0       0.84      0.86      0.85      1035
           1       0.59      0.55      0.57       374

    accuracy                           0.78      1409
   macro avg       0.71      0.71      0.71      1409
weighted avg       0.77      0.78      0.78      1409



In [41]:
proba_knn = knn.predict_proba(X_test_scaled)[:,1]
pred_knn_thresh = (proba_knn >= 0.3).astype(int)  # lower threshold
print(classification_report(y_test, pred_knn_thresh))

              precision    recall  f1-score   support

           0       0.88      0.70      0.78      1035
           1       0.48      0.74      0.58       374

    accuracy                           0.71      1409
   macro avg       0.68      0.72      0.68      1409
weighted avg       0.77      0.71      0.73      1409



logistic regression

In [43]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [44]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=1000, random_state=0)
log_reg.fit(X_train_scaled, y_train)


In [45]:
pred_log = log_reg.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred_log, y_test))
print("RMSE", mean_squared_error(pred_log, y_test))
print("R2 score", log_reg.score(X_test_scaled, y_test))

MAE 0.20014194464158977
RMSE 0.20014194464158977
R2 score 0.7998580553584103


In [46]:
print(classification_report(y_pred = pred_log, y_true = y_test))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.64      0.55      0.59       374

    accuracy                           0.80      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409



In [47]:
proba_log = log_reg.predict_proba(X_test_scaled)[:,1]
pred_log_thresh = (proba_log >= 0.4).astype(int)  # lower threshold
print(classification_report(y_test, pred_log_thresh))


              precision    recall  f1-score   support

           0       0.88      0.83      0.86      1035
           1       0.60      0.68      0.64       374

    accuracy                           0.79      1409
   macro avg       0.74      0.76      0.75      1409
weighted avg       0.80      0.79      0.80      1409



RANDOM FOREST

In [50]:
forest = RandomForestClassifier(n_estimators=100,
                             max_depth=20, random_state=0)
forest.fit(X_train_scaled, y_train)

In [51]:
pred_forest = forest.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred_forest, y_test))
print("RMSE", mean_squared_error(pred_forest, y_test))
print("R2 score", forest.score(X_test_scaled, y_test))

MAE 0.20865862313697658
RMSE 0.20865862313697658
R2 score 0.7913413768630234


In [52]:
print(classification_report(y_pred = pred_forest, y_true = y_test))

              precision    recall  f1-score   support

           0       0.83      0.91      0.86      1035
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.74      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409



In [53]:
param_grid = {"n_estimators": [50, 100, 200,500],
        "max_leaf_nodes": [250, 500, 1000, None],
        "max_depth":[10,30,50]}
forest_class = RandomForestClassifier(n_jobs=-1, random_state=0)

In [54]:
grid_forest = GridSearchCV(estimator = forest_class, param_grid = param_grid, cv=5)
grid_forest.fit(X_train_scaled, y_train)

In [55]:
print(grid_forest.best_params_)
best_model_gf = grid_forest.best_estimator_
print(best_model_gf)

{'max_depth': 10, 'max_leaf_nodes': 250, 'n_estimators': 100}
RandomForestClassifier(max_depth=10, max_leaf_nodes=250, n_jobs=-1,
                       random_state=0)


In [56]:
gf_pred = best_model_gf.predict(X_test_scaled)

print("MAE", mean_absolute_error(gf_pred, y_test))
print("RMSE", mean_squared_error(gf_pred, y_test))
print("R2 score", best_model_gf.score(X_test_scaled, y_test))

MAE 0.1880766501064585
RMSE 0.1880766501064585
R2 score 0.8119233498935415


In [57]:
print(classification_report(y_pred = gf_pred, y_true = y_test))

              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1035
           1       0.69      0.53      0.60       374

    accuracy                           0.81      1409
   macro avg       0.77      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409



DECISION TREE

In [58]:
decision = DecisionTreeClassifier(max_depth=20, random_state=0)
decision.fit(X_train_scaled, y_train)

In [59]:
pred_dt = decision.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred_dt, y_test))
print("RMSE", mean_squared_error(pred_dt, y_test))
print("R2 score", decision.score(X_test_scaled, y_test))

MAE 0.2576295244854507
RMSE 0.2576295244854507
R2 score 0.7423704755145494


In [60]:
print(classification_report(y_pred=pred_dt,y_true=y_test))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      1035
           1       0.51      0.51      0.51       374

    accuracy                           0.74      1409
   macro avg       0.67      0.67      0.67      1409
weighted avg       0.74      0.74      0.74      1409



In [61]:
param_grid = {"max_depth": [None, 5, 10, 20,30,50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
        }
decision_class = DecisionTreeClassifier(random_state=0)

In [62]:
grid_decision = GridSearchCV(estimator = decision_class, param_grid = param_grid, cv=5)
grid_decision.fit(X_train_scaled, y_train)

In [63]:
print(grid_decision.best_params_)
best_model_gd = grid_decision.best_estimator_

{'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}


In [64]:
pred_decision = best_model_gd.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred_decision, y_test))
print("RMSE", mean_squared_error(pred_decision, y_test))
print("R2 score", best_model_gd.score(X_test_scaled, y_test))

MAE 0.21149751596877217
RMSE 0.21149751596877217
R2 score 0.7885024840312278


In [65]:
print(classification_report(y_pred=pred_decision,y_true=y_test))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1035
           1       0.62      0.53      0.57       374

    accuracy                           0.79      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.78      0.79      0.78      1409



Gradient boosting

In [66]:
gb_clf = GradientBoostingClassifier(max_depth=5,
                                   n_estimators=100, learning_rate=0.1,random_state=0
)

In [67]:
gb_clf.fit(X_train_scaled, y_train)

In [68]:
pred_gb = gb_clf.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred_gb, y_test))
print("RMSE", mean_squared_error(pred_gb, y_test))
print("R2 score", gb_clf.score(X_test_scaled, y_test))

MAE 0.198722498225692
RMSE 0.198722498225692
R2 score 0.801277501774308


In [69]:
print(classification_report(y_pred=pred_gb,y_true=y_test))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.53      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.73      1409
weighted avg       0.79      0.80      0.79      1409



In [70]:
param_grid = {
    "n_estimators": [100, 200],
    "learning_rate": [ 0.05, 0.1],
    "max_depth": [3, 5]
}

In [71]:

gb_model = GradientBoostingClassifier(random_state=0)
grid_gb = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5)
grid_gb.fit(X_train_scaled, y_train)


In [72]:
print(grid_gb.best_params_)
best_model_grad = grid_gb.best_estimator_

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}


In [73]:
pred_grad = best_model_gd.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred_grad, y_test))
print("RMSE", mean_squared_error(pred_grad, y_test))
print("R2 score", best_model_grad.score(X_test_scaled, y_test))

MAE 0.21149751596877217
RMSE 0.21149751596877217
R2 score 0.8005677785663591


In [74]:
print(classification_report(y_pred=pred_grad,y_true=y_test))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1035
           1       0.62      0.53      0.57       374

    accuracy                           0.79      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.78      0.79      0.78      1409



In [75]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(random_state=0)

param_dist = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 5, 10]
}

random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=10,   # only try 10 random combos
    cv=3,
   
    random_state=0,
    n_jobs=-1
)

random_search.fit(X_train_scaled, y_train)
print("Best Parameters:", random_search.best_params_)

Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 3, 'learning_rate': 0.05}


In [76]:
best_model_random = random_search.best_estimator_

In [77]:
pred_random = best_model_random.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred_random, y_test))
print("RMSE", mean_squared_error(pred_random, y_test))
print("R2 score", best_model_random.score(X_test_scaled, y_test))

MAE 0.1994322214336409
RMSE 0.1994322214336409
R2 score 0.8005677785663591


In [78]:
print(classification_report(y_pred=pred_random,y_true=y_test))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.66      0.51      0.57       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [79]:
#!pip install xgboost

In [80]:
import xgboost

XGBOOST

In [81]:
from xgboost import XGBClassifier


In [82]:
xgb_clf = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=0,
    use_label_encoder=False,
    eval_metric="logloss"   # avoids warnings
)

xgb_clf.fit(X_train_scaled, y_train)




Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [83]:
pred_xgb = xgb_clf.predict(X_test_scaled)
print("MAE ", mean_absolute_error(y_test, pred_xgb))
print("RMSE ", mean_squared_error(y_test, pred_xgb))
print("R2 score ", r2_score(y_test, pred_xgb))

MAE  0.21078779276082327
RMSE  0.21078779276082327
R2 score  -0.08107416879795393


In [84]:
print(classification_report(y_pred=pred_xgb,y_true=y_test))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409



In [85]:
param_dist = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

In [86]:
xgb_random_search = RandomizedSearchCV(
    estimator=XGBClassifier(
        random_state=0,
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    param_distributions=param_dist,
    n_iter=10,   # only 10 random combos
    cv=3,
    
    n_jobs=-1
)

xgb_random_search.fit(X_train_scaled, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [87]:
print(xgb_random_search.best_params_)
best_model_xgb_random = xgb_random_search.best_estimator_

{'subsample': 0.8, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 1.0}


In [88]:
pred_random_xgb = best_model_xgb_random.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred_random_xgb, y_test))
print("RMSE", mean_squared_error(pred_random_xgb, y_test))
print("R2 score", best_model_xgb_random.score(X_test_scaled, y_test))

MAE 0.1980127750177431
RMSE 0.1980127750177431
R2 score 0.8019872249822569


In [89]:
print(classification_report(y_pred=pred_random_xgb,y_true=y_test))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.66      0.51      0.58       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



ADA BOOST 

In [90]:

ada_clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),  # use 'estimator' instead of 'base_estimator'
    n_estimators=100,
    learning_rate=0.1,
    random_state=0
)

ada_clf.fit(X_train_scaled, y_train)


In [91]:
pred_ada = ada_clf.predict(X_test_scaled)
print("MAE ", mean_absolute_error(y_test, pred_ada))
print("RMSE ", mean_squared_error(y_test, pred_ada))
print("R2 score ", r2_score(y_test, pred_ada))

MAE  0.18949609652235627
RMSE  0.18949609652235627
R2 score  0.02812524219173851


In [92]:
print(classification_report(y_pred=pred_ada,y_true=y_test))

              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1035
           1       0.69      0.53      0.60       374

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [93]:
param_grid = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.05, 0.1, 0.5],
    "estimator__max_depth": [1, 2, 3, 5]   # updated key
}


In [94]:
ada_model_grid = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=0
)


In [95]:
grid_search_ada = GridSearchCV(
    estimator=ada_model_grid,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1
)
grid_search_ada.fit(X_train_scaled, y_train)


In [96]:
print(grid_search_ada.best_params_)
best_model_ada_grid = grid_search_ada.best_estimator_

{'estimator__max_depth': 2, 'learning_rate': 0.5, 'n_estimators': 200}


In [97]:
pred_random_ada = best_model_xgb_random.predict(X_test_scaled)

print("MAE", mean_absolute_error(pred_ada, y_test))
print("RMSE", mean_squared_error(pred_ada, y_test))
print("R2 score", best_model_xgb_random.score(X_test_scaled, y_test))

MAE 0.18949609652235627
RMSE 0.18949609652235627
R2 score 0.8019872249822569


In [98]:
print(classification_report(y_pred=pred_random_ada,y_true=y_test))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.66      0.51      0.58       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [99]:
from sklearn.utils import resample

SMOTE

In [100]:
from imblearn.over_sampling import SMOTE

In [101]:
sm = SMOTE(random_state = 0)
X_train_sm,y_train_sm = sm.fit_resample(X_train_scaled,y_train)

In [102]:



adaboost_clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=200,
    learning_rate=0.1,
    random_state=0
)


In [103]:

adaboost_clf.fit(X_train_sm, y_train_sm)

In [104]:
pred_smote = adaboost_clf.predict(X_test_scaled)
print(classification_report(y_pred = pred_smote, y_true = y_test))

              precision    recall  f1-score   support

           0       0.89      0.79      0.84      1035
           1       0.56      0.74      0.64       374

    accuracy                           0.78      1409
   macro avg       0.73      0.76      0.74      1409
weighted avg       0.80      0.78      0.78      1409



PREDICT_PROBA

In [105]:
ada_clf.predict_proba(X_test_scaled)[:,1]

array([0.11920292, 0.41089684, 0.75755099, ..., 0.11920292, 0.16569715,
       0.76701862])

In [106]:
import pickle
adaboost_clf

In [107]:
pickle.dump(adaboost_clf, open("model.pkl", "wb"))