In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# %pip install catboost  # uncomment on Colab; %pip installs into the running kernel's environment

In [3]:
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

In [4]:
# Load the pre-cleaned dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='cleaned_dataset.csv')

In [5]:
# Remove 'service_failure_count' before modelling.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: the second execution is a no-op rather than a KeyError
# on the already-removed column.
# NOTE(review): errors='ignore' also stays silent if the column is missing on
# the first run (e.g. renamed upstream) — confirm that is acceptable.
df = df.drop(columns=['service_failure_count'], errors='ignore')

In [6]:
# The 'churn' column is the prediction target; every other column is a feature.
y = df.loc[:, 'churn']
X = df.drop('churn', axis=1)

In [7]:
# Candidate classifiers keyed by display name. Insertion order is the order in
# which they are cross-validated and reported. Every estimator that accepts a
# seed is pinned to 42 so repeated runs are comparable.
models = dict([
    ("Naive Bayes", GaussianNB()),
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("Linear SVM", LinearSVC(random_state=42)),
    # Tree depth is capped at 7 for the single tree and the forest.
    ("Decision Tree", DecisionTreeClassifier(max_depth=7, random_state=42)),
    ("Random Forest", RandomForestClassifier(max_depth=7, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    (
        "XGBoost",
        XGBClassifier(
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
        ),
    ),
    # verbose=0 silences CatBoost's per-iteration training log.
    ("CatBoost", CatBoostClassifier(verbose=0, random_state=42)),
])

In [8]:
# Shared 5-fold stratified splitter; shuffling with a fixed seed means every
# model is evaluated on the same folds.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Cross-validate every candidate model and collect mean/std of each metric.
#
# All metrics are computed from a single cross_validate call per model, so each
# fold is fitted once. Calling cross_val_score once per metric would refit the
# same model on the same folds four times. Because `cv` and the estimators are
# seeded, the scores are expected to match the per-metric approach.
scoring = ['accuracy', 'f1', 'recall', 'roc_auc']
cv_results = []

for name, model in tqdm(models.items(), desc="Cross-validating"):
    # Only the two linear models are wrapped with StandardScaler. The scaler
    # lives inside the pipeline, so it is re-fitted on each training fold.
    if name in ["Logistic Regression", "Linear SVM"]:
        pipeline = make_pipeline(StandardScaler(), model)
    else:
        pipeline = model

    # cross_validate returns one array per metric under the key 'test_<metric>'.
    scores = cross_validate(pipeline, X, y, cv=cv, scoring=scoring)
    accuracy_scores = scores['test_accuracy']
    f1_scores = scores['test_f1']
    recall_scores = scores['test_recall']
    roc_auc_scores = scores['test_roc_auc']

    # Accuracy/F1/recall are reported as percentages (2 decimals); ROC AUC is
    # kept on its native 0-1 scale (4 decimals).
    cv_results.append({
        "Model": name,
        "Accuracy Mean (%)": round(np.mean(accuracy_scores) * 100, 2),
        "Accuracy Std (%)": round(np.std(accuracy_scores) * 100, 2),
        "F1 Mean (%)": round(np.mean(f1_scores) * 100, 2),
        "F1 Std (%)": round(np.std(f1_scores) * 100, 2),
        "Recall Mean (%)": round(np.mean(recall_scores) * 100, 2),
        "Recall Std (%)": round(np.std(recall_scores) * 100, 2),
        "ROC AUC Mean": round(np.mean(roc_auc_scores), 4),
        "ROC AUC Std": round(np.std(roc_auc_scores), 4)
    })

Cross-validating:  88%|████████▊ | 7/8 [01:00<00:10, 10.84s/it]

In [None]:
# One row per model, one column per summary statistic; the bare name on the
# last line makes the notebook render the table.
cv_df = pd.DataFrame(data=cv_results)
cv_df

Unnamed: 0,Model,Accuracy Mean (%),Accuracy Std (%),F1 Mean (%),F1 Std (%),Recall Mean (%),Recall Std (%),ROC AUC Mean,ROC AUC Std
0,Naive Bayes,92.09,0.14,92.79,0.12,91.39,0.18,0.9435,0.0006
1,Logistic Regression,92.4,0.13,93.03,0.13,91.05,0.28,0.9554,0.0006
2,Linear SVM,91.96,0.12,92.59,0.12,90.23,0.26,0.9544,0.0006
3,Decision Tree,93.82,0.07,94.39,0.06,93.38,0.21,0.9646,0.001
4,Random Forest,93.75,0.09,94.34,0.08,93.51,0.21,0.9698,0.001
5,AdaBoost,93.37,0.13,93.99,0.12,93.21,0.22,0.9615,0.0015
6,XGBoost,94.23,0.08,94.79,0.08,94.22,0.25,0.9798,0.0007
7,CatBoost,94.31,0.11,94.86,0.11,94.19,0.29,0.9806,0.0006


In [None]:
# Write the comparison table to an Excel workbook, omitting the row index.
cv_df.to_excel(excel_writer='model_cv.xlsx', index=False)