In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score, classification_report

import joblib


In [4]:
df = pd.read_csv("churn_dataset_5000.csv")

df.head()


Unnamed: 0,customer_id,tenure_months,avg_session_duration,sessions_last_30_days,feature_usage_score,support_tickets_last_90_days,last_login_days_ago,monthly_spend_usd,payment_failures_last_6m,plan_type,region,is_annual_plan,churn
0,CUST_00001,39,9.99,1,0.02,2,6,34.54,4,basic,LATAM,1,1
1,CUST_00002,52,54.07,39,0.62,2,36,238.73,0,basic,APAC,0,1
2,CUST_00003,29,42.9,37,0.66,4,7,260.8,2,enterprise,LATAM,0,0
3,CUST_00004,15,56.31,47,0.31,2,4,428.07,2,free,EU,0,0
4,CUST_00005,43,36.59,10,0.32,9,43,367.51,0,pro,EU,0,1


In [5]:
X = df.drop(columns=["customer_id", "churn"])
y = df["churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [6]:
numeric_features = [
    "tenure_months",
    "avg_session_duration",
    "sessions_last_30_days",
    "feature_usage_score",
    "support_tickets_last_90_days",
    "last_login_days_ago",
    "monthly_spend_usd",
    "payment_failures_last_6m"
]

categorical_features = ["plan_type", "region"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)


In [7]:
from sklearn.model_selection import GridSearchCV

log_reg_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LogisticRegression(
        max_iter=10000,
        class_weight="balanced"
    ))
])

log_reg_param_grid = {
    "model__C": [0.01, 0.1, 1.0, 10.0],
    "model__penalty": ["l1", "l2"],
    "model__solver": ["liblinear"]
}

log_reg_grid = GridSearchCV(
    log_reg_pipeline,
    param_grid=log_reg_param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

log_reg_grid.fit(X_train, y_train)

print("Best Logistic Regression Params:")
print(log_reg_grid.best_params_)


Best Logistic Regression Params:
{'model__C': 0.01, 'model__penalty': 'l2', 'model__solver': 'liblinear'}


In [8]:
best_lr = log_reg_grid.best_estimator_

lr_probs = best_lr.predict_proba(X_test)[:, 1]
lr_auc = roc_auc_score(y_test, lr_probs)

print("Tuned Logistic Regression ROC-AUC:", round(lr_auc, 4))


Tuned Logistic Regression ROC-AUC: 0.6574


In [9]:
rf_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", RandomForestClassifier(
        random_state=42,
        class_weight="balanced"
    ))
])

rf_param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2]
}

rf_grid = GridSearchCV(
    rf_pipeline,
    param_grid=rf_param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

rf_grid.fit(X_train, y_train)

print("Best Random Forest Params:")
print(rf_grid.best_params_)


Best Random Forest Params:
{'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 200}


In [10]:
best_rf = rf_grid.best_estimator_

rf_probs = best_rf.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_probs)

print("Tuned Random Forest ROC-AUC:", round(rf_auc, 4))


Tuned Random Forest ROC-AUC: 0.658


In [11]:
print(classification_report(y_test, best_rf.predict(X_test)))


              precision    recall  f1-score   support

           0       0.69      0.68      0.68       617
           1       0.49      0.50      0.50       383

    accuracy                           0.61      1000
   macro avg       0.59      0.59      0.59      1000
weighted avg       0.61      0.61      0.61      1000



In [12]:
results = pd.DataFrame({
    "Model": ["Logistic Regression (Tuned)", "Random Forest (Tuned)"],
    "ROC_AUC": [lr_auc, rf_auc]
})

results


Unnamed: 0,Model,ROC_AUC
0,Logistic Regression (Tuned),0.657358
1,Random Forest (Tuned),0.657959


In [13]:
if lr_auc >= rf_auc:
    final_model = best_lr
    selected = "Logistic Regression"
else:
    final_model = best_rf
    selected = "Random Forest"

print("Final Selected Model:", selected)


Final Selected Model: Random Forest


In [14]:
import joblib

joblib.dump(final_model, "churn_model.pkl")
print("Final model saved as churn_model.pkl")


Final model saved as churn_model.pkl
