In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix
)
import joblib


In [5]:
#  Load the processed dataset
df = pd.read_csv("data/processed_churn_data.csv")

X = df.drop("Churn", axis=1)
y = df["Churn"]

In [6]:
#  Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:
# Hyperparameter tuning for Random Forest
rf_params = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5]
}

In [8]:
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=3,
    n_jobs=-1,
    verbose=1
)

rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [None, 10, ...], 'min_samples_split': [2, 5], 'n_estimators': [100, 200]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [9]:
print("✅ Best Random Forest Parameters:", rf_grid.best_params_)

# 🎯 Evaluate tuned Random Forest
y_pred_rf = rf_grid.predict(X_test)

print("\n🌳 Tuned Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

✅ Best Random Forest Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}

🌳 Tuned Random Forest Results:
Accuracy: 0.8005677785663591
F1 Score: 0.5799701046337817

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.52      0.58       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Confusion Matrix:
 [[934 101]
 [180 194]]


In [10]:
# Re-train Logistic Regression (with higher max_iter)
log_model = LogisticRegression(max_iter=2000, solver="lbfgs")
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
print("\n🔗 Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("F1 Score:", f1_score(y_test, y_pred_log))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))


🔗 Logistic Regression Results:
Accuracy: 0.8055358410220014
F1 Score: 0.6028985507246377

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.90      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Confusion Matrix:
 [[927 108]
 [166 208]]


In [14]:
#  Save the best Random Forest model
joblib.dump(rf_grid.best_estimator_, "churn_rf_model.pkl")

#  Save the Logistic Regression model
joblib.dump(log_model, "churn_log_model.pkl")

# Save the column names (needed for prediction later)
columns = list(X.columns)
joblib.dump(columns, "model_columns.pkl")

print("\n✅ Models and column names saved successfully.")


✅ Models and column names saved successfully.
