# Tree-Based Model

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np

from src.utils import *

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report

In [None]:
# Data loading and X / y separation
# Read the treated churn dataset via the project helper.
df = load_csv('../data/processed/Treated_subscriptions_churn.csv')

# Target: the cancellation flag.
y = df["subscription_canceled"]

# Features: everything except the target and two further columns that are
# deliberately kept out of the model.
# NOTE(review): presumably these two are excluded because they leak the outcome - confirm.
X = df.drop(columns=["subscription_canceled", "is_active_last30d", "total_revenue"])

# Stratified 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=16,
)


## RANDOM FOREST MODEL

In [None]:

# 1. Grid Search for Random Forest
# Every combination of the values below is evaluated by GridSearchCV.
param_grid_rf = dict(
    n_estimators=[100, 300, 500],
    max_features=['sqrt', 'log2'],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=['balanced'],  # Critical for churn prediction
)

print("Starting Random Forest Tuning...")

# Candidates are ranked by 5-fold cross-validated ROC AUC, using all CPU cores.
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=16),
    param_grid=param_grid_rf,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(X_train, y_train)

# --- Final model and evaluation ---
# The search keeps the best configuration as a fitted estimator.
best_rf = rf_grid.best_estimator_

print("--- Random Forest Results ---")
print("\nOptimal RF Parameters: {}".format(rf_grid.best_params_))
print("Cross-Validated ROC AUC on Training Data: {:.4f}".format(rf_grid.best_score_))

# Score the hold-out set: column 1 of predict_proba is the positive-class probability.
test_probabilities = best_rf.predict_proba(X_test)
y_prob = test_probabilities[:, 1]
test_roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC on Unseen Test Data: {:.4f}\n".format(test_roc_auc))

# Hard labels from the estimator's own predict() feed the classification report.
y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# # Optimal RF Parameters: {'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
# # Cross-Validated ROC AUC on Training Data: 0.8374

In [None]:
# # THRESHOLDS TUNING
# rf_proba = best_rf.predict_proba(X_test)[:,1]
# thresholds = np.arange(0.2, 0.8, 0.05)

# for t in thresholds:
#     rf_pred = (rf_proba >= t).astype(int)
#     print("ROC AUC:", roc_auc_score(y_test, rf_proba), " @", t)
#     print(classification_report(y_test, rf_pred))

### “Since churn is a cost-sensitive problem, we evaluated multiple probability thresholds between 0.20 and 0.80. While higher thresholds improved accuracy, they significantly reduced recall for churners. The F1-optimized thresholds range from 0.35 to 0.55, and the final choice is a business decision based on the cost function. We selected a threshold of 0.38, which maintains high churn recall (85%) while achieving better precision and operational efficiency (73%). This provides the best balance between customer retention coverage and campaign cost.”

In [None]:

# 3. Extract Feature Importance
# Pair each feature name with the tuned forest's importance score and rank
# them from most to least important.
# Column names come from X_train (the frame the model was fitted on) rather
# than the global `X`, which is not guaranteed to still be the modelling frame
# when cells are re-run out of order.
importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\n--- Top 10 Random Forest Feature Importances ---")
print(importances.head(10).to_markdown(index=False))


In [None]:
import joblib
import os
import warnings

# Export cell: join the Random Forest's test-set predictions back onto the
# original, ID-bearing rows and persist both the fitted model and the table.
# Needs `best_rf` and `X_test` in memory from the modelling cells above.

# --- CONFIGURATION ---
# Replace 'customer_id' with the EXACT name of the ID column in your processed CSV
CUSTOMER_ID_COL = 'customer_id'
# Probability cut-off at or above which a customer is flagged as a likely churner.
# NOTE(review): the threshold-tuning cell evaluates candidates on X_test itself,
# so metrics quoted at this cut-off are optimistic - confirm on separate data.
CHURN_THRESHOLD = 0.38
# Folder that receives the model file and the prediction CSV.
OUTPUT_DIR = "../data/processed/tree_model"
# ---------------------

# --- 1. Load the Original Processed Data (Contains IDs) ---
# Dedicated names are used on purpose: rebinding df / X / y / y_train / y_test
# here would silently change what the later modelling cells operate on.
ids_df = pd.read_csv("../data/processed/cleaned_subscriptions_churn.csv")
print(f"Loaded data with {ids_df.shape[0]} rows.")

# Separate target and features; the target is only needed to stratify the split.
ids_y = ids_df['subscription_canceled'].to_numpy()
ids_X = ids_df.drop(columns=['subscription_canceled'])

# --- 2. Re-Split to Isolate the Test Set IDs ---
# Same parameters as the modelling split: test_size=0.3, random_state=16,
# stratified on the target. No columns are dropped because only the row
# selection matters here.
# NOTE(review): this assumes the cleaned and treated CSVs hold the same rows in
# the same order - confirm against the preprocessing step.
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    ids_X, ids_y, test_size=0.3, random_state=16, stratify=ids_y
)

# Fail early with a clear message instead of a pandas length error further down.
if len(X_test_orig) != len(X_test):
    raise ValueError(
        f"ID file yields {len(X_test_orig)} test rows but the model's test set "
        f"has {len(X_test)}; the two CSVs do not describe the same rows."
    )
# Equal length does not prove the same rows were picked, so compare row labels too.
if not X_test_orig.index.equals(X_test.index):
    warnings.warn(
        "Re-split row labels differ from X_test's; customer IDs may be paired "
        "with the wrong predictions. Check that both CSVs share the same row order."
    )

# --- 3. Generate Predictions ---
# Column 1 of predict_proba is the positive (churn) class probability.
y_prob = best_rf.predict_proba(X_test)[:, 1]
y_pred_final = (y_prob >= CHURN_THRESHOLD).astype(int)

# --- 4. Assemble the Final Data Frame ---
# reset_index returns a fresh 0..n-1 indexed frame, so the NumPy arrays below
# are attached purely by position.
results_df = X_test_orig.reset_index(drop=True)
results_df['True_Churn_Status'] = y_test_orig  # Add true status
results_df['Predicted_Probability'] = y_prob  # Add model probability
results_df['Predicted_Churn_Flag'] = y_pred_final  # Add final decision

# --- 5. Organize Columns (Put ID First) ---
# Moves customer_id to the first column for readability
if CUSTOMER_ID_COL in results_df.columns:
    cols = [CUSTOMER_ID_COL] + [c for c in results_df.columns if c != CUSTOMER_ID_COL]
    results_df = results_df[cols]

# Create directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Save Final Artifacts ---
model_path = os.path.join(OUTPUT_DIR, "best_rf_model.joblib")
joblib.dump(best_rf, model_path)
print("Saved Model:", model_path)

predictions_path = os.path.join(OUTPUT_DIR, "rf_churn_predictions_ACTIONABLE.csv")
results_df.to_csv(predictions_path, index=False)
print("Saved Predictions:", predictions_path)


# Print Summary
print(f"\n--- Delivery Complete ---")
print(f"Total Customers in Report: {len(results_df)}")
print(f"Customers Flagged for Retention: {results_df['Predicted_Churn_Flag'].sum()}")

## XGBOOST MODEL

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# 1. Hold out a validation slice of the TRAINING data for early stopping.
# Early stopping picks the number of boosting rounds by watching eval_set; if
# that set is the test set, the test labels steer training and the reported
# test AUC is no longer an unbiased estimate. The test set is therefore kept
# untouched until the final evaluation below.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=16, stratify=y_train
)

# 2. Calculate the imbalance ratio (negatives / positives) for scale_pos_weight.
# Wrapping in a Series makes this work whether y_train is a pandas Series or a
# NumPy array (other cells in this notebook may rebind it to an array).
counts = pd.Series(np.asarray(y_fit)).value_counts()
ratio = counts[0] / counts[1]

# 3. Define the base model
# n_estimators is set high in the search space because early stopping
# decides the effective number of trees.
xgb_model = XGBClassifier(
    tree_method='hist',
    eval_metric='auc',
    scale_pos_weight=ratio,
    early_stopping_rounds=50,
    random_state=16
)

# 4. Define the Search Space
# We focus on min_child_weight and gamma to control the imbalance sensitivity
param_grid = {
    'n_estimators': [2000],              # Early stopping handles the actual limit
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6],           # Keeping it shallow prevents overfitting imbalance
    'min_child_weight': [1, 5, 10],      # Higher values prevent leaf nodes on outliers
    'gamma': [0, 0.1, 0.2, 0.4],         # Minimum loss reduction for a split
    'subsample': [0.7, 0.8, 0.9],        # Row sampling
    'colsample_bytree': [0.7, 0.8, 0.9], # Feature sampling
    'reg_alpha': [0, 0.1, 1],            # L1 regularization
    'reg_lambda': [1, 5, 10]             # L2 regularization
}

# 5. Execute Randomized Search: 50 sampled configurations, 3-fold CV, ROC AUC.
xgb_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=16
)

# 6. Fit with Early Stopping
# eval_set is forwarded to every XGBClassifier.fit call made by the search;
# it is the validation slice, which is disjoint from both X_fit and X_test.
xgb_search.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False
)


# 7. Final Evaluation on the untouched test set
best_xgb = xgb_search.best_estimator_
y_prob_xgb = best_xgb.predict_proba(X_test)[:, 1]
final_auc = roc_auc_score(y_test, y_prob_xgb)

print(f"\nOptimal XGB Parameters: {xgb_search.best_params_}")
print(f"Cross-Validated ROC AUC on Training Data: {xgb_search.best_score_:.4f}")

print(f"XGBoost Test ROC AUC: {final_auc:.4f}")
# Default 0.5 cut-off for the classification report.
xgb_pred = (y_prob_xgb >= 0.5).astype(int)
print("ROC AUC:", final_auc)
print(classification_report(y_test, xgb_pred))

In [None]:

# # THRESHOLDS TUNING

# y_prob_xgb = best_xgb.predict_proba(X_test)[:, 1]
# thresholds = np.arange(0.3, 0.8, 0.05)

# for t in thresholds:
#     xgb_pred = (y_prob_xgb >= t).astype(int)
#     print("ROC AUC:", roc_auc_score(y_test, y_prob_xgb)," @ ", t)
#     print(classification_report(y_test, xgb_pred))

### “Since churn is a cost-sensitive problem, we evaluated multiple probability thresholds between 0.30 and 0.80. While higher thresholds improved accuracy, they significantly reduced recall for churners. The F1-optimized thresholds range from 0.45 to 0.65, and the final choice is a business decision based on the cost function. We selected a threshold of 0.55, which maintains high churn recall (88%) while improving precision (49%) and operational efficiency (72%). This provides the best balance between customer retention coverage and campaign cost.”

In [None]:

# 3. Extract Feature Importance
# Pair each feature name with the tuned XGBoost model's importance score and
# rank them from most to least important.
# Column names come from X_train (the frame the model was fitted on). The
# global `X` cannot be used here: by the time this cell runs it may have been
# rebound to a frame with a different set of columns, which makes the two
# arrays differ in length and the DataFrame constructor raise.
importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_xgb.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\n--- Top 10 XGBoost Feature Importances ---")
print(importances.head(10).to_markdown(index=False))


In [None]:
import joblib
import os
import warnings

# Export cell: join the XGBoost model's test-set predictions back onto the
# original, ID-bearing rows and persist both the fitted model and the table.
# Needs `best_xgb` and `X_test` in memory from the modelling cells above.

# --- CONFIGURATION ---
# Replace 'customer_id' with the EXACT name of the ID column in your processed CSV
CUSTOMER_ID_COL = 'customer_id'
# Probability cut-off at or above which a customer is flagged as a likely churner.
# NOTE(review): the markdown write-up for this model cites a selected threshold
# of 0.55 while the code applies 0.46 - confirm which one is intended.
# NOTE(review): the threshold-tuning cell evaluates candidates on X_test itself,
# so metrics quoted at this cut-off are optimistic - confirm on separate data.
CHURN_THRESHOLD = 0.46
# Folder that receives the model file and the prediction CSV.
OUTPUT_DIR = "../data/processed/tree_model"
# ---------------------

# --- 1. Load the Original Processed Data (Contains IDs) ---
# Dedicated names are used on purpose: rebinding df / X / y / y_train / y_test
# here would silently change what the modelling cells operate on when re-run.
ids_df = pd.read_csv("../data/processed/cleaned_subscriptions_churn.csv")
print(f"Loaded data with {ids_df.shape[0]} rows.")

# Separate target and features; the target is only needed to stratify the split.
ids_y = ids_df['subscription_canceled'].to_numpy()
ids_X = ids_df.drop(columns=['subscription_canceled'])

# --- 2. Re-Split to Isolate the Test Set IDs ---
# Same parameters as the modelling split: test_size=0.3, random_state=16,
# stratified on the target. No columns are dropped because only the row
# selection matters here.
# NOTE(review): this assumes the cleaned and treated CSVs hold the same rows in
# the same order - confirm against the preprocessing step.
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    ids_X, ids_y, test_size=0.3, random_state=16, stratify=ids_y
)

# Fail early with a clear message instead of a pandas length error further down.
if len(X_test_orig) != len(X_test):
    raise ValueError(
        f"ID file yields {len(X_test_orig)} test rows but the model's test set "
        f"has {len(X_test)}; the two CSVs do not describe the same rows."
    )
# Equal length does not prove the same rows were picked, so compare row labels too.
if not X_test_orig.index.equals(X_test.index):
    warnings.warn(
        "Re-split row labels differ from X_test's; customer IDs may be paired "
        "with the wrong predictions. Check that both CSVs share the same row order."
    )

# --- 3. Generate Predictions ---
# Column 1 of predict_proba is the positive (churn) class probability.
y_prob = best_xgb.predict_proba(X_test)[:, 1]
y_pred_final = (y_prob >= CHURN_THRESHOLD).astype(int)

# --- 4. Assemble the Final Data Frame ---
# reset_index returns a fresh 0..n-1 indexed frame, so the NumPy arrays below
# are attached purely by position.
results_df = X_test_orig.reset_index(drop=True)
results_df['True_Churn_Status'] = y_test_orig  # Add true status
results_df['Predicted_Probability'] = y_prob  # Add model probability
results_df['Predicted_Churn_Flag'] = y_pred_final  # Add final decision

# --- 5. Organize Columns (Put ID First) ---
# Moves customer_id to the first column for readability
if CUSTOMER_ID_COL in results_df.columns:
    cols = [CUSTOMER_ID_COL] + [c for c in results_df.columns if c != CUSTOMER_ID_COL]
    results_df = results_df[cols]

# Create directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Save Final Artifacts ---
model_path = os.path.join(OUTPUT_DIR, "best_xgb_model.joblib")
joblib.dump(best_xgb, model_path)
print("Saved Model:", model_path)

predictions_path = os.path.join(OUTPUT_DIR, "xgb_churn_predictions_ACTIONABLE.csv")
results_df.to_csv(predictions_path, index=False)
print("Saved Predictions:", predictions_path)


# Print Summary
print(f"\n--- Delivery Complete ---")
print(f"Total Customers in Report: {len(results_df)}")
print(f"Customers Flagged for Retention: {results_df['Predicted_Churn_Flag'].sum()}")