# L1 Logistic Regression

In [1]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.utils import *

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

In [2]:
# Read the pre-split feature matrices and targets written by the preprocessing step.
XTrain, XTest = (
    load_csv('../data/processed/XTrain.csv'),
    load_csv('../data/processed/XTest.csv'),
)
YTrain, YTest = (
    load_csv('../data/processed/YTrain.csv'),
    load_csv('../data/processed/YTest.csv'),
)

# Show the available feature names as the cell output.
XTrain.columns.tolist()

['months_subscribed',
 'streaming_quality',
 'subscription_type',
 'monthly_plan_cost',
 'app_usage_hours',
 'last30d_usage_hours',
 'customer_rating',
 'promo_email_clicks',
 'num_profiles',
 'auto_renew',
 'support_tickets_last6m',
 'nps_score',
 'gender_Male',
 'payment_mode_Credit card (automatic)',
 'payment_mode_Electronic check',
 'payment_mode_Mailed check',
 'device_type_Mobile',
 'device_type_SmartTV',
 'total_revenue_log']

In [3]:
# Drop the columns judged to be highly collinear before fitting the L1 logistic
# model. errors='ignore' keeps this cell safe to re-run after the columns are gone.
COLLINEAR_COLS = ['promo_email_clicks', 'num_profiles', 'support_tickets_last6m']

XTrain = XTrain.drop(columns=COLLINEAR_COLS, errors='ignore')
XTest = XTest.drop(columns=COLLINEAR_COLS, errors='ignore')

XTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4930 entries, 0 to 4929
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   months_subscribed                     4930 non-null   float64
 1   streaming_quality                     4930 non-null   float64
 2   subscription_type                     4930 non-null   float64
 3   monthly_plan_cost                     4930 non-null   float64
 4   app_usage_hours                       4930 non-null   float64
 5   last30d_usage_hours                   4930 non-null   float64
 6   customer_rating                       4930 non-null   float64
 7   auto_renew                            4930 non-null   float64
 8   nps_score                             4930 non-null   float64
 9   gender_Male                           4930 non-null   float64
 10  payment_mode_Credit card (automatic)  4930 non-null   float64
 11  payment_mode_Elec

# Model Generation

In [4]:
# --- L1 logistic regression tuned with a cross-validated grid search ---

# Flatten the targets to 1-D arrays up front (avoids the DataConversionWarning
# raised when a column-shaped target is passed to scikit-learn).
Ytrain = YTrain.values.ravel()
Ytest = YTest.values.ravel()

# C is the inverse of the regularization strength: the smaller C is, the stronger
# the L1 penalty and the more weak-feature coefficients are forced to zero.
param_grid = {'C': np.logspace(-4, 0, 10)}  # 10 log-spaced candidates from 1e-4 to 1

# class_weight='balanced' is deliberate. The unweighted variant
#   LogisticRegression(penalty='l1', solver='liblinear', random_state=16)
# gave very poor results because of the class imbalance (recall of only 49%).
l1_estimator = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    random_state=16,
)

l1_tuner = GridSearchCV(
    estimator=l1_estimator,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split.
l1_tuner.fit(XTrain, Ytrain)

# --- Best model and evaluation ---
best_l1_model = l1_tuner.best_estimator_
best_C = l1_tuner.best_params_['C']
cv_auc = l1_tuner.best_score_

print("--- L1 Logistic Regression Results ---")
print(f"Optimal Regularization Strength (C): {best_C:.4f}")
print(f"Cross-Validated ROC AUC on Training Data: {cv_auc:.4f}")

# Score the held-out test split: column 1 of predict_proba is the positive class.
y_prob = best_l1_model.predict_proba(XTest)[:, 1]
test_roc_auc = roc_auc_score(Ytest, y_prob)
print(f"ROC AUC on Unseen Test Data: {test_roc_auc:.4f}\n")

# Hard predictions at the default 0.5 threshold for the classification report.
y_pred = best_l1_model.predict(XTest)
print(classification_report(Ytest, y_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
--- L1 Logistic Regression Results ---
Optimal Regularization Strength (C): 0.0464
Cross-Validated ROC AUC on Training Data: 0.8415
ROC AUC on Unseen Test Data: 0.8488

              precision    recall  f1-score   support

           0       0.92      0.69      0.79      1552
           1       0.49      0.84      0.62       561

    accuracy                           0.73      2113
   macro avg       0.71      0.77      0.70      2113
weighted avg       0.81      0.73      0.74      2113



In [5]:
# Threshold sweep (currently disabled). When enabled it prints one classification
# report per probability cut-off, starting at 0.30 and stepping by 0.05 towards 0.80.
# NOTE(review): y_prob holds the test-set probabilities from the previous cell, so a
# threshold picked from this sweep is tuned on the held-out data; consider running the
# sweep on a validation split (or cross-validated predictions) instead.
# # Testing the different Thresholds 
# thresholds = np.arange(0.30, 0.80, 0.05)
# for t in thresholds:
#     print(f"-----------------------------{t}--------------------------")
#     y_pred_custom = (y_prob >= t).astype(int)
#     print(classification_report(YTest, y_pred_custom))

### “Since churn is a cost-sensitive problem, we evaluated multiple probability thresholds between 0.30 and 0.80. While higher thresholds improved accuracy, they significantly reduced recall for churners. The F1-optimized thresholds range from 0.45 to 0.65, and the final choice is a business decision based on the cost function. We selected a threshold of 0.55, which maintains high churn recall (81%) while improving precision and operational efficiency. This provides the best balance between customer retention coverage and campaign cost.” 

In [6]:
# --- Elastic-Net logistic regression (saga solver), tuned by grid search ---

# 1-D target arrays (re-derived here so the cell does not rely on earlier cells).
Ytrain = YTrain.values.ravel()
Ytest = YTest.values.ravel()

# Search space for the Elastic-Net penalty.
param_grid_en = {
    'C': np.logspace(-4, 0, 5),          # inverse of regularization strength
    'l1_ratio': [0.25, 0.5, 0.75, 1.0],  # penalty mix: 0 = pure L2, 1 = pure L1
}

# penalty='elasticnet' is only available with solver='saga'.
en_estimator = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    class_weight='balanced',  # same class re-weighting as the L1 model
    max_iter=500,             # raised iteration cap for the saga solver
    random_state=16,
)

en_tuner = GridSearchCV(
    estimator=en_estimator,
    param_grid=param_grid_en,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split.
en_tuner.fit(XTrain, Ytrain)

# Report the winning configuration for comparison with the pure L1 search.
print("\n--- Elastic-Net Results ---")
print(f"Best Parameters: {en_tuner.best_params_}")
print(f"Best CV ROC AUC: {en_tuner.best_score_:.4f}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits

--- Elastic-Net Results ---
Best Parameters: {'C': np.float64(0.1), 'l1_ratio': 1.0}
Best CV ROC AUC: 0.8420


The improvement from the Elastic-Net model is minimal (0.0005 in cross-validated ROC AUC), and its best `l1_ratio` is 1.0 (pure L1), which suggests that the L1 penalty is the right choice for our dataset. The search gives the best C = 0.1, so we rerun the pure L1 model with that value and the selected threshold.

# Final Logistic Regression Model

In [7]:
# --- Final model: pure L1 logistic regression refit on the training split ---

# Take C from the Elastic-Net search instead of copying it by hand from the printed
# output, so the value cannot go stale if the grid or the data change.
# NOTE(review): the search ran with solver='saga' while this refit uses 'liblinear',
# so C is carried across solvers rather than re-validated for liblinear.
optimal_C = float(en_tuner.best_params_['C'])
if en_tuner.best_params_['l1_ratio'] != 1.0:
    # The C value is only directly meaningful for a pure-L1 refit when the search
    # itself selected pure L1.
    print(
        "WARNING: Elastic-Net search selected "
        f"l1_ratio={en_tuner.best_params_['l1_ratio']}, not pure L1; "
        "re-check the choice of C for the L1 model."
    )

# Decision threshold selected in the threshold analysis above.
CHURN_THRESHOLD = 0.55

final_l1_model = LogisticRegression(
    penalty='l1',
    C=optimal_C,
    solver='liblinear',
    class_weight='balanced',
    random_state=16,
)

# Flatten the targets to 1-D arrays (avoids the DataConversionWarning).
Ytest = YTest.values.ravel()
Ytrain = YTrain.values.ravel()

# Fit on the training split.
final_l1_model.fit(XTrain, Ytrain)

# Report on the test split at the selected threshold; column 1 of predict_proba is
# the probability of the positive (churn) class.
y_prob = final_l1_model.predict_proba(XTest)[:, 1]
y_pred_55 = (y_prob >= CHURN_THRESHOLD).astype(int)
print(classification_report(Ytest, y_pred_55))


              precision    recall  f1-score   support

           0       0.91      0.74      0.82      1552
           1       0.53      0.81      0.64       561

    accuracy                           0.76      2113
   macro avg       0.72      0.77      0.73      2113
weighted avg       0.81      0.76      0.77      2113



In [8]:
import joblib
import os

# --- CONFIGURATION ---
# Name of the ID column in the processed CSV (edit if your column is named differently).
CUSTOMER_ID_COL = 'customer_id'
TARGET_COL = 'subscription_canceled'
# Parameters used to rebuild the test partition. The rebuilt rows only line up with
# XTest if these match the split that produced the XTrain/XTest CSVs.
# NOTE(review): confirm these against the preprocessing step.
TEST_SIZE = 0.3
SPLIT_SEED = 16
# Decision threshold applied to the predicted churn probability.
CHURN_THRESHOLD = 0.55
OUTPUT_DIR = "../data/processed/l1_model"
# ---------------------

# --- 1. Load the original processed data (contains IDs) ---
df = pd.read_csv("../data/processed/cleaned_subscriptions_churn.csv")
print(f"Loaded data with {df.shape[0]} rows.")

# Separate target and features; the target is needed for the stratified split.
y = df[TARGET_COL].to_numpy()
X = df.drop(columns=[TARGET_COL])

# --- 2. Re-split to isolate the test-set rows ---
# No feature columns are dropped here: only the row membership/order of the test
# partition matters.
X_train_orig, X_test_orig, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED, stratify=y
)

# --- 3. Generate predictions ---
# Requires 'final_l1_model' and 'XTest' from the earlier cells.
y_prob = final_l1_model.predict_proba(XTest)[:, 1]
y_pred_final = (y_prob >= CHURN_THRESHOLD).astype(int)

# --- 4. Check that the rebuilt test partition lines up with XTest ---
# Predictions are attached to rows purely by position, so a different split would
# silently assign probabilities to the wrong customers.
if len(X_test_orig) != len(XTest):
    raise ValueError(
        f"Re-split test set has {len(X_test_orig)} rows but XTest has {len(XTest)}; "
        "TEST_SIZE / SPLIT_SEED do not match the original split."
    )
y_test_arr = np.asarray(y_test)
if y_test_arr.dtype.kind in "biuf" and not np.array_equal(y_test_arr, Ytest):
    print(
        "WARNING: labels of the re-split test set differ from YTest; "
        "rows may be misaligned with the predictions."
    )

# --- 5. Assemble the final data frame ---
# Reset the index so rows and prediction arrays align 1:1 by position.
X_test_orig = X_test_orig.reset_index(drop=True)

results_df = X_test_orig.copy()
results_df['True_Churn_Status'] = y_test            # true status
results_df['Predicted_Probability'] = y_prob        # model probability
results_df['Predicted_Churn_Flag'] = y_pred_final   # final decision

# --- 6. Organize columns (ID first) ---
if CUSTOMER_ID_COL in results_df.columns:
    cols = [CUSTOMER_ID_COL] + [c for c in results_df.columns if c != CUSTOMER_ID_COL]
    results_df = results_df[cols]
else:
    # Best-effort: still write the report, but make the missing ID visible.
    print(f"WARNING: ID column '{CUSTOMER_ID_COL}' not found; report is written without it.")

# --- 7. Save final artifacts ---
os.makedirs(OUTPUT_DIR, exist_ok=True)

model_path = os.path.join(OUTPUT_DIR, "final_l1_logistic_model.joblib")
joblib.dump(final_l1_model, model_path)
print("Saved Model:", model_path)

predictions_path = os.path.join(OUTPUT_DIR, "l1_churn_predictions_ACTIONABLE.csv")
results_df.to_csv(predictions_path, index=False)
print("Saved Predictions:", predictions_path)


# Print summary
print("\n--- Delivery Complete ---")
print(f"Total Customers in Report: {len(results_df)}")
print(f"Customers Flagged for Retention: {results_df['Predicted_Churn_Flag'].sum()}")

Loaded data with 7043 rows.
Saved Model: ../data/processed/l1_model/final_l1_logistic_model.joblib
Saved Predictions: ../data/processed/l1_model/l1_churn_predictions_ACTIONABLE.csv

--- Delivery Complete ---
Total Customers in Report: 2113
Customers Flagged for Retention: 857


In [9]:

# Coefficients are labelled with the columns of the frame the model was fitted on
# (XTrain), so names and coefficients are guaranteed to be in the same order.
feature_list = XTrain.columns.tolist()

# One row per feature, sorted from the most positive to the most negative coefficient.
coefficients_df = pd.DataFrame({
    'Feature': feature_list,
    'Coefficient': final_l1_model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\n--- Final L1 Model Coefficients ---")
print(coefficients_df.to_markdown(index=False))

# Positive coefficients raise the predicted churn probability; take the 5 largest.
churn_drivers = coefficients_df[coefficients_df['Coefficient'] > 0].head(5)

# Negative coefficients lower it; take the 5 most negative, listed strongest first
# so both tables read in the same "most influential on top" order.
retention_drivers = (
    coefficients_df[coefficients_df['Coefficient'] < 0]
    .sort_values(by='Coefficient', ascending=True)
    .head(5)
)

print("\n--- Top 5 Churn Drivers ---")
print(churn_drivers.to_markdown(index=False))

print("\n--- Top 5 Retention Drivers ---")
print(retention_drivers.to_markdown(index=False))



--- Final L1 Model Coefficients ---
| Feature                              |   Coefficient |
|:-------------------------------------|--------------:|
| months_subscribed                    |    2.47144    |
| payment_mode_Mailed check            |    0.328278   |
| gender_Male                          |    0.321388   |
| nps_score                            |    0.038091   |
| customer_rating                      |    0          |
| subscription_type                    |    0          |
| monthly_plan_cost                    |    0          |
| app_usage_hours                      |    0          |
| total_revenue_log                    |    0          |
| payment_mode_Credit card (automatic) |    0          |
| last30d_usage_hours                  |    0          |
| device_type_SmartTV                  |   -0.00134841 |
| payment_mode_Electronic check        |   -0.00602115 |
| device_type_Mobile                   |   -0.0901755  |
| auto_renew                           |   -0.72239