In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score,
    log_loss, matthews_corrcoef, balanced_accuracy_score
)
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import calibration_curve

# ================== File Paths ==================
input_path = r"C:\Users\Adity\OneDrive\Desktop\Customer_Data.csv"
output_path = r"C:\Users\Adity\OneDrive\Desktop\Predicted_Churn.csv"

In [13]:
# ================== Load Data ==================
df = pd.read_csv(input_path)
print("✅ Data loaded:")
print(df.head())

✅ Data loaded:
  Customer_ID  Gender  Age Married        State  Number_of_Referrals  \
0   19877-DEL    Male   35      No        Delhi                    7   
1   58353-MAH  Female   45     Yes  Maharashtra                   14   
2   25063-WES    Male   51      No  West Bengal                    4   
3   59787-KAR    Male   79      No    Karnataka                    3   
4   28544-TAM  Female   80      No   Tamil Nadu                    3   

   Tenure_in_Months Value_Deal Phone_Service Multiple_Lines  ...  \
0                27        NaN           Yes             No  ...   
1                13        NaN           Yes            Yes  ...   
2                35     Deal 5           Yes             No  ...   
3                21     Deal 4           Yes             No  ...   
4                 8        NaN           Yes             No  ...   

    Payment_Method Monthly_Charge Total_Charges Total_Refunds  \
0      Credit Card           65.6        593.30          0.00   
1      Credit

In [15]:
# ================== Preprocessing ==================
df = df.dropna(subset=['Customer_Status'])
df = df.drop(['Customer_ID', 'Churn_Category', 'Churn_Reason'], axis=1, errors='ignore')

columns_to_encode = [
    'Gender', 'Married', 'State', 'Value_Deal', 'Phone_Service', 'Multiple_Lines',
    'Internet_Service', 'Internet_Type', 'Online_Security', 'Online_Backup',
    'Device_Protection_Plan', 'Premium_Support', 'Streaming_TV', 'Streaming_Movies',
    'Streaming_Music', 'Unlimited_Data', 'Contract', 'Paperless_Billing',
    'Payment_Method'
]

label_encoders = {}
for col in columns_to_encode:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

df['Customer_Status'] = df['Customer_Status'].map({'Stayed': 0, 'Churned': 1})
df = df.dropna()

X = df.drop('Customer_Status', axis=1)
y = df['Customer_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# ================== MLflow Setup ==================
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Churn_Prediction")

# ================== Solution 1: Increase max_iter and use better solver ==================
# The most common solution is to increase max_iter and use a more robust solver

with mlflow.start_run(run_name="LogisticRegression_Classifier_Fixed") as run:
    
    # Option 1: Increase max_iter with liblinear solver (good for small datasets)
    # model = LogisticRegression(max_iter=5000, solver='liblinear', random_state=42)
    
    # Option 2: Use saga solver (good for large datasets and L1/L2 regularization)
    # model = LogisticRegression(max_iter=5000, solver='saga', random_state=42)
    
    # Option 3: Use newton-cg solver (good for small datasets, L2 only)
    # model = LogisticRegression(max_iter=5000, solver='newton-cg', random_state=42)
    
    # Option 4: Recommended - Use saga with increased iterations
    model = LogisticRegression(
        max_iter=5000,
        solver='saga',
        random_state=42,
        tol=1e-4,  # Tolerance for stopping criteria
        warm_start=False
    )
    
    # ================== Alternative: Scale the data first ==================
    # If convergence is still an issue, scale the features
    from sklearn.preprocessing import StandardScaler
    
    # Uncomment these lines if you want to scale the data:
    # scaler = StandardScaler()
    # X_train_scaled = scaler.fit_transform(X_train)
    # X_test_scaled = scaler.transform(X_test)
    # model.fit(X_train_scaled, y_train)
    
    # For now, let's use the original data with improved solver
    model.fit(X_train, y_train)

    # ================== Evaluate ==================
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of class 1 (churned)
    
    # Basic metrics
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    
    # Advanced metrics
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    avg_precision = average_precision_score(y_test, y_pred_proba)
    log_loss_score = log_loss(y_test, y_pred_proba)
    matthews_corr = matthews_corrcoef(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    
    # Confusion matrix components
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Additional derived metrics
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)  # Same as recall
    false_positive_rate = fp / (fp + tn)
    false_negative_rate = fn / (fn + tp)
    positive_predictive_value = tp / (tp + fp)  # Same as precision
    negative_predictive_value = tn / (tn + fn)
    
    # Print results
    print("\n✅ Logistic Regression Model Performance Metrics:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Log Loss: {log_loss_score:.4f}")
    print(f"Matthews Correlation: {matthews_corr:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"False Negative Rate: {false_negative_rate:.4f}")
    print(f"Negative Predictive Value: {negative_predictive_value:.4f}")
    
    print("\n✅ Confusion Matrix:")
    print(cm)
    print("\n✅ Classification Report:")
    print(classification_report(y_test, y_pred))

    # ================== Check Convergence Status ==================
    convergence_info = {
        'n_iter': model.n_iter_[0] if hasattr(model, 'n_iter_') else None,
        'converged': model.n_iter_[0] < model.max_iter if hasattr(model, 'n_iter_') else True,
        'max_iter': model.max_iter,
        'solver': model.solver,
        'tol': model.tol
    }
    
    print(f"\n📊 Convergence Info:")
    print(f"Solver: {convergence_info['solver']}")
    print(f"Iterations used: {convergence_info['n_iter']}")
    print(f"Max iterations: {convergence_info['max_iter']}")
    print(f"Converged: {convergence_info['converged']}")
    print(f"Tolerance: {convergence_info['tol']}")

    # ================== Create Visualizations ==================
    
    # 1. ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Logistic Regression - ROC Curve')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.savefig("lr_roc_curve.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 2. Precision-Recall Curve
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(recall_curve, precision_curve, color='blue', lw=2, 
             label=f'PR curve (AP = {avg_precision:.4f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Logistic Regression - Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.grid(True, alpha=0.3)
    plt.savefig("lr_precision_recall_curve.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 3. Confusion Matrix Heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Stayed', 'Churned'], 
                yticklabels=['Stayed', 'Churned'])
    plt.title('Logistic Regression - Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig("lr_confusion_matrix.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 4. Feature Coefficients Plot
    if hasattr(model, "coef_"):
        coef = model.coef_[0]
        feature_importance_df = pd.DataFrame({
            'feature': X.columns,
            'coefficient': coef,
            'abs_coefficient': np.abs(coef)
        }).sort_values('abs_coefficient', ascending=False)
        
        plt.figure(figsize=(12, 8))
        top_features = feature_importance_df.head(20)
        colors = ['red' if x < 0 else 'green' for x in top_features['coefficient']]
        plt.barh(range(len(top_features)), top_features['coefficient'], color=colors)
        plt.yticks(range(len(top_features)), top_features['feature'])
        plt.xlabel('Coefficient Value')
        plt.title('Top 20 Feature Coefficients (Logistic Regression)')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig("lr_coefficients.png", dpi=300, bbox_inches='tight')
        plt.close()
    
    # 5. Prediction Distribution
    plt.figure(figsize=(10, 6))
    plt.hist(y_pred_proba[y_test == 0], bins=50, alpha=0.7, label='Stayed', color='blue')
    plt.hist(y_pred_proba[y_test == 1], bins=50, alpha=0.7, label='Churned', color='red')
    plt.xlabel('Predicted Probability of Churn')
    plt.ylabel('Frequency')
    plt.title('Logistic Regression - Distribution of Predicted Probabilities')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig("lr_prediction_distribution.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 6. Calibration Plot
    fraction_of_positives, mean_predicted_value = calibration_curve(y_test, y_pred_proba, n_bins=10)
    plt.figure(figsize=(8, 6))
    plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="Logistic Regression")
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
    plt.xlabel('Mean Predicted Probability')
    plt.ylabel('Fraction of Positives')
    plt.title('Logistic Regression - Calibration Plot')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig("lr_calibration_plot.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 7. Coefficient Distribution
    if hasattr(model, "coef_"):
        plt.figure(figsize=(10, 6))
        plt.hist(coef, bins=20, alpha=0.7, color='purple', edgecolor='black')
        plt.xlabel('Coefficient Value')
        plt.ylabel('Frequency')
        plt.title('Distribution of Logistic Regression Coefficients')
        plt.grid(True, alpha=0.3)
        plt.savefig("lr_coef_distribution.png", dpi=300, bbox_inches='tight')
        plt.close()
    
    # 8. Odds Ratios Plot
    if hasattr(model, "coef_"):
        odds_ratios = np.exp(coef)
        odds_df = pd.DataFrame({
            'feature': X.columns,
            'odds_ratio': odds_ratios,
            'log_odds': coef
        }).sort_values('odds_ratio', ascending=False)
        
        plt.figure(figsize=(12, 8))
        top_odds = odds_df.head(20)
        colors = ['red' if x < 1 else 'green' for x in top_odds['odds_ratio']]
        plt.barh(range(len(top_odds)), top_odds['odds_ratio'], color=colors)
        plt.yticks(range(len(top_odds)), top_odds['feature'])
        plt.xlabel('Odds Ratio')
        plt.title('Top 20 Feature Odds Ratios (Logistic Regression)')
        plt.axvline(x=1, color='black', linestyle='--', alpha=0.7)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig("lr_odds_ratios.png", dpi=300, bbox_inches='tight')
        plt.close()
    
    # 9. Convergence Analysis Plot
    plt.figure(figsize=(8, 6))
    convergence_data = {
        'Solver': convergence_info['solver'],
        'Iterations Used': convergence_info['n_iter'],
        'Max Iterations': convergence_info['max_iter'],
        'Converged': 'Yes' if convergence_info['converged'] else 'No'
    }
    
    # Create a simple bar chart for convergence info
    plt.bar(['Iterations Used', 'Max Iterations'], 
            [convergence_info['n_iter'], convergence_info['max_iter']], 
            color=['green' if convergence_info['converged'] else 'red', 'gray'])
    plt.ylabel('Number of Iterations')
    plt.title(f'Convergence Analysis - {convergence_info["solver"]} Solver')
    plt.grid(True, alpha=0.3)
    plt.savefig("lr_convergence_analysis.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ================== Log to MLflow ==================
    
    # Log parameters
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("max_iter", model.max_iter)
    mlflow.log_param("random_state", 42)
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("n_features", X.shape[1])
    mlflow.log_param("n_samples", X.shape[0])
    mlflow.log_param("class_balance", f"{(y==0).sum()}:{(y==1).sum()}")
    mlflow.log_param("solver", model.solver)
    mlflow.log_param("penalty", model.penalty)
    mlflow.log_param("C", model.C)
    mlflow.log_param("tol", model.tol)
    
    # Log convergence info
    mlflow.log_param("n_iter_actual", convergence_info['n_iter'])
    mlflow.log_param("converged", convergence_info['converged'])
    mlflow.log_param("convergence_status", "Converged" if convergence_info['converged'] else "Did not converge")
    
    # Log basic metrics
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    
    # Log advanced metrics
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("average_precision", avg_precision)
    mlflow.log_metric("log_loss", log_loss_score)
    mlflow.log_metric("matthews_correlation", matthews_corr)
    mlflow.log_metric("balanced_accuracy", balanced_acc)
    
    # Log confusion matrix components
    mlflow.log_metric("true_positives", tp)
    mlflow.log_metric("false_positives", fp)
    mlflow.log_metric("true_negatives", tn)
    mlflow.log_metric("false_negatives", fn)
    
    # Log additional derived metrics
    mlflow.log_metric("specificity", specificity)
    mlflow.log_metric("sensitivity", sensitivity)
    mlflow.log_metric("false_positive_rate", false_positive_rate)
    mlflow.log_metric("false_negative_rate", false_negative_rate)
    mlflow.log_metric("positive_predictive_value", positive_predictive_value)
    mlflow.log_metric("negative_predictive_value", negative_predictive_value)
    
    # Log Logistic Regression specific metrics
    if hasattr(model, "coef_"):
        mlflow.log_metric("max_abs_coefficient", np.max(np.abs(coef)))
        mlflow.log_metric("min_abs_coefficient", np.min(np.abs(coef)))
        mlflow.log_metric("mean_abs_coefficient", np.mean(np.abs(coef)))
        mlflow.log_metric("std_coefficient", np.std(coef))
        mlflow.log_metric("max_odds_ratio", np.max(odds_ratios))
        mlflow.log_metric("min_odds_ratio", np.min(odds_ratios))
        
        # Log feature name as parameter and coefficient value as metric
        mlflow.log_param("feature_with_max_coef", feature_importance_df.iloc[0]['feature'])
        mlflow.log_metric("max_coef_value", feature_importance_df.iloc[0]['abs_coefficient'])
    
    # Log model
    mlflow.sklearn.log_model(model, "logistic_regression_model")
    
    # Log all artifacts
    mlflow.log_artifact("lr_roc_curve.png")
    mlflow.log_artifact("lr_precision_recall_curve.png")
    mlflow.log_artifact("lr_confusion_matrix.png")
    mlflow.log_artifact("lr_coefficients.png")
    mlflow.log_artifact("lr_prediction_distribution.png")
    mlflow.log_artifact("lr_calibration_plot.png")
    mlflow.log_artifact("lr_coef_distribution.png")
    mlflow.log_artifact("lr_odds_ratios.png")
    mlflow.log_artifact("lr_convergence_analysis.png")
    
    # Save and log model files
    joblib.dump(model, "logistic_regression_model.pkl")
    joblib.dump(label_encoders, "label_encoders.pkl")
    mlflow.log_artifact("logistic_regression_model.pkl")
    mlflow.log_artifact("label_encoders.pkl")
    
    # Log feature importance/coefficients as JSON
    if hasattr(model, "coef_"):
        feature_importance_df.to_json("lr_feature_coefficients.json")
        mlflow.log_artifact("lr_feature_coefficients.json")
        
        odds_df.to_json("lr_odds_ratios.json")
        mlflow.log_artifact("lr_odds_ratios.json")
    
    # Log model summary with convergence info
    model_summary = {
        'model_type': 'LogisticRegression',
        'n_features': int(X.shape[1]),
        'n_samples': int(X.shape[0]),
        'converged': bool(convergence_info['converged']),
        'n_iter_actual': int(convergence_info['n_iter']),
        'max_iter': int(convergence_info['max_iter']),
        'solver': str(model.solver),
        'penalty': str(model.penalty),
        'C': float(model.C),
        'tol': float(model.tol)
    }
    
    import json
    with open("lr_model_summary.json", "w") as f:
        json.dump(model_summary, f, indent=2)
    mlflow.log_artifact("lr_model_summary.json")
    
    print(f"\n✅ MLflow run completed. Run ID: {run.info.run_id}")
    print(f"📊 Model convergence status: {'✅ Converged' if convergence_info['converged'] else '❌ Did not converge'}")
    print(f"📊 Iterations used: {convergence_info['n_iter']}/{convergence_info['max_iter']}")




✅ Logistic Regression Model Performance Metrics:
Accuracy: 0.7795
Precision: 0.6805
Recall (Sensitivity): 0.5014
F1-Score: 0.5774
ROC AUC: 0.8096
Average Precision: 0.6735
Log Loss: 0.4773
Matthews Correlation: 0.4421
Balanced Accuracy: 0.7002
Specificity: 0.8989
False Positive Rate: 0.1011
False Negative Rate: 0.4986
Negative Predictive Value: 0.8077

✅ Confusion Matrix:
[[756  85]
 [180 181]]

✅ Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.90      0.85       841
         1.0       0.68      0.50      0.58       361

    accuracy                           0.78      1202
   macro avg       0.74      0.70      0.71      1202
weighted avg       0.77      0.78      0.77      1202


📊 Convergence Info:
Solver: saga
Iterations used: 5000
Max iterations: 5000
Converged: False
Tolerance: 0.0001





✅ MLflow run completed. Run ID: cc7acbca7b724de0bed8df6f15aa16e3
📊 Model convergence status: ❌ Did not converge
📊 Iterations used: 5000/5000
🏃 View run LogisticRegression_Classifier_Fixed at: http://127.0.0.1:5000/#/experiments/635998505895616251/runs/cc7acbca7b724de0bed8df6f15aa16e3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/635998505895616251


In [23]:
# ================== Predict on Entire Data ==================
new_data = pd.read_csv(input_path)
original_data = new_data.copy()

new_data = new_data.drop(['Customer_ID', 'Customer_Status', 'Churn_Category', 'Churn_Reason'], axis=1, errors='ignore')
new_data = new_data.dropna()

for col in new_data.select_dtypes(include='object').columns:
    if col in label_encoders:
        new_data[col] = label_encoders[col].transform(new_data[col].astype(str))
    else:
        print(f"⚠️ No encoder found for column: {col}. Skipping.")

new_predictions = model.predict(new_data)
new_predictions_proba = model.predict_proba(new_data)[:, 1]

original_data = original_data.iloc[:len(new_predictions)].copy()
original_data['Customer_Status_Predicted'] = new_predictions
original_data['Churn_Probability'] = new_predictions_proba

churned = original_data[original_data['Customer_Status_Predicted'] == 1]
churned.to_csv(output_path, index=False)

print(f"\n✅ Churned customer predictions saved to: {output_path}")
print(f"📊 Total customers predicted to churn: {len(churned)}")
print(f"📊 Churn rate: {len(churned)/len(original_data)*100:.2f}%")
print(f"📊 Average churn probability: {new_predictions_proba.mean():.4f}")
print(f"📊 High-risk customers (>80% churn prob): {sum(new_predictions_proba > 0.8)}")
print(f"📊 Medium-risk customers (50-80% churn prob): {sum((new_predictions_proba > 0.5) & (new_predictions_proba <= 0.8))}")
print(f"📊 Low-risk customers (<50% churn prob): {sum(new_predictions_proba <= 0.5)}")

# Display top risk factors
if hasattr(model, "coef_"):
    print(f"\n📊 Top 5 Positive Risk Factors (increase churn probability):")
    positive_coef = feature_importance_df[feature_importance_df['coefficient'] > 0].head(5)
    for idx, row in positive_coef.iterrows():
        print(f"   • {row['feature']}: {row['coefficient']:.4f}")
    
    print(f"\n📊 Top 5 Negative Risk Factors (decrease churn probability):")
    negative_coef = feature_importance_df[feature_importance_df['coefficient'] < 0].head(5)
    for idx, row in negative_coef.iterrows():
        print(f"   • {row['feature']}: {row['coefficient']:.4f}")


✅ Churned customer predictions saved to: C:\Users\Adity\OneDrive\Desktop\Predicted_Churn.csv
📊 Total customers predicted to churn: 681
📊 Churn rate: 34.17%
📊 Average churn probability: 0.3334
📊 High-risk customers (>80% churn prob): 70
📊 Medium-risk customers (50-80% churn prob): 611
📊 Low-risk customers (<50% churn prob): 1312

📊 Top 5 Positive Risk Factors (increase churn probability):
   • Internet_Type: 0.2931
   • Paperless_Billing: 0.2249
   • Streaming_Movies: 0.1430
   • Value_Deal: 0.0818
   • Streaming_Music: 0.0682

📊 Top 5 Negative Risk Factors (decrease churn probability):
   • Contract: -1.3223
   • Online_Security: -0.4775
   • Premium_Support: -0.4090
   • Payment_Method: -0.2449
   • Unlimited_Data: -0.1996
