In [37]:
pip install pandas numpy matplotlib seaborn scikit-learn joblib openpyxl mlflow

Note: you may need to restart the kernel to use updated packages.




In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score,
    log_loss, matthews_corrcoef, balanced_accuracy_score
)
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import calibration_curve

# ================== File Paths ==================
input_path = r"C:\Users\Adity\OneDrive\Desktop\Customer_Data.csv"
output_path = r"C:\Users\Adity\OneDrive\Desktop\Predicted_Churn.csv"

# ================== Load Data ==================
df = pd.read_csv(input_path)
print("✅ Data loaded:")
print(df.head())

# ================== Preprocessing ==================

# Drop rows with missing target
df = df.dropna(subset=['Customer_Status'])

# Drop unnecessary columns
df = df.drop(['Customer_ID', 'Churn_Category', 'Churn_Reason'], axis=1, errors='ignore')

# Categorical columns to encode
columns_to_encode = [
    'Gender', 'Married', 'State', 'Value_Deal', 'Phone_Service', 'Multiple_Lines',
    'Internet_Service', 'Internet_Type', 'Online_Security', 'Online_Backup',
    'Device_Protection_Plan', 'Premium_Support', 'Streaming_TV', 'Streaming_Movies',
    'Streaming_Music', 'Unlimited_Data', 'Contract', 'Paperless_Billing',
    'Payment_Method'
]

# Label Encoding
label_encoders = {}
for col in columns_to_encode:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

# Encode target
df['Customer_Status'] = df['Customer_Status'].map({'Stayed': 0, 'Churned': 1})

# Drop rows with any NaN left
df = df.dropna()

# Feature-target split
X = df.drop('Customer_Status', axis=1)
y = df['Customer_Status']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ================== MLflow Setup ==================
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Churn_Prediction")

with mlflow.start_run(run_name="RandomForest_Classifier") as run:
    # ================== Train Random Forest ==================
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # ================== Evaluate Model ==================
    y_pred = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]  # Probability of class 1 (churned)
    
    # Basic metrics
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    
    # Advanced metrics
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    avg_precision = average_precision_score(y_test, y_pred_proba)
    log_loss_score = log_loss(y_test, y_pred_proba)
    matthews_corr = matthews_corrcoef(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    
    # Confusion matrix components
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Additional derived metrics
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)  # Same as recall
    false_positive_rate = fp / (fp + tn)
    false_negative_rate = fn / (fn + tp)
    positive_predictive_value = tp / (tp + fp)  # Same as precision
    negative_predictive_value = tn / (tn + fn)
    
    # Print results
    print("\n✅ Random Forest Model Performance Metrics:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Log Loss: {log_loss_score:.4f}")
    print(f"Matthews Correlation: {matthews_corr:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Positive Rate: {false_positive_rate:.4f}")
    print(f"False Negative Rate: {false_negative_rate:.4f}")
    print(f"Negative Predictive Value: {negative_predictive_value:.4f}")
    
    print("\n✅ Confusion Matrix:")
    print(cm)
    print("\n✅ Classification Report:")
    print(classification_report(y_test, y_pred))

    # ================== Create Visualizations ==================
    
    # 1. ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Random Forest - ROC Curve')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.savefig("rf_roc_curve.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 2. Precision-Recall Curve
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(recall_curve, precision_curve, color='blue', lw=2, 
             label=f'PR curve (AP = {avg_precision:.4f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Random Forest - Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.grid(True, alpha=0.3)
    plt.savefig("rf_precision_recall_curve.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 3. Confusion Matrix Heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Stayed', 'Churned'], 
                yticklabels=['Stayed', 'Churned'])
    plt.title('Random Forest - Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig("rf_confusion_matrix.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 4. Feature Importance Plot
    importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': importances
    }).sort_values('importance', ascending=False)
    
    plt.figure(figsize=(12, 8))
    top_features = feature_importance_df.head(20)
    plt.barh(range(len(top_features)), top_features['importance'], color='forestgreen')
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 20 Feature Importances (Random Forest)')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig("rf_feature_importance.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 5. Prediction Distribution
    plt.figure(figsize=(10, 6))
    plt.hist(y_pred_proba[y_test == 0], bins=50, alpha=0.7, label='Stayed', color='blue')
    plt.hist(y_pred_proba[y_test == 1], bins=50, alpha=0.7, label='Churned', color='red')
    plt.xlabel('Predicted Probability of Churn')
    plt.ylabel('Frequency')
    plt.title('Random Forest - Distribution of Predicted Probabilities')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig("rf_prediction_distribution.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 6. Calibration Plot
    fraction_of_positives, mean_predicted_value = calibration_curve(y_test, y_pred_proba, n_bins=10)
    plt.figure(figsize=(8, 6))
    plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="Random Forest")
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
    plt.xlabel('Mean Predicted Probability')
    plt.ylabel('Fraction of Positives')
    plt.title('Random Forest - Calibration Plot')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig("rf_calibration_plot.png", dpi=300, bbox_inches='tight')
    plt.close()
    
    # 7. Tree Depth and Estimator Analysis
    if hasattr(rf_model, 'estimators_'):
        tree_depths = [estimator.tree_.max_depth for estimator in rf_model.estimators_]
        plt.figure(figsize=(10, 6))
        plt.hist(tree_depths, bins=20, alpha=0.7, color='forestgreen', edgecolor='black')
        plt.xlabel('Tree Depth')
        plt.ylabel('Number of Trees')
        plt.title('Distribution of Tree Depths in Random Forest')
        plt.grid(True, alpha=0.3)
        plt.savefig("rf_tree_depths.png", dpi=300, bbox_inches='tight')
        plt.close()
    
    # 8. Out-of-Bag Score (if available)
    if hasattr(rf_model, 'oob_score_'):
        oob_score = rf_model.oob_score_
        print(f"Out-of-Bag Score: {oob_score:.4f}")
    
    # 9. Feature Importance vs Ranking
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(importances) + 1), sorted(importances, reverse=True), 'o-')
    plt.xlabel('Feature Rank')
    plt.ylabel('Feature Importance')
    plt.title('Feature Importance by Rank (Random Forest)')
    plt.grid(True, alpha=0.3)
    plt.savefig("rf_feature_ranking.png", dpi=300, bbox_inches='tight')
    plt.close()

    # ================== Log to MLflow ==================
    
    # Log parameters
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("random_state", 42)
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("n_features", X.shape[1])
    mlflow.log_param("n_samples", X.shape[0])
    mlflow.log_param("class_balance", f"{(y==0).sum()}:{(y==1).sum()}")
    mlflow.log_param("max_depth", rf_model.max_depth)
    mlflow.log_param("min_samples_split", rf_model.min_samples_split)
    mlflow.log_param("min_samples_leaf", rf_model.min_samples_leaf)
    mlflow.log_param("bootstrap", rf_model.bootstrap)
    
    # Log basic metrics
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    
    # Log advanced metrics
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("average_precision", avg_precision)
    mlflow.log_metric("log_loss", log_loss_score)
    mlflow.log_metric("matthews_correlation", matthews_corr)
    mlflow.log_metric("balanced_accuracy", balanced_acc)
    
    # Log confusion matrix components
    mlflow.log_metric("true_positives", tp)
    mlflow.log_metric("false_positives", fp)
    mlflow.log_metric("true_negatives", tn)
    mlflow.log_metric("false_negatives", fn)
    
    # Log additional derived metrics
    mlflow.log_metric("specificity", specificity)
    mlflow.log_metric("sensitivity", sensitivity)
    mlflow.log_metric("false_positive_rate", false_positive_rate)
    mlflow.log_metric("false_negative_rate", false_negative_rate)
    mlflow.log_metric("positive_predictive_value", positive_predictive_value)
    mlflow.log_metric("negative_predictive_value", negative_predictive_value)
    
    # Log Random Forest specific metrics
    mlflow.log_metric("mean_tree_depth", np.mean(tree_depths))
    mlflow.log_metric("std_tree_depth", np.std(tree_depths))
    mlflow.log_metric("max_tree_depth", np.max(tree_depths))
    mlflow.log_metric("min_tree_depth", np.min(tree_depths))
    mlflow.log_metric("top_feature_importance", feature_importance_df.iloc[0]['importance'])
    mlflow.log_metric("feature_importance_std", np.std(importances))
    
    # Log Out-of-Bag score if available
    if hasattr(rf_model, 'oob_score_'):
        mlflow.log_metric("oob_score", rf_model.oob_score_)
    
    # Log model
    mlflow.sklearn.log_model(rf_model, "random_forest_model")
    
    # Log all artifacts
    mlflow.log_artifact("rf_roc_curve.png")
    mlflow.log_artifact("rf_precision_recall_curve.png")
    mlflow.log_artifact("rf_confusion_matrix.png")
    mlflow.log_artifact("rf_feature_importance.png")
    mlflow.log_artifact("rf_prediction_distribution.png")
    mlflow.log_artifact("rf_calibration_plot.png")
    mlflow.log_artifact("rf_tree_depths.png")
    mlflow.log_artifact("rf_feature_ranking.png")
    
    # Save and log model files
    joblib.dump(rf_model, "random_forest_model.pkl")
    joblib.dump(label_encoders, "label_encoders.pkl")
    mlflow.log_artifact("random_forest_model.pkl")
    mlflow.log_artifact("label_encoders.pkl")
    
    # Log feature importance as JSON
    feature_importance_df.to_json("rf_feature_importance.json")
    mlflow.log_artifact("rf_feature_importance.json")
    
    # Log tree information
    tree_info = {
        'n_estimators': len(rf_model.estimators_),
        'tree_depths': tree_depths,
        'mean_depth': np.mean(tree_depths),
        'std_depth': np.std(tree_depths)
    }
    
    import json
    with open("rf_tree_info.json", "w") as f:
        json.dump(tree_info, f, indent=2)
    mlflow.log_artifact("rf_tree_info.json")
    
    print(f"\n✅ MLflow run completed. Run ID: {run.info.run_id}")

# ================== Predict on Same Data ==================

# Reload raw CSV
new_data = pd.read_csv(input_path)
original_data = new_data.copy()

# Drop unused columns
new_data = new_data.drop(['Customer_ID', 'Customer_Status', 'Churn_Category', 'Churn_Reason'], axis=1, errors='ignore')
new_data = new_data.dropna()

# Encode new data
for col in new_data.select_dtypes(include='object').columns:
    if col in label_encoders:
        new_data[col] = label_encoders[col].transform(new_data[col].astype(str))
    else:
        print(f"⚠️ No encoder found for column: {col}. Skipping.")

# Predict
new_predictions = rf_model.predict(new_data)
new_predictions_proba = rf_model.predict_proba(new_data)[:, 1]

# Append prediction to original
original_data = original_data.iloc[:len(new_predictions)].copy()
original_data['Customer_Status_Predicted'] = new_predictions
original_data['Churn_Probability'] = new_predictions_proba

# Filter churned
churned = original_data[original_data['Customer_Status_Predicted'] == 1]

# Save output
churned.to_csv(output_path, index=False)
print(f"\n✅ Churned customer predictions saved to: {output_path}")
print(f"📊 Total customers predicted to churn: {len(churned)}")
print(f"📊 Churn rate: {len(churned)/len(original_data)*100:.2f}%")
print(f"📊 Average churn probability: {new_predictions_proba.mean():.4f}")
print(f"📊 High-risk customers (>80% churn prob): {sum(new_predictions_proba > 0.8)}")
print(f"📊 Medium-risk customers (50-80% churn prob): {sum((new_predictions_proba > 0.5) & (new_predictions_proba <= 0.8))}")
print(f"📊 Low-risk customers (<50% churn prob): {sum(new_predictions_proba <= 0.5)}")

✅ Data loaded:
  Customer_ID  Gender  Age Married        State  Number_of_Referrals  \
0   19877-DEL    Male   35      No        Delhi                    7   
1   58353-MAH  Female   45     Yes  Maharashtra                   14   
2   25063-WES    Male   51      No  West Bengal                    4   
3   59787-KAR    Male   79      No    Karnataka                    3   
4   28544-TAM  Female   80      No   Tamil Nadu                    3   

   Tenure_in_Months Value_Deal Phone_Service Multiple_Lines  ...  \
0                27        NaN           Yes             No  ...   
1                13        NaN           Yes            Yes  ...   
2                35     Deal 5           Yes             No  ...   
3                21     Deal 4           Yes             No  ...   
4                 8        NaN           Yes             No  ...   

    Payment_Method Monthly_Charge Total_Charges Total_Refunds  \
0      Credit Card           65.6        593.30          0.00   
1      Credit




✅ MLflow run completed. Run ID: 0ea6f1a76a0045399ba7dd12100f51e5
🏃 View run RandomForest_Classifier at: http://127.0.0.1:5000/#/experiments/635998505895616251/runs/0ea6f1a76a0045399ba7dd12100f51e5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/635998505895616251

✅ Churned customer predictions saved to: C:\Users\Adity\OneDrive\Desktop\Predicted_Churn.csv
📊 Total customers predicted to churn: 686
📊 Churn rate: 34.42%
📊 Average churn probability: 0.3600
📊 High-risk customers (>80% churn prob): 452
📊 Medium-risk customers (50-80% churn prob): 234
📊 Low-risk customers (<50% churn prob): 1307
