In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, classification_report,
                            precision_recall_curve, average_precision_score)
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

warnings.filterwarnings('ignore')

df = pd.read_csv("Dataset.csv")

churn_statuses = ["Withdrawn", "Dropped Out", "Rejected"] 
df['Churn'] = np.where(df['Status Description'].isin(churn_statuses), 1, 0)

date_cols = ['Learner SignUp DateTime', 'Opportunity End Date', 'Opportunity Start Date', 
             'Month of SignUp', 'Day of Week of SignUp', 'Day of Month of SignUp', 
             'Year of Signup', 'Year of Birth']

for col in date_cols:
    if col in df.columns:
        try:
            df[col] = pd.to_datetime(df[col], errors='coerce').astype(np.int64) // 10**9
        except:
            print(f"Could not convert {col} to datetime, skipping.")
            
X = df.drop(['Status Description', 'Churn'], axis=1)
y = df['Churn']


categorical_cols = X.select_dtypes(include=['object']).columns
if categorical_cols.size > 0:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    X_encoded = pd.DataFrame(ohe.fit_transform(X[categorical_cols]))
    X_encoded.columns = ohe.get_feature_names_out(categorical_cols)
    X = X.drop(categorical_cols, axis=1)
    X = pd.concat([X, X_encoded], axis=1)

non_numeric_cols = X.select_dtypes(exclude=np.number).columns.tolist()
if non_numeric_cols:
    print(f"Dropping additional non-numeric columns: {non_numeric_cols}")
    X = X.drop(non_numeric_cols, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_cols = X_train.select_dtypes(include=np.number).columns
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

model = XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

def evaluate_churn_model(y_true, y_pred, y_pred_proba, label_encoder, class_names, output_dir):
    print("--- Detailed Churn Model Evaluation ---")
    print(classification_report(y_true, y_pred, target_names=class_names))
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    try:
        roc_auc = roc_auc_score(y_true, y_pred_proba)  
    except ValueError as e:
        print(f"ROC AUC calculation failed: {e}")
        roc_auc = None
    results = {
      "Accuracy": accuracy,
      "Precision": precision,
      "Recall": recall,
      "F1 Score": f1,
       "ROC AUC Score": roc_auc
   }

    results_df = pd.DataFrame(results, index=[0])
    print("\nModel Metrics: ")
    print(results_df)
    conf_matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Active', 'Churned'], yticklabels=['Active', 'Churned'])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()
    importances = model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
    plt.title("Feature Importance")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'feature_importance.png'))
    plt.close()
    
    if roc_auc is not None:
        from sklearn.metrics import roc_curve
        fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.2f}")
        plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
        plt.title("ROC Curve")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.legend()
        plt.savefig(os.path.join(output_dir, "roc_curve.png"))
        plt.close()
    
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
    average_precision = average_precision_score(y_true, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, label=f'Average Precision = {average_precision:.2f}')
    plt.title('Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.savefig(os.path.join(output_dir, "precision_recall_curve.png"))
    plt.close()

    plt.figure(figsize=(8,6))
    sns.kdeplot(y_pred_proba, label="Probability of Churn")
    plt.title("Distribution of Predicted Churn Probabilities")
    plt.xlabel("Predicted Probability")
    plt.ylabel("Density")
    plt.legend()
    plt.savefig(os.path.join(output_dir, "predicted_probabilities.png"))
    plt.close()


    plt.figure(figsize=(5, 5))
    sns.countplot(x=y_train, palette='viridis')
    plt.title('Churn Distribution')
    plt.xlabel('Churned or Active (0=Active 1 = Churned)')
    plt.ylabel('Count')
    plt.savefig(os.path.join(output_dir, 'churn_distribution.png'))
    plt.close()


output_directory = 'model_output_images'
os.makedirs(output_directory, exist_ok=True)

evaluate_churn_model(y_test, y_pred, y_pred_proba, label_encoder, ["Active", "Churned"], output_directory)
joblib.dump(model, 'student_dropout_model.pkl')
print(f"Model and images are saved")




--- Detailed Churn Model Evaluation ---
              precision    recall  f1-score   support

      Active       0.98      0.95      0.96       893
     Churned       0.95      0.97      0.96       817

    accuracy                           0.96      1710
   macro avg       0.96      0.96      0.96      1710
weighted avg       0.96      0.96      0.96      1710


Model Metrics: 
   Accuracy  Precision    Recall  F1 Score  ROC AUC Score
0  0.960819   0.961264  0.960819  0.960836       0.990918
Model and images are saved
