# Churn Prediction Model Training

This notebook trains a churn prediction model with the OML4Py XGBoost algorithm (`oml.xgb`); the training itself runs inside the database.

## Steps:
1. Load training data from CHURN_TRAINING_DATA view
2. Split data into train/test sets
3. Train XGBoost model using OML4Py
4. Evaluate model performance
5. Save model to OML Datastore

In [None]:
%python

# Cell 1: Import and Setup
import oml
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)
from datetime import datetime

# Print a banner with the wall-clock start time of this run
banner_rule = "=" * 60
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(banner_rule)
print("Churn Prediction Model Training")
print(banner_rule)
print(f"Started at: {started_at}")

# Report whether an OML database session is available; the notebook only
# reports the state here and does not stop when the session is missing
print("✓ OML connected" if oml.isconnected() else "⚠️  OML not connected")

In [None]:
%python

# Cell 2: Load Training Data
# Produces X_pd (cleaned feature frame), y_pd (CHURNED labels) and
# feature_cols (names of the columns kept as features) for the later cells.

# Pull the whole view from the database into a local pandas frame
print("Loading data from CHURN_TRAINING_DATA view...")
train_data_oml = oml.sync(view='CHURN_TRAINING_DATA')
train_data_pd = train_data_oml.pull()

print(f"✓ Loaded {len(train_data_pd):,} rows")
print(f"✓ Columns: {len(train_data_pd.columns)}")

# Summarise the label balance
churn_flags = train_data_pd['CHURNED']
churn_rate = churn_flags.mean()
print(f"\nData Quality Check:")
print(f"  Churn rate: {churn_rate*100:.2f}%")
print(f"  Churned: {churn_flags.sum():,}")
print(f"  Retained: {(churn_flags == 0).sum():,}")

# Everything except the identifier and the label is treated as a feature
exclude_cols = ['USER_ID', 'CHURNED']
feature_cols = [name for name in train_data_pd.columns if name not in exclude_cols]

print(f"\n✓ Feature columns: {len(feature_cols)}")
print(f"  Features: {', '.join(feature_cols[:10])}{'...' if len(feature_cols) > 10 else ''}")

# Work on copies so the pulled frame stays untouched
X_pd = train_data_pd[feature_cols].copy()
y_pd = train_data_pd['CHURNED'].copy()

# Clean the features column by column
print("\nCleaning data...")
for col in feature_cols:
    column = X_pd[col]
    if pd.api.types.is_numeric_dtype(column):
        # +/-inf is first turned into NaN, then every NaN becomes 0
        X_pd[col] = column.replace([np.inf, -np.inf], np.nan).fillna(0)
    elif pd.api.types.is_object_dtype(column):
        # Object columns are replaced by their integer category codes
        X_pd[col] = pd.Categorical(column).codes

# Columns with at most one distinct value carry no signal; drop them
constant_cols = [col for col in feature_cols if X_pd[col].nunique() <= 1]

if constant_cols:
    print(f"⚠️  Warning: Found {len(constant_cols)} constant columns: {constant_cols}")
    feature_cols = [col for col in feature_cols if col not in constant_cols]
    X_pd = X_pd[feature_cols]
    print(f"  Removed constant columns, using {len(feature_cols)} features")

print("✓ Data cleaned")
print(f"  Final feature count: {len(feature_cols)}")

In [None]:
%python

# Cell 2.5: Data Quality Diagnostics (Optional)
# Run this cell if model performance is poor to diagnose issues.
# It only reads X_pd, y_pd and feature_cols and prints a report.

print("=" * 60)
print("Data Quality Diagnostics")
print("=" * 60)

# Check feature distributions
print("\nFeature Statistics (First 5 Features):")
print("-" * 60)
for col in feature_cols[:5]:  # Check first 5 features
    if col not in X_pd.columns:
        continue
    if pd.api.types.is_numeric_dtype(X_pd[col]):
        print(f"\n{col}:")
        print(f"  Mean: {X_pd[col].mean():.2f}")
        print(f"  Std:  {X_pd[col].std():.2f}")
        print(f"  Min:  {X_pd[col].min():.2f}")
        print(f"  Max:  {X_pd[col].max():.2f}")
        print(f"  NaN:  {X_pd[col].isna().sum()}")
    else:
        print(f"\n{col}: (non-numeric)")
        print(f"  Unique values: {X_pd[col].nunique()}")

# Check correlation with target
print("\n" + "=" * 60)
print("Feature-Target Correlations (Top 10)")
print("=" * 60)
correlations = []   # (feature name, |correlation with CHURNED|)
skipped_cols = []   # (feature name, error) for features whose correlation failed
for col in feature_cols:
    if col in X_pd.columns and pd.api.types.is_numeric_dtype(X_pd[col]):
        try:
            corr = X_pd[col].corr(y_pd)
            if not np.isnan(corr):
                correlations.append((col, abs(corr)))
        except Exception as exc:
            # Still best-effort, but remember the failure so it can be
            # reported instead of vanishing silently.
            skipped_cols.append((col, exc))

if skipped_cols:
    print(f"\n⚠️  Skipped {len(skipped_cols)} feature(s) whose correlation could not be computed:")
    for col, exc in skipped_cols:
        print(f"  {col:30} {type(exc).__name__}: {exc}")

if correlations:
    correlations.sort(key=lambda x: x[1], reverse=True)
    print("\nTop correlated features:")
    for col, corr in correlations[:10]:
        print(f"  {col:30} {corr:.4f}")

    # The list is sorted in descending order, so the first entry is the maximum
    max_corr = correlations[0][1]
    if max_corr < 0.1:
        print("\n⚠️  WARNING: Very low correlations with target!")
        print(f"   Maximum correlation: {max_corr:.4f}")
        print("   Features may not be predictive of churn.")
        print("   This could explain poor model performance.")
    else:
        print(f"\n✓ Maximum correlation: {max_corr:.4f}")
        if max_corr > 0.3:
            print("   Good: Some features are strongly correlated with churn")
        else:
            # Covers 0.1 <= max_corr <= 0.3, including exactly 0.1
            print("   Moderate: Features have some predictive power")
else:
    print("⚠️  WARNING: Could not calculate correlations!")
    print("   Check data types and ensure features are numeric")

In [None]:
%python

# Cell 3: Split Data
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# churn rate of both parts close to the overall rate, and the fixed seed
# makes the split reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y_pd}
X_train_pd, X_test_pd, y_train_pd, y_test_pd = train_test_split(X_pd, y_pd, **split_options)

# Report the size and label balance of each part
train_rows = len(X_train_pd)
test_rows = len(X_test_pd)
print(f"✓ Train size: {train_rows:,}")
print(f"✓ Test size: {test_rows:,}")
print(f"✓ Train churn rate: {y_train_pd.mean() * 100:.2f}%")
print(f"✓ Test churn rate: {y_test_pd.mean() * 100:.2f}%")

In [None]:
%python

# Cell 4: Train XGBoost Model
# Combine features and label into one frame so they are pushed as a single
# table and every row keeps its own label.
train_combined_pd = X_train_pd.copy()
train_combined_pd['CHURNED'] = y_train_pd.values

# Verify data quality on the local pandas frame, before anything is uploaded.
# The columns of the pushed table are OML proxy objects rather than local
# arrays, so pandas-style `.isna()` / `np.isinf` checks belong on this side.
train_features_pd = train_combined_pd[feature_cols]
nan_feature_count = int(train_features_pd.isna().any().sum())
inf_feature_count = int(train_features_pd.isin([np.inf, -np.inf]).any().sum())
print(f"\nData Quality Check:")
print(f"  Features with NaN: {nan_feature_count}")
print(f"  Features with Inf: {inf_feature_count}")

# Push to database
print("\nPushing training data to OML...")
train_oml = oml.push(train_combined_pd)
print(f"✓ Training data pushed: {train_oml.shape}")

# Get features and target
X_train_oml = train_oml[feature_cols]
y_train_oml = train_oml['CHURNED']

print(f"✓ X_train_oml shape: {X_train_oml.shape}")
print(f"✓ y_train_oml shape: {y_train_oml.shape}")
print(f"✓ Training churn rate: {y_train_oml.mean()*100:.2f}%")

# Create the XGBoost classifier. No settings are passed, so the algorithm
# runs with its default hyperparameters.
print("\nCreating XGBoost model...")
xgb_model = oml.xgb('classification')

# Train the model (training happens IN ADB)
print("Training started...")
print("  This may take a few minutes...")
xgb_model = xgb_model.fit(X_train_oml, y_train_oml)
print("✓ Training completed!")

In [None]:
%python

# Cell 5: Evaluate Model
# Scores the held-out rows in the database, pulls the probabilities back and
# prints classification metrics. Defines y_pred_proba, y_pred, y_test_vals,
# accuracy, precision, recall, f1, auc and cm for later cells.

# Push features and label together so the label can be carried through the
# scoring call and come back attached to its own prediction row.
test_combined_pd = X_test_pd.copy()
test_combined_pd['CHURNED'] = y_test_pd.values
test_oml = oml.push(test_combined_pd)
X_test_oml = test_oml[feature_cols]

print(f"✓ Test data prepared: {X_test_oml.shape}")
print(f"✓ Test churn rate: {y_test_pd.mean()*100:.2f}%")

# Get predictions. The true label is passed as a supplemental column, so the
# metrics below never depend on the database returning rows in the order in
# which they were pushed.
print("\nGenerating predictions...")
labels_returned_with_predictions = True
try:
    y_pred_proba_oml = xgb_model.predict_proba(
        X_test_oml,
        supplemental_cols=test_oml[['CHURNED']]
    )
except Exception as e:
    # Best-effort fallback: score without the label column
    labels_returned_with_predictions = False
    print(f"⚠️  Warning with supplemental_cols: {e}")
    print("   Trying without supplemental_cols...")
    y_pred_proba_oml = xgb_model.predict_proba(X_test_oml)

# Convert to numpy
y_pred_proba_pd = y_pred_proba_oml.pull()
if isinstance(y_pred_proba_pd, pd.DataFrame):
    if labels_returned_with_predictions and 'CHURNED' in y_pred_proba_pd.columns:
        # Labels and probabilities come from the same pulled rows
        y_test_vals = y_pred_proba_pd['CHURNED'].astype(int).values
        proba_pd = y_pred_proba_pd.drop(columns=['CHURNED'])
    else:
        print("⚠️  Labels were not returned with the predictions;")
        print("   assuming predictions come back in the original test row order.")
        y_test_vals = y_test_pd.values
        proba_pd = y_pred_proba_pd

    # Locate the positive-class probability by name first
    # NOTE(review): assumes OML4Py names probability columns
    # PROBABILITY_OF_<class>; confirm against an actual pull.
    positive_cols = [
        c for c in proba_pd.columns
        if str(c).upper().startswith('PROBABILITY')
        and str(c).upper().endswith(('_1', '_1.0'))
    ]
    if positive_cols:
        y_pred_proba = proba_pd[positive_cols[0]].values
    elif 1 in proba_pd.columns:
        y_pred_proba = proba_pd[1].values
    elif len(proba_pd.columns) == 2:
        y_pred_proba = proba_pd.iloc[:, 1].values
    elif len(proba_pd.columns) == 1:
        y_pred_proba = proba_pd.iloc[:, 0].values
    else:
        # Flattening several columns would yield an array of the wrong
        # length and meaningless metrics, so stop with a clear message.
        raise ValueError(
            "Cannot identify the positive-class probability column in "
            f"{list(proba_pd.columns)}"
        )
else:
    y_pred_proba = np.array(y_pred_proba_pd)
    y_test_vals = y_test_pd.values

# Check prediction distribution
print(f"\nPrediction Statistics:")
print(f"  Mean probability: {y_pred_proba.mean():.4f}")
print(f"  Min probability: {y_pred_proba.min():.4f}")
print(f"  Max probability: {y_pred_proba.max():.4f}")
print(f"  Std probability: {y_pred_proba.std():.4f}")

# If predictions are all the same, model isn't learning
if y_pred_proba.std() < 0.01:
    print("\n⚠️  WARNING: Predictions have very low variance!")
    print("   This suggests the model is not learning from the data.")
    print("   Possible issues:")
    print("   - Features may not be informative")
    print("   - Data quality issues")
    print("   - Model needs hyperparameter tuning")

# Classify as churn when the positive-class probability reaches 0.5
y_pred = (y_pred_proba >= 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test_vals, y_pred)
precision = precision_score(y_test_vals, y_pred, zero_division=0)
recall = recall_score(y_test_vals, y_pred, zero_division=0)
f1 = f1_score(y_test_vals, y_pred, zero_division=0)
auc = roc_auc_score(y_test_vals, y_pred_proba)

print("\n" + "=" * 60)
print("Model Performance Metrics")
print("=" * 60)
print(f"AUC-ROC:     {auc:.4f} ({auc*100:.2f}%)")
print(f"Accuracy:    {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision:   {precision:.4f} ({precision*100:.2f}%)")
print(f"Recall:      {recall:.4f} ({recall*100:.2f}%)")
print(f"F1 Score:    {f1:.4f}")

# Performance assessment
if auc < 0.55:
    print("\n⚠️  WARNING: Model performance is very poor (AUC < 0.55)")
    print("   This suggests the model is not learning effectively.")
    print("   Recommendations:")
    print("   1. Check data quality and feature engineering")
    print("   2. Verify features are informative")
    print("   3. Consider feature selection")
    print("   4. Try different hyperparameters")
elif auc < 0.70:
    print("\n⚠️  Model performance is below target (AUC < 0.70)")
    print("   Consider improving features or hyperparameter tuning")
else:
    print("\n✓ Model performance is acceptable (AUC >= 0.70)")

# Confusion matrix. labels=[0, 1] forces a 2x2 result even when only one
# class shows up, so the four-way unpacking below cannot fail.
cm = confusion_matrix(y_test_vals, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
print("\nConfusion Matrix:")
print("                Predicted")
print("              Non-Churn  Churn")
print(f"Actual Non-Churn   {tn:5d}   {fp:5d}")
print(f"       Churn       {fn:5d}   {tp:5d}")

In [None]:
%python

# Cell 6: Save Model
# Store the fitted model in the OML datastore under a fixed name, replacing
# any datastore that already has that name. The test AUC is recorded in the
# description.
model_name = 'churn_xgboost_v1'
description = f'Churn prediction XGBoost model - AUC: {auc:.4f}'
objects_to_store = {'model': xgb_model}

try:
    oml.ds.save(objects_to_store, model_name, description=description, overwrite=True)
except Exception as e:
    # A failed save is reported, not raised, so the notebook run continues
    print(f"❌ ERROR: Failed to save model: {e}")
else:
    print(f"✓ Model saved to OML datastore: {model_name}")