# Churn Prediction Model Training

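This notebook trains a churn prediction model with OML4Py. RandomForest (`oml.rf`) is the primary algorithm; XGBoost (`oml.xgb`) is used only as a fallback if RandomForest training fails.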

## Steps:
1. Load training data from CHURN_TRAINING_DATA view
2. Split data into train/test sets
3. Train a RandomForest model using OML4Py (XGBoost is the fallback)
4. Evaluate model performance
5. Save model to OML Datastore

In [None]:
%python

# Cell 1: Import and Setup
import oml
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)
from datetime import datetime

# Announce the run: a 60-character rule, the title, another rule, and then the
# local wall-clock time at which the notebook started.
print("=" * 60, "Churn Prediction Model Training", "=" * 60, sep="\n")
print(f"Started at: {datetime.now():%Y-%m-%d %H:%M:%S}")

# Report whether this session has an OML connection. The state is only
# reported here; the notebook does not stop when the connection is missing.
print("‚úì OML connected" if oml.isconnected() else "‚ö†Ô∏è  OML not connected")

In [None]:
%python

# Cell 2: Load Training Data

# Get an OML handle on the CHURN_TRAINING_DATA view, then pull its rows into
# a local pandas object for the client-side preparation steps below.
print("Loading data from CHURN_TRAINING_DATA view...")
train_data_oml = oml.sync(view='CHURN_TRAINING_DATA')
train_data_pd = train_data_oml.pull()

print(
    f"‚úì Loaded {len(train_data_pd):,} rows",
    f"‚úì Columns: {len(train_data_pd.columns)}",
    sep="\n",
)

# Label balance. The mean equals the churn rate provided CHURNED is coded
# 0 = retained / 1 = churned (assumed from how it is counted here; confirm
# against the view definition).
churned_flags = train_data_pd['CHURNED']
churn_rate = churned_flags.mean()
print(f"\nData Quality Check:")
print(f"  Churn rate: {churn_rate*100:.2f}%")
print(f"  Churned: {churned_flags.sum():,}")
print(f"  Retained: {(churned_flags == 0).sum():,}")

# Every column except the row identifier and the label is a model input.
exclude_cols = ['USER_ID', 'CHURNED']
feature_cols = [name for name in train_data_pd.columns if name not in exclude_cols]

print(f"\n‚úì Feature columns: {len(feature_cols)}")
# Show at most the first ten names; the ellipsis marks a truncated list.
feature_preview = ', '.join(feature_cols[:10])
preview_suffix = '...' if len(feature_cols) > 10 else ''
print(f"  Features: {feature_preview}{preview_suffix}")

# Work on independent copies so the cleaning steps that follow cannot write
# back into train_data_pd.
X_pd = train_data_pd.loc[:, feature_cols].copy()
y_pd = train_data_pd['CHURNED'].copy()

# Clean data: make numeric and text features safe for training.
# Columns that are neither numeric nor object dtype are left untouched.
print("\nCleaning data...")
for feature in feature_cols:
    values = X_pd[feature]
    if pd.api.types.is_numeric_dtype(values):
        # Treat +/-Inf like a missing value, then fill every gap with 0.
        finite_values = values.replace([np.inf, -np.inf], np.nan)
        X_pd[feature] = pd.to_numeric(finite_values, errors='coerce').fillna(0)
    elif pd.api.types.is_object_dtype(values):
        # Integer-encode text categories. pandas gives missing entries the
        # code -1, so the fillna(0) below has nothing left to fill.
        X_pd[feature] = pd.Categorical(values).codes
        X_pd[feature] = pd.to_numeric(X_pd[feature], errors='coerce').fillna(0)

# A feature with at most one distinct value cannot separate churners from
# non-churners, so such columns are reported and dropped.
constant_cols = [name for name in feature_cols if X_pd[name].nunique() <= 1]

if constant_cols:
    print(f"‚ö†Ô∏è  Warning: Found {len(constant_cols)} constant columns: {constant_cols}")
    feature_cols = [name for name in feature_cols if name not in constant_cols]
    X_pd = X_pd[feature_cols]
    print(f"  Removed constant columns, using {len(feature_cols)} features")

print("‚úì Data cleaned", f"  Final feature count: {len(feature_cols)}", sep="\n")

In [None]:
%python

# Cell 2.5: Data Quality Diagnostics (Optional)
# Run this cell if model performance is poor to diagnose issues

print("=" * 60, "Data Quality Diagnostics", "=" * 60, sep="\n")

# Summary statistics for the first five features only, to keep output short.
print("\nFeature Statistics (First 5 Features):")
print("-" * 60)
for feature in feature_cols[:5]:
    if feature not in X_pd.columns:
        continue
    values = X_pd[feature]
    if not pd.api.types.is_numeric_dtype(values):
        # Non-numeric columns only get a cardinality report.
        print(f"\n{feature}: (non-numeric)")
        print(f"  Unique values: {values.nunique()}")
        continue
    print(f"\n{feature}:")
    print(f"  Mean: {values.mean():.2f}")
    print(f"  Std:  {values.std():.2f}")
    print(f"  Min:  {values.min():.2f}")
    print(f"  Max:  {values.max():.2f}")
    print(f"  NaN:  {values.isna().sum()}")

# Check correlation with target.
# For every numeric feature, compute the absolute correlation with the
# label and rank the features. Features whose correlation cannot be computed
# are reported instead of being dropped silently, so a data-type problem is
# visible in this diagnostic output.
print("\n" + "=" * 60)
print("Feature-Target Correlations (Top 10)")
print("=" * 60)
correlations = []
skipped_features = []
for col in feature_cols:
    if col not in X_pd.columns or not pd.api.types.is_numeric_dtype(X_pd[col]):
        continue
    try:
        corr = X_pd[col].corr(y_pd)
        if not np.isnan(corr):
            correlations.append((col, abs(corr)))
    except Exception as exc:
        # Best-effort diagnostics: remember the failure and keep going.
        skipped_features.append((col, exc))

if skipped_features:
    print(f"\nSkipped {len(skipped_features)} feature(s) whose correlation could not be computed:")
    for col, exc in skipped_features:
        print(f"  {col}: {exc}")

if correlations:
    correlations.sort(key=lambda item: item[1], reverse=True)
    print("\nTop correlated features:")
    for col, corr in correlations[:10]:
        print(f"  {col:30} {corr:.4f}")

    # The list is sorted in descending order, so the first entry is the maximum.
    max_corr = correlations[0][1]
    if max_corr < 0.1:
        print("\n‚ö†Ô∏è  WARNING: Very low correlations with target!")
        print(f"   Maximum correlation: {max_corr:.4f}")
        print("   Features may not be predictive of churn.")
        print("   This could explain poor model performance.")
    else:
        print(f"\n‚úì Maximum correlation: {max_corr:.4f}")
        if max_corr > 0.3:
            print("   Good: Some features are strongly correlated with churn")
        else:
            # Covers 0.1 <= max_corr <= 0.3, including exactly 0.1, which
            # previously produced no verdict at all.
            print("   Moderate: Features have some predictive power")
else:
    print("‚ö†Ô∏è  WARNING: Could not calculate correlations!")
    print("   Check data types and ensure features are numeric")

In [None]:
%python

# Cell 3: Split Data
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed random_state, so it is repeatable for the same input.
_split_parts = train_test_split(X_pd, y_pd, test_size=0.2, random_state=42, stratify=y_pd)
X_train_pd, X_test_pd, y_train_pd, y_test_pd = _split_parts

print(
    f"‚úì Train size: {len(X_train_pd):,}",
    f"‚úì Test size: {len(X_test_pd):,}",
    sep="\n",
)
print(
    f"‚úì Train churn rate: {y_train_pd.mean() * 100:.2f}%",
    f"‚úì Test churn rate: {y_test_pd.mean() * 100:.2f}%",
    sep="\n",
)

In [None]:
# Cell 4.5: REMOVED
# This diagnostic cell was removed to avoid confusion about execution order.
# The notebook now follows a clear sequence: Cell 1 → 2 → 3 → 4 → 5 → 6 (Cell 2.5 is optional diagnostics)

In [None]:
%python

# Cell 4: Train RandomForest Model
# The database push takes one table, so attach the label to a copy of the
# training features. Using .values ignores the index and pairs rows by position.
train_combined_pd = X_train_pd.assign(CHURNED=y_train_pd.values)

# CRITICAL (original author's note): the OML push was seen to turn float64
# columns into int64, losing precision. Every numeric feature is therefore
# forced to float64 before the push. The CHURNED label is not touched.
print("Ensuring data types are preserved...")
numeric_features = [
    name
    for name in feature_cols
    if name in train_combined_pd.columns
    and pd.api.types.is_numeric_dtype(train_combined_pd[name])
]
train_combined_pd = train_combined_pd.astype({name: 'float64' for name in numeric_features})
print("‚úì Data types standardized to float64")

# Push to database
print("\nPushing training data to OML...")
train_oml = oml.push(train_combined_pd)
print(f"‚úì Training data pushed: {train_oml.shape}")

# Get features and target as views over the pushed table
X_train_oml = train_oml[feature_cols]
y_train_oml = train_oml['CHURNED']

print(f"‚úì X_train_oml shape: {X_train_oml.shape}")
print(f"‚úì y_train_oml shape: {y_train_oml.shape}")
print(f"‚úì Training churn rate: {y_train_oml.mean()*100:.2f}%")

# Verify data quality on a small sample of what actually landed in the database
print(f"\nData Quality Check:")
try:
    # Apply the 100-row limit before pulling, so only the sample is
    # transferred to the client. The previous pull().head(100) downloaded the
    # entire training table first.
    sample_pd = X_train_oml.head(100).pull()
    sampled_features = [col for col in feature_cols if col in sample_pd.columns]
    # Both counts are the number of features with at least one bad value.
    nan_count = sum(sample_pd[col].isna().any() for col in sampled_features)
    inf_count = sum(
        np.isinf(sample_pd[col]).any()
        for col in sampled_features
        if pd.api.types.is_numeric_dtype(sample_pd[col])
    )
    print(f"  Features with NaN (sample): {nan_count}")
    print(f"  Features with Inf (sample): {inf_count}")
except Exception as e:
    # The check is informational only; a failure must not block training.
    print(f"  Data quality check skipped: {e}")

# Model choice: RandomForest first, XGBoost only as a fallback.
# Both OML4Py estimators are driven the same way here: construct the model,
# then call .fit(X, y) on the pushed training data. The messages below record
# the earlier results that motivated preferring RandomForest.
# NOTE: the fitted model is stored in `xgb_model` whichever algorithm was
# used, because the evaluation and save cells refer to it by that name.
print(
    "\nCreating RandomForest model...",
    "  ‚úì RandomForest achieved AUC 0.9190 in Task 2.8 validation",
    "  ‚úì OML4Py XGBoost defaults gave AUC ~0.50 (random)",
    "  ‚ö†Ô∏è  Using default RandomForest parameters (OML4Py doesn't support hyperparameters)",
    sep="\n",
)

print(
    "\nTraining started...",
    "  This may take a few minutes...",
    "  Training RandomForest in-database...",
    sep="\n",
)

model_type = None
try:
    # Primary path: RandomForest with default settings.
    xgb_model = oml.rf()
    print("  ‚úì RandomForest model created")
    model_type = "RandomForest"
    xgb_model = xgb_model.fit(X_train_oml, y_train_oml)
    print("  ‚úì RandomForest training completed")
except Exception as rf_error:
    # Any failure while creating or fitting RandomForest switches to XGBoost.
    # A failure in this fallback is not caught and ends the cell.
    print(f"  ‚ùå RandomForest failed: {rf_error}")
    print("  ‚ö†Ô∏è  Falling back to XGBoost (performance may be poor, AUC ~0.50)")
    xgb_model = oml.xgb('classification')
    model_type = "XGBoost"
    xgb_model = xgb_model.fit(X_train_oml, y_train_oml)
    print("  ‚ö†Ô∏è  Using XGBoost as fallback")

print("‚úì Training completed!")
print(f"\nüìä Model Type: {model_type}")
print(
    "   If this shows 'XGBoost', RandomForest failed and performance will be poor (AUC ~0.50)",
    "   If this shows 'RandomForest', performance should be good (AUC ~0.90)",
    sep="\n",
)

# Try to get feature importance (if available).
# OML4Py documents an `importance` attribute on fitted models, so it is probed
# first; the two method-style names are kept for other versions. This step is
# informational, so any failure is reported and never stops the notebook.
# NOTE(review): confirm that `importance` is exposed by the installed version.
importance = None
try:
    for accessor in ('importance', 'feature_importance', 'get_feature_importance'):
        candidate = getattr(xgb_model, accessor, None)
        if candidate is None:
            continue
        # Accept both a plain attribute and a method returning the table.
        importance = candidate() if callable(candidate) else candidate
        break

    if importance is None:
        print(f"\n‚ö†Ô∏è  Feature importance not directly available")
        print("   This is normal for some OML4Py versions")
    else:
        print(f"\n‚úì Feature importance available")
        # Results held in the database expose pull(); bring them local
        # before printing the first rows.
        importance_pd = importance.pull() if hasattr(importance, 'pull') else importance
        print(importance_pd.head(10) if hasattr(importance_pd, 'head') else importance_pd)
except Exception as e:
    print(f"\n‚ö†Ô∏è  Could not get feature importance: {e}")

In [None]:
%python

# Cell 5: Evaluate Model
# Prepare test data
test_combined_pd = X_test_pd.copy()
test_combined_pd['CHURNED'] = y_test_pd.values

# Cast numeric features to float64 before the push, exactly as the training
# frame is cast, so the train and test tables end up with matching types.
for col in feature_cols:
    if col in test_combined_pd.columns and pd.api.types.is_numeric_dtype(test_combined_pd[col]):
        test_combined_pd[col] = test_combined_pd[col].astype('float64')

test_oml = oml.push(test_combined_pd)
X_test_oml = test_oml[feature_cols]

print(f"‚úì Test data prepared: {X_test_oml.shape}")
print(f"‚úì Test churn rate: {y_test_pd.mean()*100:.2f}%")
print(f"‚úì Expected predictions: {len(y_test_pd)}")

# Get predictions. The true label travels with each prediction as a
# supplemental column: rows pulled from the database have no guaranteed
# order, so pairing predictions with y_test_pd by position is not reliable.
print("\nGenerating predictions...")
try:
    y_pred_proba_oml = xgb_model.predict_proba(
        X_test_oml,
        supplemental_cols=test_oml[['CHURNED']],
    )
except Exception as e:
    print(f"‚ö†Ô∏è  Error: {e}")
    raise

# Bring the predictions to the client
y_pred_proba_pd = y_pred_proba_oml.pull()
print(f"‚úì Predictions pulled")
print(f"  Type: {type(y_pred_proba_pd)}")
if hasattr(y_pred_proba_pd, 'shape'):
    print(f"  Shape: {y_pred_proba_pd.shape}")

# Extract the positive-class (churn) probability and the matching labels
y_test_vals = None
if isinstance(y_pred_proba_pd, pd.DataFrame):
    print(f"  DataFrame columns: {list(y_pred_proba_pd.columns)}")
    if 'CHURNED' in y_pred_proba_pd.columns:
        # Labels returned in the same rows as their predictions.
        y_test_vals = y_pred_proba_pd['CHURNED'].astype(int).values
    proba_frame = y_pred_proba_pd.drop(columns=['CHURNED'], errors='ignore')

    # Find the class-1 column by name. This accepts 1, '1' and names such as
    # 'PROBABILITY_OF_1' / 'PROBABILITY_OF_1.0'.
    # NOTE(review): the PROBABILITY_OF_<class> naming follows the OML4Py
    # documentation examples; confirm against the installed version.
    positive_col = None
    for candidate in proba_frame.columns:
        label = str(candidate)
        if label.upper().startswith('PROBABILITY_OF_'):
            label = label[len('PROBABILITY_OF_'):]
        try:
            if float(label) == 1.0:
                positive_col = candidate
                break
        except ValueError:
            continue

    if positive_col is not None:
        y_pred_proba = proba_frame[positive_col].values
    elif proba_frame.shape[1] == 2:
        # Two unnamed class columns: assume the order (class 0, class 1).
        y_pred_proba = proba_frame.iloc[:, 1].values
    elif proba_frame.shape[1] == 1:
        y_pred_proba = proba_frame.iloc[:, 0].values
    else:
        # Flattening several columns would interleave the probabilities of
        # different classes, so refuse to guess.
        raise ValueError(
            f"Cannot identify the churn probability column among {list(proba_frame.columns)}"
        )
elif isinstance(y_pred_proba_pd, pd.Series):
    y_pred_proba = y_pred_proba_pd.values
else:
    y_pred_proba = np.array(y_pred_proba_pd).flatten()

y_pred_proba = np.asarray(y_pred_proba, dtype=float)
print(f"  Extracted probabilities shape: {y_pred_proba.shape}")

# Get test labels (fallback when the label column did not come back)
if y_test_vals is None:
    print("  Note: CHURNED was not returned with the predictions; assuming rows came back in push order")
    y_test_vals = y_test_pd.values

print(f"\nShape check:")
print(f"  Test labels: {len(y_test_vals)}")
print(f"  Predictions: {len(y_pred_proba)}")

# A length mismatch means predictions and labels no longer describe the same
# rows. Truncating to fit would produce meaningless metrics, so stop instead.
if len(y_pred_proba) != len(y_test_vals):
    print(f"\n‚ö†Ô∏è  WARNING: Shape mismatch detected!")
    print(f"   Predictions: {len(y_pred_proba)}")
    print(f"   Test labels: {len(y_test_vals)}")
    raise ValueError(
        f"Predictions ({len(y_pred_proba)}) do not match test labels ({len(y_test_vals)})"
    )

# Check if probabilities are in 0-100 range and normalize
if y_pred_proba.max() > 1.0:
    print("‚ö†Ô∏è  Probabilities appear to be in 0-100 range, normalizing to 0-1...")
    y_pred_proba = y_pred_proba / 100.0

# Ensure probabilities are in valid range
y_pred_proba = np.clip(y_pred_proba, 0.0, 1.0)

# Summarise the distribution of the predicted churn probabilities.
print(
    f"\nPrediction Statistics:",
    f"  Mean probability: {y_pred_proba.mean():.4f}",
    f"  Min probability: {y_pred_proba.min():.4f}",
    f"  Max probability: {y_pred_proba.max():.4f}",
    f"  Std probability: {y_pred_proba.std():.4f}",
    sep="\n",
)

# Nearly identical probabilities for every row mean the model is effectively
# predicting a constant.
if y_pred_proba.std() < 0.01:
    print(
        "\n‚ö†Ô∏è  WARNING: Predictions have very low variance!",
        "   This suggests the model is not learning from the data.",
        sep="\n",
    )

# Turn probabilities into hard 0/1 predictions at a 0.5 threshold.
y_pred = (y_pred_proba >= 0.5).astype(int)

# Labels, probabilities and hard predictions must all have the same length.
n_labels, n_proba, n_pred = len(y_test_vals), len(y_pred_proba), len(y_pred)
assert n_labels == n_proba == n_pred, \
    f"Shape mismatch: y_test={n_labels}, y_pred_proba={n_proba}, y_pred={n_pred}"

# Threshold-based metrics use y_pred; AUC uses the raw probabilities.
accuracy = accuracy_score(y_test_vals, y_pred)
precision, recall, f1 = (
    scorer(y_test_vals, y_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_test_vals, y_pred_proba)

print("\n" + "=" * 60, "Model Performance Metrics", "=" * 60, sep="\n")
print(
    f"AUC-ROC:     {auc:.4f} ({auc*100:.2f}%)",
    f"Accuracy:    {accuracy:.4f} ({accuracy*100:.2f}%)",
    f"Precision:   {precision:.4f} ({precision*100:.2f}%)",
    f"Recall:      {recall:.4f} ({recall*100:.2f}%)",
    f"F1 Score:    {f1:.4f}",
    sep="\n",
)

# Grade the AUC against the notebook's two thresholds: below 0.55 is treated
# as no better than chance, below 0.70 as under target, anything else as fine.
if auc < 0.55:
    print(
        "\n‚ö†Ô∏è  WARNING: Model performance is very poor (AUC < 0.55)",
        "   This suggests the model is not learning effectively.",
        "   Recommendations:",
        "   1. Check data quality and feature engineering",
        "   2. Verify features are informative",
        "   3. Consider feature selection",
        "   4. Try different hyperparameters",
        sep="\n",
    )
elif auc < 0.70:
    print(
        "\n‚ö†Ô∏è  Model performance is below target (AUC < 0.70)",
        "   Consider improving features or hyperparameter tuning",
        sep="\n",
    )
else:
    print("\n‚úì Model performance is acceptable (AUC >= 0.70)")

# Confusion matrix, flattened row by row into its four counts. The printed
# layout puts actual classes on rows and predicted classes on columns, so for
# 0/1 labels the four values are read as TN, FP, FN, TP.
cm = confusion_matrix(y_test_vals, y_pred)
tn, fp, fn, tp = cm.reshape(-1)
print(
    "\nConfusion Matrix:",
    "                Predicted",
    "              Non-Churn  Churn",
    f"Actual Non-Churn   {tn:5d}   {fp:5d}",
    f"       Churn       {fn:5d}   {tp:5d}",
    sep="\n",
)

In [None]:
%python

# Cell 6: Save Model
# Save model to OML datastore.
# The datastore name keeps its historical 'xgboost' spelling so that anything
# loading the model by this name keeps working (presumably consumers exist
# outside this notebook; verify before renaming).
model_name = 'churn_xgboost_v1'

# Describe the algorithm that was actually trained. `model_type` is set by
# the training cell to "RandomForest" or "XGBoost"; the description used to
# say "XGBoost" unconditionally, mislabelling the usual RandomForest model.
model_label = f'{model_type} model' if model_type else 'model'
description = f'Churn prediction {model_label} - AUC: {auc:.4f}'

try:
    oml.ds.save(
        {'model': xgb_model},
        model_name,
        description=description,
        overwrite=True  # replace any datastore already saved under this name
    )
    print(f"‚úì Model saved to OML datastore: {model_name}")
except Exception as e:
    # Report the failure; the trained model is still available in this session.
    print(f"‚ùå ERROR: Failed to save model: {e}")