In [None]:
# 🚀 Snowflake ML Platform: Unified Training Pipeline

This notebook implements a production-ready training pipeline that sources features from the feature store and trains both supervised and unsupervised models in a unified workflow.

## 🎯 What We're Building
- **Feature Store Integration**: Source all features from centralized feature store
- **Supervised ML**: Advanced adverse event prediction models (XGBoost + Random Forest)
- **Unsupervised ML**: Patient segmentation and anomaly detection
- **Model Comparison**: Automated model selection and performance comparison
- **Pipeline Orchestration**: End-to-end training workflow automation
- **Model Registry**: Automated model registration and versioning

## 🏗️ Training Pipeline Architecture
```
Feature Store → Feature Loading → Data Preparation
     ↓                ↓               ↓
Model Training → Model Evaluation → Model Registry
     ↓                ↓               ↓
Supervised ML    Performance      Version Control
Unsupervised ML  Comparison       Metadata Tracking
```

## 🎯 Enterprise Benefits
- **Consistency**: Same features for all models (eliminates train/serve skew)
- **Automation**: Fully automated training pipeline with minimal manual intervention
- **Reproducibility**: Version-controlled models with complete lineage tracking
- **Scalability**: Distributed training across Snowflake's elastic compute
- **Governance**: Centralized model management with approval workflows
- **Efficiency**: Reusable training components across multiple models

## 🏥 Healthcare-Specific Value
- **Multi-Model Insights**: Combine supervised predictions with unsupervised insights
- **Clinical Validation**: Automated model performance benchmarking
- **Risk Stratification**: Integrated patient segmentation and risk scoring
- **Regulatory Compliance**: Complete model lineage and audit trails


In [None]:
# Import libraries for unified training pipeline
import datetime
import time
import uuid

from snowflake.snowpark import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark.functions import col, lit, current_timestamp, when
from snowflake.ml.modeling.ensemble import RandomForestClassifier
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.cluster import KMeans
from snowflake.ml.modeling.ensemble import IsolationForest
from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.modeling.metrics import accuracy_score, precision_score, recall_score, f1_score
from snowflake.ml.registry import Registry

# NOTE(review): `Model` does not appear to be exported by snowflake.ml.registry
# (the package exposes `Registry`). The legacy import is kept but guarded so a
# missing name cannot stop the notebook at the import cell.
try:
    from snowflake.ml.registry import Model
except ImportError:
    Model = None

print("🚀 Training Pipeline Libraries Loaded!")
print("🔗 Ready for unified ML training from Feature Store")

# Reuse the active Snowpark session if there is one, otherwise build a new one.
session = Session.builder.getOrCreate()

# Point the session at the database and default warehouse used by this pipeline.
session.use_database("ADVERSE_EVENT_MONITORING")
session.use_warehouse("ADVERSE_EVENT_WH")

print("✅ Session configured for training pipeline")
print(f"📍 Database: {session.get_current_database()}")
print(f"📍 Warehouse: {session.get_current_warehouse()}")

# Try to switch to the GPU resource for XGBoost. This is best-effort: when the
# switch fails the session stays on ADVERSE_EVENT_WH and training runs on CPU.
# Only `Exception` is caught so that Ctrl-C / kernel interrupts still propagate,
# and the failure reason is printed instead of being silently discarded.
try:
    session.sql("USE WAREHOUSE ML_GPU_POOL").collect()
    print("🔥 Using GPU compute pool for accelerated training")
    gpu_available = True
except Exception as gpu_error:
    print("💻 Using standard warehouse (GPU pool not available)")
    print(f"   ↳ Reason: {gpu_error}")
    gpu_available = False

print()


In [None]:
# Load the offline feature store, build the cleaned modelling frame, report basic
# data-quality numbers and create the train/test split used by later cells.
# Produces: supervised_features, unsupervised_features, target_variable,
# clean_training_data, class_ratio, train_data, test_data.
print("🏪 Loading Training Data from Feature Store...")
print("=" * 60)

# Load features from the centralized feature store
session.use_schema("FEATURE_STORE")

try:
    # Load training features from offline feature store
    training_data = session.table("OFFLINE_FEATURE_STORE")
    total_records = training_data.count()

    print(f"✅ Loaded training data from feature store: {total_records} patient records")

    # Define feature sets for different model types
    demographic_features = ["patient_age"]

    claims_features = [
        "total_claim_amount_sum",
        "num_claims",
        "avg_claim_amount",
        "claims_last_30d",
        "claims_last_90d"
    ]

    medical_features = [
        "num_conditions",
        "num_medications",
        "chronic_conditions_count"
    ]

    risk_features = [
        "comorbidity_score",
        "medication_complexity_score",
        "healthcare_utilization_score"
    ]

    # Complete feature set for supervised learning
    supervised_features = demographic_features + claims_features + medical_features + risk_features

    # Feature set for unsupervised learning (same columns; the target is never included)
    unsupervised_features = supervised_features.copy()

    target_variable = "adverse_event_target"

    print("📊 Feature Set Configuration:")
    print(f"   👤 Demographics: {len(demographic_features)} features")
    print(f"   💰 Claims: {len(claims_features)} features")
    print(f"   🏥 Medical: {len(medical_features)} features")
    print(f"   📈 Risk Scores: {len(risk_features)} features")
    print(f"   🎯 Total Supervised Features: {len(supervised_features)}")
    print(f"   🧠 Total Unsupervised Features: {len(unsupervised_features)}")

    # Prepare clean training dataset
    clean_training_data = training_data.select(
        col("entity_id").alias("patient_id"),
        *[col(feature) for feature in supervised_features],
        col(target_variable).alias("target")
    ).filter(
        # Only rows missing one of these key fields are dropped; the remaining
        # feature columns are NOT null-checked here.
        col("patient_age").isNotNull() &
        col("total_claim_amount_sum").isNotNull() &
        col("target").isNotNull()
    )

    clean_count = clean_training_data.count()
    if clean_count == 0:
        # Fail with a clear message instead of an obscure formatting or
        # division error further down.
        raise ValueError(
            "No usable training rows: every record is missing patient_age, "
            "total_claim_amount_sum or the target"
        )
    print(f"✅ Prepared clean training dataset: {clean_count} patients")

    # Show data quality metrics.
    # Aggregates come from snowflake.snowpark.functions (imported as F);
    # Snowpark Column objects do not provide .count()/.avg() methods.
    target_distribution = clean_training_data.group_by("target").agg(
        F.count(col("target")).alias("count")
    ).collect()

    print("\n📊 Training Data Quality Metrics:")
    for row in target_distribution:
        label = "Adverse Event" if row["TARGET"] else "No Adverse Event"
        n_patients = row["COUNT"]
        share = (n_patients / clean_count) * 100
        print(f"   • {label}: {n_patients} patients ({share:.1f}%)")

    # Calculate class balance (negative:positive); falls back to 1 when there
    # are no positive cases so the ratio stays usable as a weight.
    positive_cases = sum(row["COUNT"] for row in target_distribution if row["TARGET"])
    negative_cases = sum(row["COUNT"] for row in target_distribution if not row["TARGET"])
    class_ratio = negative_cases / positive_cases if positive_cases > 0 else 1

    print(f"   • Class Imbalance Ratio: {class_ratio:.1f}:1 (negative:positive)")

    # Show feature statistics
    feature_stats = clean_training_data.agg(
        F.avg(col("patient_age")).alias("avg_age"),
        F.avg(col("total_claim_amount_sum")).alias("avg_claims"),
        F.avg(col("num_conditions")).alias("avg_conditions"),
        F.avg(col("comorbidity_score")).alias("avg_comorbidity")
    ).collect()[0]

    print("\n📈 Feature Statistics:")
    print(f"   • Average Age: {feature_stats['AVG_AGE']:.1f} years")
    print(f"   • Average Claims: ${feature_stats['AVG_CLAIMS']:,.0f}")
    print(f"   • Average Conditions: {feature_stats['AVG_CONDITIONS']:.1f}")
    print(f"   • Average Comorbidity Score: {feature_stats['AVG_COMORBIDITY']:.2f}")

    # Split data for training and testing (seeded 80/20 split)
    print("\n📊 Splitting Data for Training and Validation...")
    train_data, test_data = clean_training_data.random_split([0.8, 0.2], seed=42)

    train_count = train_data.count()
    test_count = test_data.count()

    print(f"   • Training Set: {train_count} patients ({(train_count/clean_count)*100:.1f}%)")
    print(f"   • Test Set: {test_count} patients ({(test_count/clean_count)*100:.1f}%)")

    print("\n✅ Feature Store integration successful - ready for training!")

except Exception as e:
    # Surface a hint, then re-raise: later cells cannot run without this data.
    print(f"❌ Error loading from feature store: {e}")
    print("💡 Make sure you've run notebooks 09 (Feature Store) and 10 (Unsupervised ML)")
    raise


In [None]:
# Train XGBoost and Random Forest classifiers on train_data, score both on
# test_data, and pick the model with the highest F1-score.
# Produces: training_results, best_model, best_model_name, best_f1.
print("\n🎯 Supervised ML Training: Adverse Event Prediction...")
print("=" * 60)


def evaluate_classifier(fitted_model, eval_data, prediction_col, label_col="target"):
    """Score a fitted classifier on ``eval_data``.

    Args:
        fitted_model: Fitted estimator exposing ``predict(dataframe)``.
        eval_data: DataFrame holding the feature columns and ``label_col``.
        prediction_col: Name of the column the estimator writes predictions to.
        label_col: Name of the ground-truth column.

    Returns:
        ``(predictions, metrics)`` where ``predictions`` is the DataFrame returned
        by ``predict`` and ``metrics`` is a dict with the keys ``accuracy``,
        ``precision``, ``recall`` and ``f1_score``.
    """
    predictions = fitted_model.predict(eval_data)
    scoring_args = {
        "df": predictions,
        "y_true_col_names": label_col,
        "y_pred_col_names": prediction_col,
    }
    metrics = {
        'accuracy': accuracy_score(**scoring_args),
        'precision': precision_score(**scoring_args),
        'recall': recall_score(**scoring_args),
        'f1_score': f1_score(**scoring_args),
    }
    return predictions, metrics


def print_metrics(display_name, metrics):
    """Print the four evaluation metrics of one model."""
    print(f"📊 {display_name} Performance:")
    print(f"   • Accuracy: {metrics['accuracy']:.4f}")
    print(f"   • Precision: {metrics['precision']:.4f}")
    print(f"   • Recall: {metrics['recall']:.4f}")
    print(f"   • F1-Score: {metrics['f1_score']:.4f}")


def select_best_model(results_by_name, metric="f1_score"):
    """Return ``(name, results)`` of the entry with the highest ``metric``.

    Entries whose value is ``None`` (failed trainings) are ignored. Ties keep the
    earliest entry. A model is selected even when its score is 0, as long as at
    least one model trained. Returns ``(None, None)`` when nothing trained.
    """
    best_name, best_results = None, None
    for name, results in results_by_name.items():
        if results is None:
            continue
        if best_results is None or results[metric] > best_results[metric]:
            best_name, best_results = name, results
    return best_name, best_results


# Store training results for comparison
training_results = {}

# Train XGBoost Model (with GPU acceleration if available)
print("🔥 Training XGBoost Model...")
start_time = time.time()

try:
    # Configure XGBoost with healthcare-optimized parameters.
    # early_stopping_rounds is intentionally NOT set: XGBoost requires a
    # validation set for early stopping and fit() is only given train_data,
    # so setting it makes training fail.
    xgb_params = {
        'n_estimators': 200,
        'max_depth': 8,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'scale_pos_weight': class_ratio,  # Handle class imbalance
        'eval_metric': 'auc',
        'random_state': 42,
        'n_jobs': -1
    }

    # Add GPU-specific parameters if available
    if gpu_available:
        xgb_params.update({
            'tree_method': 'gpu_hist',
            'gpu_id': 0
        })
        print("   🚀 Using GPU acceleration for XGBoost training")

    # Initialize XGBoost classifier
    xgb_model = XGBClassifier(
        input_cols=supervised_features,
        output_cols=["XGB_PREDICTION"],
        label_cols=["target"],
        **xgb_params
    )

    # Train XGBoost model
    fitted_xgb = xgb_model.fit(train_data)
    xgb_training_time = time.time() - start_time

    print(f"✅ XGBoost training completed in {xgb_training_time:.1f} seconds")

    # Evaluate XGBoost model on the held-out split
    xgb_predictions, xgb_metrics = evaluate_classifier(fitted_xgb, test_data, "XGB_PREDICTION")

    # Store results
    training_results['XGBoost'] = {
        'model': fitted_xgb,
        'predictions': xgb_predictions,
        **xgb_metrics,
        'training_time': xgb_training_time,
        'model_type': 'XGBoost'
    }

    print_metrics("XGBoost", xgb_metrics)

except Exception as e:
    # Best-effort: a failed model is recorded as None so the other can still run.
    print(f"❌ XGBoost training failed: {e}")
    training_results['XGBoost'] = None

# Train Random Forest Model (for comparison)
print("\n🌲 Training Random Forest Model...")
start_time = time.time()

try:
    # Initialize Random Forest classifier
    rf_model = RandomForestClassifier(
        input_cols=supervised_features,
        output_cols=["RF_PREDICTION"],
        label_cols=["target"],
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )

    # Train Random Forest model
    fitted_rf = rf_model.fit(train_data)
    rf_training_time = time.time() - start_time

    print(f"✅ Random Forest training completed in {rf_training_time:.1f} seconds")

    # Evaluate Random Forest model on the held-out split
    rf_predictions, rf_metrics = evaluate_classifier(fitted_rf, test_data, "RF_PREDICTION")

    # Store results
    training_results['RandomForest'] = {
        'model': fitted_rf,
        'predictions': rf_predictions,
        **rf_metrics,
        'training_time': rf_training_time,
        'model_type': 'RandomForest'
    }

    print_metrics("Random Forest", rf_metrics)

except Exception as e:
    print(f"❌ Random Forest training failed: {e}")
    training_results['RandomForest'] = None

# Model Comparison and Selection
print("\n🏆 Model Performance Comparison:")
print("=" * 50)

for candidate_name, candidate in training_results.items():
    if candidate is None:
        continue
    print(f"\n📊 {candidate_name} Summary:")
    print(f"   🎯 F1-Score: {candidate['f1_score']:.4f}")
    print(f"   ⚡ Training Time: {candidate['training_time']:.1f}s")
    print(f"   📈 Accuracy: {candidate['accuracy']:.4f}")
    print(f"   🔍 Precision: {candidate['precision']:.4f}")
    print(f"   📊 Recall: {candidate['recall']:.4f}")

# Select best model based on F1-score (important for healthcare).
# best_model_name is always defined (None when nothing trained) so later cells
# can reference it safely.
best_model_name, best_model = select_best_model(training_results)
best_f1 = best_model['f1_score'] if best_model is not None else 0

if best_model is not None:
    print(f"\n🥇 Best Model: {best_model_name}")
    print(f"   🏆 F1-Score: {best_f1:.4f}")
    print("   💡 Selected for deployment based on balanced performance")
else:
    print("\n❌ No models trained successfully")

print("\n✅ Supervised ML training completed!")


In [None]:
# Scale the feature columns, segment patients with K-Means, flag outliers with
# Isolation Forest, then print a summary of the whole training run.
# Produces: unsupervised_results, total_training_time.
print("\n🧠 Unsupervised ML Training: Patient Segmentation...")
print("=" * 60)

# Train unsupervised models for additional insights
unsupervised_results = {}

# Prepare data for unsupervised learning (no target variable)
unsupervised_data = clean_training_data.select(
    "patient_id",
    *unsupervised_features
)

# Feature scaling for unsupervised learning
print("⚙️ Scaling features for unsupervised learning...")

try:
    # Initialize and apply StandardScaler; scaled copies get a _SCALED suffix
    scaled_features = [f"{feature}_SCALED" for feature in unsupervised_features]
    scaler = StandardScaler(
        input_cols=unsupervised_features,
        output_cols=scaled_features
    )

    scaled_unsupervised_data = scaler.fit(unsupervised_data).transform(unsupervised_data)

    print("✅ Feature scaling completed")

    # Train K-Means clustering for patient segmentation
    print("\n🎯 Training K-Means clustering...")
    start_time = time.time()

    n_clusters = 4  # Healthcare risk segments: Low, Medium, High, Critical
    kmeans = KMeans(
        n_clusters=n_clusters,
        input_cols=scaled_features,
        output_cols=["CLUSTER_ID"],
        random_state=42,
        max_iter=100
    )

    fitted_kmeans = kmeans.fit(scaled_unsupervised_data)
    clustering_time = time.time() - start_time

    print(f"✅ K-Means clustering completed in {clustering_time:.1f} seconds")

    # Apply clustering
    clustered_data = fitted_kmeans.predict(scaled_unsupervised_data)

    # Analyze cluster characteristics.
    # Aggregates come from snowflake.snowpark.functions (imported as F);
    # Snowpark Column objects do not provide .count()/.avg() methods.
    cluster_analysis = clustered_data.group_by("CLUSTER_ID").agg(
        F.count(col("patient_id")).alias("patient_count"),
        F.avg(col("patient_age")).alias("avg_age"),
        F.avg(col("comorbidity_score")).alias("avg_comorbidity"),
        F.avg(col("healthcare_utilization_score")).alias("avg_utilization")
    ).collect()

    # K-Means cluster ids are arbitrary labels, so a risk level cannot be read
    # off the id itself. Rank the clusters by mean comorbidity instead and
    # assign LOW..CRITICAL in ascending order of that score.
    risk_levels = ["🟢 LOW", "🟡 MEDIUM", "🟠 HIGH", "🔴 CRITICAL"]
    ranked_clusters = sorted(
        cluster_analysis,
        key=lambda row: (
            row["AVG_COMORBIDITY"] if row["AVG_COMORBIDITY"] is not None else float("-inf")
        ),
    )

    print("\n📊 Patient Cluster Analysis:")
    for rank, cluster in enumerate(ranked_clusters):
        cluster_id = cluster["CLUSTER_ID"]
        patient_count = cluster["PATIENT_COUNT"]
        avg_age = cluster["AVG_AGE"]
        avg_comorbidity = cluster["AVG_COMORBIDITY"]
        risk_level = risk_levels[min(rank, len(risk_levels) - 1)]

        print(f"   Cluster {cluster_id} ({risk_level}): {patient_count} patients, Age: {avg_age:.1f}, Comorbidity: {avg_comorbidity:.2f}")

    unsupervised_results['KMeans'] = {
        'model': fitted_kmeans,
        'data': clustered_data,
        'training_time': clustering_time,
        'n_clusters': n_clusters
    }

    # Train Isolation Forest for anomaly detection
    print("\n🚨 Training Isolation Forest for anomaly detection...")
    start_time = time.time()

    isolation_forest = IsolationForest(
        input_cols=scaled_features,
        output_cols=["ANOMALY_SCORE"],
        contamination=0.1,  # Expect ~10% anomalies
        random_state=42,
        n_estimators=100
    )

    fitted_isolation_forest = isolation_forest.fit(scaled_unsupervised_data)
    anomaly_time = time.time() - start_time

    print(f"✅ Isolation Forest completed in {anomaly_time:.1f} seconds")

    # Apply anomaly detection
    anomaly_data = fitted_isolation_forest.predict(scaled_unsupervised_data)

    # Count anomalies.
    # NOTE(review): predict() is expected to emit scikit-learn style labels
    # (-1 = outlier, +1 = inlier) rather than a continuous score, so any
    # negative value marks an anomaly - confirm against the installed version.
    anomaly_count = anomaly_data.filter(col("ANOMALY_SCORE") < 0).count()
    total_patients = anomaly_data.count()
    anomaly_rate = (anomaly_count / total_patients) * 100 if total_patients else 0.0

    print("📊 Anomaly Detection Results:")
    print(f"   🚨 Anomalous patients: {anomaly_count} ({anomaly_rate:.1f}%)")
    print(f"   ✅ Normal patients: {total_patients - anomaly_count} ({100-anomaly_rate:.1f}%)")

    unsupervised_results['IsolationForest'] = {
        'model': fitted_isolation_forest,
        'data': anomaly_data,
        'training_time': anomaly_time,
        'anomaly_count': anomaly_count,
        'anomaly_rate': anomaly_rate,
        # Stored so later cells do not depend on a leftover global variable.
        'total_patients': total_patients
    }

    print("\n✅ Unsupervised ML training completed!")

except Exception as e:
    # Best-effort: supervised results stay usable when this stage fails.
    print(f"❌ Unsupervised ML training failed: {e}")
    print("📝 Continuing with supervised models only...")

print("\n📊 Complete Training Pipeline Results:")
print("=" * 50)

successful_supervised = [r for r in training_results.values() if r is not None]
successful_unsupervised = [r for r in unsupervised_results.values() if r is not None]

total_training_time = (
    sum(result['training_time'] for result in successful_supervised)
    + sum(result['training_time'] for result in successful_unsupervised)
)

print(f"⏱️  Total Training Time: {total_training_time:.1f} seconds")
print(f"🎯 Supervised Models: {len(successful_supervised)}")
print(f"🧠 Unsupervised Models: {len(successful_unsupervised)}")

if gpu_available:
    print("🔥 GPU Acceleration: Enabled")
else:
    print("💻 Compute: Standard CPU")


In [None]:
# Register the best supervised model in the Snowflake Model Registry, mirror its
# metrics into the custom MODEL_REGISTRY table, persist the unsupervised
# summaries, and print the final pipeline report.
# Produces: registry_results.
print("\n📋 Model Registry Integration...")
print("=" * 60)

# Register best performing models in Snowflake Model Registry
session.use_schema("ML_MODELS")

registry_results = {}

try:
    # Register the best supervised model
    if best_model is not None:
        print(f"📝 Registering best supervised model: {best_model_name}")

        # Generate unique model version (timestamp based, valid as an identifier)
        model_version = f"v{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
        model_name = f"ADVERSE_EVENT_PREDICTOR_{best_model['model_type'].upper()}"

        # Prepare model metadata
        model_metadata = {
            "description": f"{best_model['model_type']} model for adverse health event prediction",
            "model_type": "BINARY_CLASSIFICATION",
            "target_variable": "adverse_event_target",
            "feature_count": len(supervised_features),
            "features_used": supervised_features,
            "training_data_source": "FEATURE_STORE.OFFLINE_FEATURE_STORE",
            "accuracy": float(best_model['accuracy']),
            "precision": float(best_model['precision']),
            "recall": float(best_model['recall']),
            "f1_score": float(best_model['f1_score']),
            "training_time_seconds": float(best_model['training_time']),
            "class_imbalance_ratio": float(class_ratio),
            "gpu_accelerated": gpu_available,
            "training_date": str(datetime.datetime.now()),
            "feature_store_version": "1.0",
            "pipeline_version": "unified_training_v1.0"
        }

        # Register model in Snowflake Model Registry.
        # The Registry is created without explicit names, so it targets the
        # session's current database/schema (ML_MODELS, set above). The
        # metadata dict is attached to the version as its metrics.
        model_registry = Registry(session=session)
        registered_model = model_registry.log_model(
            best_model['model'],
            model_name=model_name,
            version_name=model_version,
            comment=f"Best performing {best_model['model_type']} model from unified training pipeline",
            metrics=model_metadata,
        )

        # The registry identifies a model by name + version and exposes no
        # separate id, so generate one for the custom tracking table.
        model_id = str(uuid.uuid4())

        print(f"✅ Supervised model registered: {model_name} {model_version}")
        print(f"   🎯 F1-Score: {best_model['f1_score']:.4f}")
        print(f"   📊 Model ID: {model_id}")

        registry_results['supervised'] = {
            'model_name': model_name,
            'version': model_version,
            'model_id': model_id,
            'metadata': model_metadata
        }

        # Also update custom MODEL_REGISTRY table. Values are passed as bind
        # parameters instead of being formatted into the SQL text.
        session.sql(
            """
            INSERT INTO MODEL_REGISTRY (
                model_id, model_name, model_type, model_version, training_date,
                accuracy_score, precision_score, recall_score, f1_score,
                model_status, created_by
            ) VALUES (
                ?,
                ?,
                'CLASSIFICATION',
                ?,
                CURRENT_TIMESTAMP(),
                ?,
                ?,
                ?,
                ?,
                'STAGING',
                CURRENT_USER()
            )
            """,
            params=[
                model_id,
                model_name,
                model_version,
                float(best_model['accuracy']),
                float(best_model['precision']),
                float(best_model['recall']),
                float(best_model['f1_score']),
            ],
        ).collect()

    else:
        print("⚠️ No supervised model available for registration")

    # Save unsupervised model results to feature store
    if unsupervised_results:
        print("\n💾 Saving unsupervised ML results...")

        session.use_schema("FEATURE_STORE")

        # Update the unsupervised ML insights table with new training results
        if 'KMeans' in unsupervised_results:
            clustering_data = unsupervised_results['KMeans']['data']

            # Save clustering results.
            # Aggregates come from snowflake.snowpark.functions (imported as F);
            # Snowpark Column objects do not provide .count()/.avg() methods.
            clustering_summary = clustering_data.group_by("CLUSTER_ID").agg(
                F.count(col("patient_id")).alias("patient_count"),
                F.avg(col("patient_age")).alias("avg_age"),
                F.avg(col("comorbidity_score")).alias("avg_comorbidity")
            ).with_column(
                "training_date",
                lit(datetime.datetime.now())
            ).with_column(
                "model_version",
                lit("unified_pipeline_v1.0")
            )

            # Create or update clustering insights table
            session.sql("""
                CREATE TABLE IF NOT EXISTS CLUSTERING_INSIGHTS (
                    cluster_id INTEGER,
                    patient_count INTEGER,
                    avg_age FLOAT,
                    avg_comorbidity FLOAT,
                    training_date TIMESTAMP,
                    model_version VARCHAR(50)
                )
            """).collect()

            clustering_summary.write.mode("append").save_as_table("CLUSTERING_INSIGHTS")
            print("   ✅ K-Means clustering insights saved")

        if 'IsolationForest' in unsupervised_results:
            isolation_results = unsupervised_results['IsolationForest']
            anomaly_rate = isolation_results['anomaly_rate']

            # Take the patient total from the stored results; fall back to
            # counting the scored rows when the entry does not carry it, rather
            # than relying on a variable left over from an earlier cell.
            total_patients = isolation_results.get('total_patients')
            if total_patients is None:
                total_patients = isolation_results['data'].count()

            # Log anomaly detection results
            session.sql("""
                CREATE TABLE IF NOT EXISTS ANOMALY_DETECTION_LOGS (
                    training_date TIMESTAMP,
                    anomaly_rate FLOAT,
                    total_patients INTEGER,
                    model_version VARCHAR(50)
                )
            """).collect()

            session.sql(
                """
                INSERT INTO ANOMALY_DETECTION_LOGS VALUES (
                    CURRENT_TIMESTAMP(),
                    ?,
                    ?,
                    'unified_pipeline_v1.0'
                )
                """,
                params=[float(anomaly_rate), int(total_patients)],
            ).collect()

            print("   ✅ Anomaly detection results logged")

    print("\n🏆 Training Pipeline Completion Summary:")
    print("=" * 50)
    print("✅ Feature Store Integration: Complete")
    print(f"✅ Supervised ML Training: {len([r for r in training_results.values() if r is not None])} models")
    print(f"✅ Unsupervised ML Training: {len([r for r in unsupervised_results.values() if r is not None])} models")
    print(f"✅ Model Registry: {'Complete' if registry_results else 'Partial'}")
    print(f"✅ Total Training Time: {total_training_time:.1f} seconds")

    if best_model:
        print("\n🎯 Best Model for Deployment:")
        print(f"   📛 Name: {best_model_name}")
        print(f"   🏆 F1-Score: {best_model['f1_score']:.4f}")
        print(f"   📊 Model ID: {registry_results.get('supervised', {}).get('model_id', 'N/A')}")

    print("\n📋 Next Steps:")
    print("   1. Use `12_Inference_Pipeline` for real-time predictions")
    print("   2. Use `13_ML_Platform_Demo` for complete platform showcase")
    print("   3. Review models in Snowsight Model Registry")
    print("   4. Deploy best model for production inference")

    print("\n🚀 Unified Training Pipeline Complete!")
    print("🎉 Ready for production deployment and inference")

except Exception as e:
    # Registration is best-effort: trained models remain in memory
    # (training_results / unsupervised_results) for manual registration.
    print(f"❌ Error in model registry integration: {e}")
    print("💡 Models trained successfully but registration failed")
    print("📝 Manual registration may be required")
