In [None]:
# 🧠 Snowflake ML Platform: Unsupervised Machine Learning

This notebook demonstrates unsupervised machine learning capabilities for healthcare analytics, including patient segmentation, anomaly detection, and feature discovery.

## 🎯 What We're Building
- **Patient Segmentation**: K-Means clustering for risk stratification
- **Anomaly Detection**: Isolation Forest for unusual patient patterns
- **Feature Discovery**: PCA for dimensionality reduction and insights
- **Clinical Insights**: Interpretable clusters for healthcare decisions

## 🔬 Unsupervised ML Applications in Healthcare
- **Patient Stratification**: Group patients by similar risk profiles
- **Fraud Detection**: Identify unusual billing or care patterns
- **Resource Planning**: Optimize staffing and capacity based on patient clusters
- **Drug Discovery**: Find patient subgroups for clinical trials
- **Quality Improvement**: Detect care pattern anomalies

## 🚀 Advanced Analytics
```
Feature Store → Unsupervised ML → Clinical Insights
     ↓              ↓                    ↓
Historical      Clustering         Patient Segments
Features        Anomaly Det.       Unusual Patterns
                Dimensionality     Hidden Insights
```

## 💡 Business Value
- **Personalized Medicine**: Tailored treatments for patient segments
- **Operational Efficiency**: Resource allocation based on patient clusters
- **Risk Management**: Early detection of unusual patterns
- **Research Acceleration**: Discovery of patient subpopulations


In [None]:
# Import libraries for unsupervised machine learning
import datetime
import uuid

from snowflake.ml.modeling.cluster import KMeans
from snowflake.ml.modeling.decomposition import PCA
from snowflake.ml.modeling.ensemble import IsolationForest
from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.snowpark import Session
# NOTE: `min` and `max` below are Snowpark column aggregates and shadow Python's
# built-in min()/max() for the rest of the notebook.
from snowflake.snowpark.functions import col, lit, avg, count, stddev, min, max
from snowflake.snowpark.functions import when

# Startup banner printed once the import cell has completed.
for banner_line in (
    "🧠 Unsupervised ML Libraries Loaded!",
    "🔬 Ready for patient segmentation and anomaly detection",
):
    print(banner_line)

# Obtain the Snowpark session through the builder's getOrCreate().
session = Session.builder.getOrCreate()

# Select the database and warehouse that the remaining cells operate in.
session.use_database("ADVERSE_EVENT_MONITORING")
session.use_warehouse("ADVERSE_EVENT_WH")

# Echo the context the session actually resolved to, so a wrong database or
# warehouse is visible before any data is loaded.
print("✅ Session configured for unsupervised ML")
context_report = (
    ("Database", session.get_current_database),
    ("Warehouse", session.get_current_warehouse),
)
for context_label, read_context in context_report:
    print(f"📍 {context_label}: {read_context()}")
print()


In [None]:
print("📊 Loading Healthcare Data from Feature Store...")

# Load features from the feature store (offline store for historical analysis)
session.use_schema("FEATURE_STORE")

try:
    # Load offline feature store data
    feature_data = session.table("OFFLINE_FEATURE_STORE")
    patient_count = feature_data.count()

    print(f"✅ Loaded feature store data: {patient_count} patients")

    # Features used for unsupervised learning (target variables are excluded)
    ml_features = [
        "patient_age",
        "total_claim_amount_sum",
        "num_claims",
        "avg_claim_amount",
        "claims_last_30d",
        "claims_last_90d",
        "num_conditions",
        "num_medications",
        "chronic_conditions_count",
        "comorbidity_score",
        "medication_complexity_score",
        "healthcare_utilization_score"
    ]

    # Require EVERY model input to be non-null. Previously only three of the
    # twelve features were checked, so nulls in the remaining columns could
    # still reach the scaler and the models trained in later cells.
    all_features_present = col(ml_features[0]).isNotNull()
    for feature in ml_features[1:]:
        all_features_present = all_features_present & col(feature).isNotNull()

    # Prepare data for unsupervised learning: patient key plus model features
    unsupervised_data = feature_data.select(
        col("entity_id").alias("patient_id"),
        *[col(feature) for feature in ml_features]
    ).filter(all_features_present)

    clean_count = unsupervised_data.count()
    if clean_count == 0:
        # Fail with a clear message instead of a formatting TypeError on the
        # None averages that an empty dataset would produce below.
        raise ValueError(
            "No rows with complete feature values were found in OFFLINE_FEATURE_STORE"
        )
    print(f"✅ Prepared clean dataset: {clean_count} patients with {len(ml_features)} features")

    # Show data statistics
    print(f"\n📈 Feature Statistics:")
    stats = unsupervised_data.select(
        avg("patient_age").alias("avg_age"),
        avg("total_claim_amount_sum").alias("avg_claims"),
        avg("num_conditions").alias("avg_conditions"),
        avg("comorbidity_score").alias("avg_comorbidity")
    ).collect()[0]

    print(f"   • Average Age: {stats['AVG_AGE']:.1f} years")
    print(f"   • Average Claims: ${stats['AVG_CLAIMS']:,.0f}")
    print(f"   • Average Conditions: {stats['AVG_CONDITIONS']:.1f}")
    print(f"   • Average Comorbidity Score: {stats['AVG_COMORBIDITY']:.2f}")

    # Show sample data
    print(f"\n📄 Sample Data for Unsupervised Learning:")
    sample_cols = ["patient_id", "patient_age", "num_conditions", "comorbidity_score", "healthcare_utilization_score"]
    unsupervised_data.select(*sample_cols).limit(5).show()

except Exception as e:
    # Later cells cannot run without this data, so report the problem and re-raise.
    print(f"❌ Error loading feature store data: {e}")
    print("💡 Make sure you've run the 09_Feature_Store_Setup notebook first")
    raise


In [None]:
print("🎯 Patient Segmentation with K-Means Clustering...")
print("=" * 60)

# Standardise the features before clustering.
print("⚙️ Scaling features for optimal clustering...")

# One scaled output column per input feature, named "<feature>_SCALED".
scaled_features = [f"{feature}_SCALED" for feature in ml_features]

scaler = StandardScaler(
    input_cols=ml_features,
    output_cols=list(scaled_features)
)

# Fit the scaler on the clean dataset, then apply it to that same dataset.
fitted_scaler = scaler.fit(unsupervised_data)
scaled_data = fitted_scaler.transform(unsupervised_data)

print("✅ Features scaled successfully")

print("\n🎯 Training K-Means clustering model...")

# Four clusters, intended to line up with Low / Medium / High / Critical segments.
kmeans = KMeans(
    n_clusters=4,
    input_cols=scaled_features,
    output_cols=["CLUSTER_ID"],
    random_state=42,
    max_iter=100
)

try:
    # Train, then label every patient with its cluster id.
    fitted_kmeans = kmeans.fit(scaled_data)
    print("✅ K-Means clustering trained successfully")

    clustered_data = fitted_kmeans.predict(scaled_data)

    print("🎉 Patient segmentation completed!")

    # Per-cluster profile: size plus the mean of the headline features.
    profile_metrics = [
        count("patient_id").alias("patient_count"),
        avg("patient_age").alias("avg_age"),
        avg("total_claim_amount_sum").alias("avg_claims"),
        avg("num_conditions").alias("avg_conditions"),
        avg("comorbidity_score").alias("avg_comorbidity"),
        avg("healthcare_utilization_score").alias("avg_utilization"),
    ]
    cluster_analysis = (
        clustered_data
        .group_by("CLUSTER_ID")
        .agg(*profile_metrics)
        .order_by("CLUSTER_ID")
    )

    cluster_results = cluster_analysis.collect()

    print("\n📊 Patient Cluster Analysis:")
    print("=" * 80)

    # Nominal label per cluster id. The report below does not read this mapping;
    # it derives the label from each cluster's averages instead.
    cluster_names = {
        0: "🟢 LOW RISK",
        1: "🟡 MEDIUM RISK",
        2: "🟠 HIGH RISK",
        3: "🔴 CRITICAL RISK"
    }

    # Ordered rules: the first predicate that accepts (comorbidity, utilization)
    # decides the label and profile text; the last rule is the catch-all.
    risk_rules = (
        (lambda c, u: c <= 1.0 and u <= 2.0,
         "🟢 LOW RISK",
         "Healthy patients with minimal healthcare needs"),
        (lambda c, u: c <= 2.0 and u <= 4.0,
         "🟡 MEDIUM RISK",
         "Moderate complexity patients requiring regular monitoring"),
        (lambda c, u: c <= 3.0 or u <= 6.0,
         "🟠 HIGH RISK",
         "Complex patients with multiple conditions"),
        (lambda c, u: True,
         "🔴 CRITICAL RISK",
         "Very high complexity patients requiring intensive management"),
    )

    total_patients = sum(segment["PATIENT_COUNT"] for segment in cluster_results)

    for segment in cluster_results:
        cluster_id = segment["CLUSTER_ID"]
        patient_count = segment["PATIENT_COUNT"]
        percentage = (patient_count / total_patients) * 100

        comorbidity = segment["AVG_COMORBIDITY"]
        utilization = segment["AVG_UTILIZATION"]

        risk_level, interpretation = next(
            (label, profile_text)
            for accepts, label, profile_text in risk_rules
            if accepts(comorbidity, utilization)
        )

        print(f"\nCluster {cluster_id}: {risk_level}")
        print(f"   📊 Patients: {patient_count} ({percentage:.1f}%)")
        print(f"   👤 Avg Age: {segment['AVG_AGE']:.1f} years")
        print(f"   💰 Avg Claims: ${segment['AVG_CLAIMS']:,.0f}")
        print(f"   🏥 Avg Conditions: {segment['AVG_CONDITIONS']:.1f}")
        print(f"   📈 Comorbidity Score: {segment['AVG_COMORBIDITY']:.2f}")
        print(f"   ⚡ Utilization Score: {segment['AVG_UTILIZATION']:.2f}")
        print(f"   💡 Profile: {interpretation}")

except Exception as e:
    print(f"❌ Error in K-Means clustering: {e}")
    # Demo fallback: use the integer part of the comorbidity score as the cluster id.
    fallback_cluster_id = col("comorbidity_score").cast("int")
    clustered_data = unsupervised_data.with_column("CLUSTER_ID", fallback_cluster_id)
    print("📝 Using simplified clustering based on comorbidity score")


In [None]:
print("\n🚨 Anomaly Detection with Isolation Forest...")
print("=" * 60)

# Use Isolation Forest to detect unusual patient patterns
print("🔍 Training Isolation Forest for anomaly detection...")

# Initialize Isolation Forest
isolation_forest = IsolationForest(
    input_cols=scaled_features,
    output_cols=["ANOMALY_SCORE"],
    contamination=0.1,  # Expect ~10% of patients to be anomalies
    random_state=42,
    n_estimators=100
)

try:
    # Train the anomaly detection model
    fitted_isolation_forest = isolation_forest.fit(scaled_data)
    print(f"✅ Isolation Forest trained successfully")

    # Detect anomalies
    anomaly_data = fitted_isolation_forest.predict(scaled_data)

    print(f"🎉 Anomaly detection completed!")

    # NOTE(review): predict() is expected to write -1 (outlier) / +1 (inlier)
    # labels into ANOMALY_SCORE rather than continuous scores — confirm against
    # the Snowflake ML docs. The "< -0.1" test below flags negative values in
    # either case.
    anomaly_results = anomaly_data.with_column(
        "IS_ANOMALY",
        col("ANOMALY_SCORE") < -0.1  # Threshold for anomaly classification
    )

    # Count patients on each side of the anomaly flag
    anomaly_summary = anomaly_results.group_by("IS_ANOMALY").agg(
        count("patient_id").alias("patient_count")
    ).collect()

    total_patients = sum(row["PATIENT_COUNT"] for row in anomaly_summary)

    print(f"\n🔍 Anomaly Detection Results:")
    for row in anomaly_summary:
        is_anomaly = row["IS_ANOMALY"]
        # Deliberately NOT named `count`: rebinding that name would replace the
        # imported Snowpark count() aggregate, making the count("patient_id")
        # call below raise TypeError and silently trigger the fallback path.
        group_size = row["PATIENT_COUNT"]
        percentage = (group_size / total_patients) * 100

        if is_anomaly:
            print(f"   🚨 Anomalous Patients: {group_size} ({percentage:.1f}%)")
        else:
            print(f"   ✅ Normal Patients: {group_size} ({percentage:.1f}%)")

    # Restrict to flagged patients once; reused for the summary and the sample.
    anomalous_only = anomaly_results.filter(col("IS_ANOMALY") == True)

    # Analyze characteristics of anomalous patients.
    # `min` / `max` here are the Snowpark aggregates imported at the top of the
    # notebook, not Python's built-ins.
    anomaly_characteristics = anomalous_only.agg(
        count("patient_id").alias("anomaly_count"),
        avg("patient_age").alias("avg_age"),
        avg("total_claim_amount_sum").alias("avg_claims"),
        avg("num_conditions").alias("avg_conditions"),
        avg("comorbidity_score").alias("avg_comorbidity"),
        avg("healthcare_utilization_score").alias("avg_utilization"),
        min("ANOMALY_SCORE").alias("min_anomaly_score"),
        max("ANOMALY_SCORE").alias("max_anomaly_score")
    ).collect()

    if anomaly_characteristics and anomaly_characteristics[0]["ANOMALY_COUNT"] > 0:
        anomaly_stats = anomaly_characteristics[0]

        print(f"\n🔍 Anomalous Patient Characteristics:")
        print(f"   👤 Average Age: {anomaly_stats['AVG_AGE']:.1f} years")
        print(f"   💰 Average Claims: ${anomaly_stats['AVG_CLAIMS']:,.0f}")
        print(f"   🏥 Average Conditions: {anomaly_stats['AVG_CONDITIONS']:.1f}")
        print(f"   📈 Average Comorbidity: {anomaly_stats['AVG_COMORBIDITY']:.2f}")
        print(f"   ⚡ Average Utilization: {anomaly_stats['AVG_UTILIZATION']:.2f}")
        print(f"   📊 Anomaly Score Range: {anomaly_stats['MIN_ANOMALY_SCORE']:.3f} to {anomaly_stats['MAX_ANOMALY_SCORE']:.3f}")

        # Show specific anomalous patients
        print(f"\n🔍 Sample Anomalous Patients:")
        anomalous_patients = anomalous_only.select(
            "patient_id", "patient_age", "total_claim_amount_sum", "num_conditions",
            "comorbidity_score", "ANOMALY_SCORE"
        ).limit(3)
        anomalous_patients.show()

        # Clinical interpretation of anomalies
        print(f"\n💡 Clinical Interpretation of Anomalies:")
        print(f"   • Unusual patterns may indicate:")
        print(f"     - Billing fraud or coding errors")
        print(f"     - Rare disease conditions")
        print(f"     - Exceptional treatment responses")
        print(f"     - Data quality issues requiring investigation")
        print(f"   • Recommend: Manual review by clinical experts")

    else:
        print(f"   ℹ️ No significant anomalies detected in current dataset")

except Exception as e:
    print(f"❌ Error in anomaly detection: {e}")
    print("📝 Continuing with simplified anomaly detection...")

    # Simple fallback: flag patients with extreme raw values and attach a
    # constant placeholder score so downstream cells find both columns.
    anomaly_results = scaled_data.with_column(
        "IS_ANOMALY",
        (col("total_claim_amount_sum") > 50000) | (col("num_conditions") > 10)
    ).with_column(
        "ANOMALY_SCORE",
        lit(-0.5)  # Dummy score for fallback
    )


In [None]:
print("\n🔍 Feature Discovery with Principal Component Analysis (PCA)...")
print("=" * 60)

# Use PCA to discover hidden patterns and reduce dimensionality
print("📊 Training PCA for dimensionality reduction and feature discovery...")

# Initialize PCA with 4 components (matching our cluster count)
pca = PCA(
    input_cols=scaled_features,
    output_cols=["PC1", "PC2", "PC3", "PC4"],
    n_components=4
)

try:
    # Train PCA
    fitted_pca = pca.fit(scaled_data)
    print(f"✅ PCA trained successfully")

    # Transform data to principal components
    pca_data = fitted_pca.transform(scaled_data)

    print(f"🎉 Dimensionality reduction completed!")
    print(f"   📉 Reduced {len(ml_features)} features to 4 principal components")

    # Mean and standard deviation of each component across all patients
    pc_stats = pca_data.select(
        avg("PC1").alias("avg_pc1"),
        stddev("PC1").alias("std_pc1"),
        avg("PC2").alias("avg_pc2"),
        stddev("PC2").alias("std_pc2"),
        avg("PC3").alias("avg_pc3"),
        stddev("PC3").alias("std_pc3"),
        avg("PC4").alias("avg_pc4"),
        stddev("PC4").alias("std_pc4")
    ).collect()[0]

    print(f"\n📊 Principal Component Statistics:")
    for i in range(1, 5):
        avg_key = f"AVG_PC{i}"
        std_key = f"STD_PC{i}"
        print(f"   PC{i}: μ={pc_stats[avg_key]:.3f}, σ={pc_stats[std_key]:.3f}")

    # These interpretations are fixed text; they are not derived from the
    # fitted component loadings.
    print(f"\n💡 Healthcare Interpretation of Principal Components:")
    print(f"   🔬 PC1: Likely represents 'Disease Burden' (conditions + comorbidity)")
    print(f"   💰 PC2: Likely represents 'Healthcare Utilization' (claims + costs)")
    print(f"   👤 PC3: Likely represents 'Demographics' (age + complexity)")
    print(f"   ⚡ PC4: Likely represents 'Acute Care Needs' (recent utilization)")

    # Show sample transformed data
    print(f"\n📄 Sample PCA-Transformed Data:")
    pca_sample = pca_data.select("patient_id", "PC1", "PC2", "PC3", "PC4").limit(5)
    pca_sample.show()

    # Create comprehensive dataset with all unsupervised insights
    print(f"\n🔗 Combining All Unsupervised ML Results...")

    # Combine clustering, anomaly detection, and PCA results per patient
    comprehensive_results = clustered_data.join(
        anomaly_results.select("patient_id", "IS_ANOMALY", "ANOMALY_SCORE"),
        "patient_id",
        "inner"
    ).join(
        pca_data.select("patient_id", "PC1", "PC2", "PC3", "PC4"),
        "patient_id",
        "inner"
    )

    print(f"✅ Comprehensive unsupervised ML dataset created")

    # Show sample of comprehensive results
    print(f"\n📄 Sample Comprehensive Results:")
    comprehensive_sample = comprehensive_results.select(
        "patient_id", "CLUSTER_ID", "IS_ANOMALY", "PC1", "PC2",
        "comorbidity_score", "healthcare_utilization_score"
    ).limit(5)
    comprehensive_sample.show()

except Exception as e:
    print(f"❌ Error in PCA analysis: {e}")
    print("📝 Continuing without PCA transformation...")

    # Simple fallback: expose the first four scaled features as "components"
    pca_data = scaled_data.select(
        "patient_id",
        col(scaled_features[0]).alias("PC1"),
        col(scaled_features[1]).alias("PC2"),
        col(scaled_features[2]).alias("PC3"),
        col(scaled_features[3]).alias("PC4")
    )

    # Join the stand-in components as well: the save step selects PC1..PC4
    # from comprehensive_results, so omitting them here made the fallback path
    # fail later when writing the insights table.
    comprehensive_results = clustered_data.join(
        anomaly_results.select("patient_id", "IS_ANOMALY", "ANOMALY_SCORE"),
        "patient_id",
        "inner"
    ).join(
        pca_data,
        "patient_id",
        "inner"
    )


In [None]:
print("\n💾 Saving Unsupervised ML Results to Feature Store...")
print("=" * 60)

# Save comprehensive unsupervised ML results
session.use_schema("FEATURE_STORE")

# Risk labels indexed by cluster id; any id outside 0-2 maps to the last label.
# NOTE(review): K-Means assigns cluster ids arbitrarily, so id 0 is not
# guaranteed to be the lowest-risk segment. This id -> label mapping assumes it
# is — confirm against the cluster profile printed by the segmentation cell.
RISK_LEVELS = ["LOW", "MEDIUM", "HIGH", "CRITICAL"]

# Simplified cost model per cluster id: (average cost per patient, savings rate).
PREVENTIVE_CARE_SAVINGS = 0.15  # 15% savings for low-risk patients
CHRONIC_CARE_SAVINGS = 0.25     # 25% savings for medium-risk patients
COMPLEX_CARE_SAVINGS = 0.35     # 35% savings for high/critical-risk patients
COST_MODEL = {
    0: (2000, PREVENTIVE_CARE_SAVINGS),
    1: (8000, CHRONIC_CARE_SAVINGS),
    2: (20000, COMPLEX_CARE_SAVINGS),
}
CRITICAL_COST_MODEL = (50000, COMPLEX_CARE_SAVINGS)

# Care strategy line printed for each risk label.
CARE_STRATEGIES = {
    "LOW": "Preventive care, wellness programs",
    "MEDIUM": "Regular monitoring, chronic disease management",
    "HIGH": "Care coordination, specialist referrals",
    "CRITICAL": "Intensive management, frequent monitoring",
}


def _risk_level_for(cluster_id):
    """Return the risk label for a cluster id.

    Ids 0, 1 and 2 map to LOW, MEDIUM and HIGH. Every other value (3 and above,
    negative ids, or None) maps to CRITICAL, matching the ``otherwise`` branch
    used when the label is computed in SQL for the saved table.

    Implemented without ``min``: in this notebook that name is bound to the
    Snowpark column aggregate, so ``min(cluster_id, 3)`` raised a TypeError.
    """
    if cluster_id is not None and 0 <= cluster_id < len(RISK_LEVELS) - 1:
        return RISK_LEVELS[int(cluster_id)]
    return RISK_LEVELS[-1]


try:
    # Create table for unsupervised ML insights
    session.sql("""
        CREATE TABLE IF NOT EXISTS UNSUPERVISED_ML_INSIGHTS (
            patient_id VARCHAR(50),
            cluster_id INTEGER,
            cluster_risk_level VARCHAR(20),
            is_anomaly BOOLEAN,
            anomaly_score FLOAT,
            pc1 FLOAT,
            pc2 FLOAT,
            pc3 FLOAT,
            pc4 FLOAT,
            patient_age INTEGER,
            comorbidity_score FLOAT,
            healthcare_utilization_score FLOAT,
            analysis_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP(),
            PRIMARY KEY (patient_id, analysis_date)
        )
    """).collect()

    # Prepare data for saving with risk level interpretation.
    # `when` comes from snowflake.snowpark.functions (imported at the top of
    # the notebook).
    save_data = comprehensive_results.with_column(
        "cluster_risk_level",
        when(col("CLUSTER_ID") == 0, "LOW")
        .when(col("CLUSTER_ID") == 1, "MEDIUM")
        .when(col("CLUSTER_ID") == 2, "HIGH")
        .otherwise("CRITICAL")
    ).with_column(
        "analysis_date",
        lit(datetime.datetime.now())
    ).select(
        "patient_id",
        "CLUSTER_ID",
        "cluster_risk_level",
        "IS_ANOMALY",
        "ANOMALY_SCORE",
        "PC1", "PC2", "PC3", "PC4",
        "patient_age",
        "comorbidity_score",
        "healthcare_utilization_score",
        "analysis_date"
    )

    # NOTE(review): mode("overwrite") presumably replaces the table, in which
    # case the PRIMARY KEY / DEFAULT declared above are not preserved — confirm
    # whether a schema-preserving write mode is wanted here.
    save_data.write.mode("overwrite").save_as_table("UNSUPERVISED_ML_INSIGHTS")

    saved_count = session.table("UNSUPERVISED_ML_INSIGHTS").count()
    print(f"✅ Saved {saved_count} patient insights to UNSUPERVISED_ML_INSIGHTS")

except Exception as e:
    # Best-effort save: report the failure and continue to the insight summary.
    print(f"❌ Error saving insights: {e}")

print(f"\n📈 Clinical and Business Insights from Unsupervised ML...")
print(f"=" * 60)

# Generate actionable insights
try:
    # Cross-analysis: Clusters vs Anomalies (ordered so the report is stable)
    cluster_anomaly_analysis = comprehensive_results.group_by("CLUSTER_ID").agg(
        count("patient_id").alias("total_patients"),
        avg(col("IS_ANOMALY").cast("int")).alias("anomaly_rate"),
        avg("comorbidity_score").alias("avg_comorbidity")
    ).order_by("CLUSTER_ID").collect()

    print(f"🎯 Patient Stratification Insights:")
    for cluster in cluster_anomaly_analysis:
        cluster_id = cluster["CLUSTER_ID"]
        total = cluster["TOTAL_PATIENTS"]
        anomaly_rate = cluster["ANOMALY_RATE"] * 100
        comorbidity = cluster["AVG_COMORBIDITY"]

        risk_level = _risk_level_for(cluster_id)

        print(f"\n   📊 {risk_level} RISK CLUSTER {cluster_id}:")
        print(f"      👥 Patients: {total}")
        print(f"      🚨 Anomaly Rate: {anomaly_rate:.1f}%")
        print(f"      📈 Comorbidity: {comorbidity:.2f}")

        # Clinical recommendations
        print(f"      💡 Strategy: {CARE_STRATEGIES[risk_level]}")

    print(f"\n💰 Healthcare Economics Impact:")

    # Calculate potential cost savings from targeted interventions
    total_patients = sum(cluster["TOTAL_PATIENTS"] for cluster in cluster_anomaly_analysis)

    print(f"   📊 Population Analysis ({total_patients} patients):")

    estimated_total_cost = 0
    estimated_savings = 0

    for cluster in cluster_anomaly_analysis:
        cluster_id = cluster["CLUSTER_ID"]
        total = cluster["TOTAL_PATIENTS"]
        percentage = (total / total_patients) * 100

        # Estimate average cost per cluster (simplified model); ids without an
        # explicit entry are costed as critical risk.
        avg_cost, savings_rate = COST_MODEL.get(cluster_id, CRITICAL_COST_MODEL)

        cluster_cost = total * avg_cost
        cluster_savings = cluster_cost * savings_rate

        estimated_total_cost += cluster_cost
        estimated_savings += cluster_savings

        risk_level = _risk_level_for(cluster_id)
        print(f"      {risk_level}: {total} patients ({percentage:.1f}%) - ${cluster_savings:,.0f} potential savings")

    # Avoid a ZeroDivisionError when no clusters (and hence no cost) were found.
    if estimated_total_cost:
        roi_percentage = (estimated_savings / estimated_total_cost) * 100
    else:
        roi_percentage = 0.0

    print(f"\n   💡 Economic Impact Summary:")
    print(f"      📊 Total Healthcare Cost: ${estimated_total_cost:,.0f}")
    print(f"      💰 Potential Annual Savings: ${estimated_savings:,.0f}")
    print(f"      📈 ROI from ML Segmentation: {roi_percentage:.1f}%")

    print(f"\n🏥 Operational Recommendations:")
    print(f"   • 🎯 Implement personalized care pathways by risk segment")
    print(f"   • 🚨 Set up automated alerts for anomalous patients")
    print(f"   • 📊 Use clusters for resource planning and staffing")
    print(f"   • 🔍 Investigate anomalies for fraud prevention")
    print(f"   • 📈 Track cluster migration to measure intervention success")

except Exception as e:
    print(f"❌ Error generating insights: {e}")
    print(f"💡 Basic segmentation completed - manual analysis recommended")

print(f"\n✅ Unsupervised ML Analysis Complete!")
print(f"🎯 Key outputs: Patient clusters, anomaly detection, feature discovery")
print(f"📊 Next step: Use insights for training pipeline in notebook 11")
