# Comparison of All Three Forecasting Methods
ThisIsClay Co - HVAC Demand Forecasting

This script provides a comprehensive comparison of all three forecasting approaches:
1. Snowflake Cortex ML (SQL-based, managed)
2. XGBoost Time Series (Custom features, flexible)
3. Snowpark ML (End-to-end ML workflow)

Analysis includes:
- Total forecast comparisons
- Regional and product breakdowns
- Method strengths and weaknesses
- Recommendations for different use cases

In [None]:
import snowflake.snowpark as snowpark
from snowflake.snowpark import Session
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt  # Not available in Snowflake by default
# import seaborn as sns  # Not available in Snowflake by default
from datetime import datetime

# Set visualization style
# sns.set_style('whitegrid')  # Not available in Snowflake by default
# plt.rcParams['figure.figsize'] = (16, 8)  # Not available in Snowflake by default

def compare_total_forecasts(session: Session):
    """
    Compare total forecasted demand across all methods.

    Runs one aggregate query over CORTEX_ML_FORECASTS, XGBOOST_FORECASTS and
    SNOWPARK_ML_FORECASTS, prints the per-method summary table, and prints the
    spread between the highest and lowest totals.

    Args:
        session: Snowpark session used to run the query; the three forecast
            tables are referenced unqualified, so they resolve against the
            session's current database and schema.

    Returns:
        The pandas DataFrame produced by the query: one row per method with
        METHOD, NUM_FORECASTS, TOTAL_FORECAST, AVG_WEEKLY_TOTAL, AVG_FORECAST
        and STDDEV_FORECAST, ordered by TOTAL_FORECAST descending with NULL
        totals last.
    """

    print("\n" + "="*80)
    print("TOTAL FORECAST COMPARISON")
    print("="*80)

    # AVG_WEEKLY_TOTAL divides by a fixed 52.0, i.e. it assumes every table
    # holds a 52-week horizon.
    # NULLS LAST: an empty forecast table aggregates to a NULL total, and
    # Snowflake sorts NULLs first in DESC order unless told otherwise.
    comparison_query = """
    WITH method_totals AS (
        SELECT
            'Cortex ML' AS METHOD,
            COUNT(*) AS NUM_FORECASTS,
            ROUND(SUM(FORECAST_DEMAND), 0) AS TOTAL_FORECAST,
            ROUND(AVG(FORECAST_DEMAND), 2) AS AVG_FORECAST,
            ROUND(STDDEV(FORECAST_DEMAND), 2) AS STDDEV_FORECAST
        FROM CORTEX_ML_FORECASTS

        UNION ALL

        SELECT
            'XGBoost' AS METHOD,
            COUNT(*) AS NUM_FORECASTS,
            ROUND(SUM(FORECAST_DEMAND), 0) AS TOTAL_FORECAST,
            ROUND(AVG(FORECAST_DEMAND), 2) AS AVG_FORECAST,
            ROUND(STDDEV(FORECAST_DEMAND), 2) AS STDDEV_FORECAST
        FROM XGBOOST_FORECASTS

        UNION ALL

        SELECT
            'Snowpark ML' AS METHOD,
            COUNT(*) AS NUM_FORECASTS,
            ROUND(SUM(FORECAST_DEMAND), 0) AS TOTAL_FORECAST,
            ROUND(AVG(FORECAST_DEMAND), 2) AS AVG_FORECAST,
            ROUND(STDDEV(FORECAST_DEMAND), 2) AS STDDEV_FORECAST
        FROM SNOWPARK_ML_FORECASTS
    )
    SELECT
        METHOD,
        NUM_FORECASTS,
        TOTAL_FORECAST,
        ROUND(TOTAL_FORECAST / 52.0, 0) AS AVG_WEEKLY_TOTAL,
        AVG_FORECAST,
        STDDEV_FORECAST
    FROM method_totals
    ORDER BY TOTAL_FORECAST DESC NULLS LAST
    """

    df_comparison = session.sql(comparison_query).to_pandas()

    print("\n📊 Overall Forecast Comparison:")
    print(df_comparison.to_string(index=False))

    # Rank on a numeric copy and ignore methods without a total, so a missing
    # value can never be reported as the highest or lowest forecast.
    ranked = (
        df_comparison
        .assign(_TOTAL=pd.to_numeric(df_comparison['TOTAL_FORECAST'], errors='coerce'))
        .dropna(subset=['_TOTAL'])
        .sort_values('_TOTAL', ascending=False, kind='stable')
    )

    if len(ranked) >= 2:
        top, bottom = ranked.iloc[0], ranked.iloc[-1]
        highest = float(top['_TOTAL'])
        lowest = float(bottom['_TOTAL'])
        spread = highest - lowest

        # A relative difference is only meaningful against a positive baseline;
        # dividing by a zero total would print "inf%".
        pct_text = f"{spread / lowest * 100:.1f}%" if lowest > 0 else "n/a"

        print("\n📈 Spread Analysis:")
        print(f"  Highest forecast: {top['METHOD']} - {highest:,.0f} units")
        print(f"  Lowest forecast: {bottom['METHOD']} - {lowest:,.0f} units")
        print(f"  Difference: {spread:,.0f} units ({pct_text})")

    return df_comparison


In [None]:
def compare_by_region(session: Session):
    """
    Compare forecasts by region across all methods.

    Pivots the three forecast tables into one row per REGION with a total per
    method, prints the table, and lists up to three regions with the largest
    STDDEV_ACROSS_METHODS.

    Args:
        session: Snowpark session used to run the query.

    Returns:
        The pandas DataFrame produced by the query, with the columns REGION,
        CORTEX_ML, XGBOOST, SNOWPARK_ML, AVG_ACROSS_METHODS and
        STDDEV_ACROSS_METHODS, ordered by AVG_ACROSS_METHODS descending.
    """

    print("\n" + "="*80)
    print("REGIONAL FORECAST COMPARISON")
    print("="*80)

    # NOTE(review): AVG_ACROSS_METHODS and STDDEV_ACROSS_METHODS aggregate the
    # individual forecast rows of all three tables within a region; they are
    # not the mean/spread of the three per-method totals shown next to them.
    # Confirm that this is the intended meaning of "across methods".
    regional_query = """
    WITH combined_forecasts AS (
        SELECT REGION, FORECAST_DEMAND, 'Cortex ML' AS METHOD FROM CORTEX_ML_FORECASTS
        UNION ALL
        SELECT REGION, FORECAST_DEMAND, 'XGBoost' AS METHOD FROM XGBOOST_FORECASTS
        UNION ALL
        SELECT REGION, FORECAST_DEMAND, 'Snowpark ML' AS METHOD FROM SNOWPARK_ML_FORECASTS
    )
    SELECT
        REGION,
        ROUND(SUM(CASE WHEN METHOD = 'Cortex ML' THEN FORECAST_DEMAND END), 0) AS CORTEX_ML,
        ROUND(SUM(CASE WHEN METHOD = 'XGBoost' THEN FORECAST_DEMAND END), 0) AS XGBOOST,
        ROUND(SUM(CASE WHEN METHOD = 'Snowpark ML' THEN FORECAST_DEMAND END), 0) AS SNOWPARK_ML,
        ROUND(AVG(FORECAST_DEMAND), 0) AS AVG_ACROSS_METHODS,
        ROUND(STDDEV(FORECAST_DEMAND), 0) AS STDDEV_ACROSS_METHODS
    FROM combined_forecasts
    GROUP BY REGION
    ORDER BY AVG_ACROSS_METHODS DESC
    """

    df_regional = session.sql(regional_query).to_pandas()

    print("\n📍 Forecast by Region (52-week totals):")
    print(df_regional.to_string(index=False))

    # Find regions with highest variance
    if 'STDDEV_ACROSS_METHODS' in df_regional.columns:
        # Take at most three rows: the data may hold fewer than three regions,
        # and a region with an undefined (NULL) deviation cannot be ranked.
        top_regions = (
            df_regional
            .dropna(subset=['STDDEV_ACROSS_METHODS'])
            .sort_values('STDDEV_ACROSS_METHODS', ascending=False)
            .head(3)
        )

        print("\n🔍 Regions with Highest Forecast Variance:")
        if top_regions.empty:
            print("  (no regions with a defined standard deviation)")
        ranked_pairs = zip(top_regions['REGION'], top_regions['STDDEV_ACROSS_METHODS'])
        for rank, (region, deviation) in enumerate(ranked_pairs, start=1):
            print(f"  {rank}. {region}: StdDev = {deviation:,.0f}")

    return df_regional


In [None]:
def compare_by_product(session: Session):
    """
    Print and return the per-product forecast totals of the three methods.

    The query stacks the three forecast tables, then produces one row per
    PRODUCT with a summed column for each method plus the average of all
    forecast rows for that product.

    Args:
        session: Snowpark session used to run the query.

    Returns:
        The pandas DataFrame returned by the query (PRODUCT, CORTEX_ML,
        XGBOOST, SNOWPARK_ML, AVG_ACROSS_METHODS), ordered by
        AVG_ACROSS_METHODS descending.
    """
    rule = "=" * 80

    print(f"\n{rule}")
    print("PRODUCT FORECAST COMPARISON")
    print(rule)

    sql_text = """
    WITH combined_forecasts AS (
        SELECT PRODUCT, FORECAST_DEMAND, 'Cortex ML' AS METHOD FROM CORTEX_ML_FORECASTS
        UNION ALL
        SELECT PRODUCT, FORECAST_DEMAND, 'XGBoost' AS METHOD FROM XGBOOST_FORECASTS
        UNION ALL
        SELECT PRODUCT, FORECAST_DEMAND, 'Snowpark ML' AS METHOD FROM SNOWPARK_ML_FORECASTS
    )
    SELECT 
        PRODUCT,
        ROUND(SUM(CASE WHEN METHOD = 'Cortex ML' THEN FORECAST_DEMAND END), 0) AS CORTEX_ML,
        ROUND(SUM(CASE WHEN METHOD = 'XGBoost' THEN FORECAST_DEMAND END), 0) AS XGBOOST,
        ROUND(SUM(CASE WHEN METHOD = 'Snowpark ML' THEN FORECAST_DEMAND END), 0) AS SNOWPARK_ML,
        ROUND(AVG(FORECAST_DEMAND), 0) AS AVG_ACROSS_METHODS
    FROM combined_forecasts
    GROUP BY PRODUCT
    ORDER BY AVG_ACROSS_METHODS DESC
    """

    # Materialise the result client-side so it can be printed and handed back.
    snowpark_frame = session.sql(sql_text)
    product_frame = snowpark_frame.to_pandas()

    print("\n🔧 Forecast by Product (52-week totals):")
    rendered = product_frame.to_string(index=False)
    print(rendered)

    return product_frame


In [None]:
def compare_by_customer_segment(session: Session):
    """
    Compare forecasts by customer segment across all methods.

    Prints one row per CUSTOMER_SEGMENT with a summed total per method, then
    prints each segment's share of the total forecast demand. The share is
    computed from the per-method totals (CORTEX_ML + XGBOOST + SNOWPARK_ML),
    so it stays correct when segments contain different numbers of forecast
    rows.

    Args:
        session: Snowpark session used to run the query.

    Returns:
        The pandas DataFrame produced by the query, with the columns
        CUSTOMER_SEGMENT, CORTEX_ML, XGBOOST, SNOWPARK_ML, AVG_ACROSS_METHODS
        and STDDEV_ACROSS_METHODS, ordered by AVG_ACROSS_METHODS descending.
    """

    print("\n" + "="*80)
    print("CUSTOMER SEGMENT FORECAST COMPARISON")
    print("="*80)

    segment_query = """
    WITH combined_forecasts AS (
        SELECT CUSTOMER_SEGMENT, FORECAST_DEMAND, 'Cortex ML' AS METHOD FROM CORTEX_ML_FORECASTS
        UNION ALL
        SELECT CUSTOMER_SEGMENT, FORECAST_DEMAND, 'XGBoost' AS METHOD FROM XGBOOST_FORECASTS
        UNION ALL
        SELECT CUSTOMER_SEGMENT, FORECAST_DEMAND, 'Snowpark ML' AS METHOD FROM SNOWPARK_ML_FORECASTS
    )
    SELECT
        CUSTOMER_SEGMENT,
        ROUND(SUM(CASE WHEN METHOD = 'Cortex ML' THEN FORECAST_DEMAND END), 0) AS CORTEX_ML,
        ROUND(SUM(CASE WHEN METHOD = 'XGBoost' THEN FORECAST_DEMAND END), 0) AS XGBOOST,
        ROUND(SUM(CASE WHEN METHOD = 'Snowpark ML' THEN FORECAST_DEMAND END), 0) AS SNOWPARK_ML,
        ROUND(AVG(FORECAST_DEMAND), 0) AS AVG_ACROSS_METHODS,
        ROUND(STDDEV(FORECAST_DEMAND), 0) AS STDDEV_ACROSS_METHODS
    FROM combined_forecasts
    GROUP BY CUSTOMER_SEGMENT
    ORDER BY AVG_ACROSS_METHODS DESC
    """

    df_segment = session.sql(segment_query).to_pandas()

    print("\n👥 Forecast by Customer Segment (52-week totals):")
    print(df_segment.to_string(index=False))

    # Demand per segment = its totals summed over the three methods. A method
    # that has no rows for a segment yields NULL and is counted as zero.
    method_columns = ['CORTEX_ML', 'XGBOOST', 'SNOWPARK_ML']
    method_totals = df_segment[method_columns].apply(pd.to_numeric, errors='coerce')
    segment_demand = method_totals.sum(axis=1)
    overall_demand = segment_demand.sum()

    print("\n📊 Segment Distribution:")
    # Walk the rows directly instead of looking each segment up by name again:
    # a NULL segment never compares equal to itself, so a lookup finds nothing.
    for seg, demand in zip(df_segment['CUSTOMER_SEGMENT'], segment_demand):
        if overall_demand > 0:
            pct = demand / overall_demand * 100
            print(f"  {seg}: {pct:.1f}% of total demand")
        else:
            print(f"  {seg}: n/a (no forecast demand)")

    return df_segment


In [None]:
def analyze_seasonal_patterns(session: Session):
    """
    Analyze seasonal forecast patterns across methods.

    Buckets every forecast row into a meteorological season by the month of
    WEEK_START_DATE (Dec-Feb Winter, Mar-May Spring, Jun-Aug Summer,
    Sep-Nov Fall), prints the average forecast per season and method, and
    names the seasons with the highest and lowest overall average.

    Args:
        session: Snowpark session used to run the query.

    Returns:
        The pandas DataFrame produced by the query, with the columns SEASON,
        CORTEX_ML_AVG, XGBOOST_AVG, SNOWPARK_ML_AVG and OVERALL_AVG, ordered
        Winter, Spring, Summer, Fall.
    """

    print("\n" + "="*80)
    print("SEASONAL PATTERN COMPARISON")
    print("="*80)

    # The season mapping is written once, over the stacked rows, so the three
    # methods can never drift apart in how months are bucketed.
    seasonal_query = """
    WITH combined_forecasts AS (
        SELECT WEEK_START_DATE, FORECAST_DEMAND, 'Cortex ML' AS METHOD FROM CORTEX_ML_FORECASTS
        UNION ALL
        SELECT WEEK_START_DATE, FORECAST_DEMAND, 'XGBoost' AS METHOD FROM XGBOOST_FORECASTS
        UNION ALL
        SELECT WEEK_START_DATE, FORECAST_DEMAND, 'Snowpark ML' AS METHOD FROM SNOWPARK_ML_FORECASTS
    ),
    seasonal_forecasts AS (
        SELECT
            CASE
                WHEN MONTH(WEEK_START_DATE) IN (12, 1, 2) THEN 'Winter'
                WHEN MONTH(WEEK_START_DATE) IN (3, 4, 5) THEN 'Spring'
                WHEN MONTH(WEEK_START_DATE) IN (6, 7, 8) THEN 'Summer'
                WHEN MONTH(WEEK_START_DATE) IN (9, 10, 11) THEN 'Fall'
            END AS SEASON,
            FORECAST_DEMAND,
            METHOD
        FROM combined_forecasts
    )
    SELECT
        SEASON,
        ROUND(AVG(CASE WHEN METHOD = 'Cortex ML' THEN FORECAST_DEMAND END), 0) AS CORTEX_ML_AVG,
        ROUND(AVG(CASE WHEN METHOD = 'XGBoost' THEN FORECAST_DEMAND END), 0) AS XGBOOST_AVG,
        ROUND(AVG(CASE WHEN METHOD = 'Snowpark ML' THEN FORECAST_DEMAND END), 0) AS SNOWPARK_ML_AVG,
        ROUND(AVG(FORECAST_DEMAND), 0) AS OVERALL_AVG
    FROM seasonal_forecasts
    GROUP BY SEASON
    ORDER BY CASE SEASON WHEN 'Winter' THEN 1 WHEN 'Spring' THEN 2 WHEN 'Summer' THEN 3 WHEN 'Fall' THEN 4 END
    """

    df_seasonal = session.sql(seasonal_query).to_pandas()

    print("\n🌡️  Average Weekly Demand by Season:")
    print(df_seasonal.to_string(index=False))

    # idxmax/idxmin need at least one real number, so rank only the seasons
    # that have a defined overall average.
    overall_avg = pd.to_numeric(df_seasonal['OVERALL_AVG'], errors='coerce').dropna()

    print("\n📈 Seasonal Insights:")
    if overall_avg.empty:
        print("  No seasonal forecast data available")
    else:
        peak_season = df_seasonal.loc[overall_avg.idxmax(), 'SEASON']
        low_season = df_seasonal.loc[overall_avg.idxmin(), 'SEASON']
        print(f"  Peak season: {peak_season}")
        print(f"  Low season: {low_season}")

    return df_seasonal


In [None]:
def method_recommendations(session: Session):
    """
    Print the decision guide describing when each forecasting method fits.

    The guide is static text; no query is issued.

    Args:
        session: Snowpark session; not used for any query here.

    Returns:
        The same session object that was passed in.
    """
    rule = "=" * 80

    guide = """
For ThisIsClay Co HVAC demand forecasting:

🎯 RECOMMENDED APPROACH:

    METHOD 1: SNOWFLAKE CORTEX ML
    ✅ CHOOSE WHEN:
      • You need quick results with minimal setup
      • Your team primarily uses SQL
      • Standard time series patterns (seasonality, trend)
      • Rapid prototyping and business reporting
      
    METHOD 2: XGBOOST TIME SERIES
    ✅ CHOOSE WHEN:
      • You need maximum flexibility and control
      • Complex demand patterns with multiple drivers
      • Custom feature engineering is critical
      • You want model explainability
      
    METHOD 3: SNOWPARK ML
    ✅ CHOOSE WHEN:
      • Building production ML pipelines in Snowflake
      • Need model governance, versioning, lineage
      • Want integrated ML Ops capabilities
      • Standardizing on Snowflake ML platform

💡 KEY TAKEAWAY:
   Different methods serve different purposes. Use the right tool for each use case
   rather than forcing a single solution for all forecasting needs.
    """

    print(f"\n{rule}")
    print("METHOD RECOMMENDATIONS & DECISION GUIDE")
    print(rule)

    print(guide)

    return session


In [None]:
def create_summary_comparison_table(session: Session):
    """
    Create (or replace) the ALL_METHODS_COMBINED view.

    The view stacks the rows of CORTEX_ML_FORECASTS, XGBOOST_FORECASTS and
    SNOWPARK_ML_FORECASTS with UNION ALL and tags each row with a METHOD
    label ('Cortex_ML', 'XGBoost' or 'Snowpark_ML').

    Args:
        session: Snowpark session used to execute the DDL statement.

    Returns:
        The same session object that was passed in.
    """
    rule = "=" * 80

    print(f"\n{rule}")
    print("CREATING COMPARISON VIEW")
    print(rule)

    # DDL for the stacked view: same five columns from each forecast table.
    view_ddl = """
    CREATE OR REPLACE VIEW ALL_METHODS_COMBINED AS
    SELECT 
        WEEK_START_DATE,
        REGION,
        PRODUCT,
        CUSTOMER_SEGMENT,
        FORECAST_DEMAND,
        'Cortex_ML' AS METHOD
    FROM CORTEX_ML_FORECASTS
    
    UNION ALL
    
    SELECT 
        WEEK_START_DATE,
        REGION,
        PRODUCT,
        CUSTOMER_SEGMENT,
        FORECAST_DEMAND,
        'XGBoost' AS METHOD
    FROM XGBOOST_FORECASTS
    
    UNION ALL
    
    SELECT 
        WEEK_START_DATE,
        REGION,
        PRODUCT,
        CUSTOMER_SEGMENT,
        FORECAST_DEMAND,
        'Snowpark_ML' AS METHOD
    FROM SNOWPARK_ML_FORECASTS
    """

    statement = session.sql(view_ddl)
    statement.collect()  # collect() is what actually runs the statement
    print("\n✓ Created consolidated view: ALL_METHODS_COMBINED")

    return session


In [None]:
def _create_visualization_views(session):
    """Create the two chart-ready views on top of ALL_METHODS_COMBINED."""
    viz_timeseries = """
    CREATE OR REPLACE VIEW ALL_METHODS_VIZ_TIMESERIES AS
    SELECT
        WEEK_START_DATE,
        METHOD,
        SUM(FORECAST_DEMAND) AS TOTAL_WEEKLY_FORECAST,
        AVG(FORECAST_DEMAND) AS AVG_FORECAST_PER_SERIES
    FROM ALL_METHODS_COMBINED
    GROUP BY WEEK_START_DATE, METHOD
    ORDER BY WEEK_START_DATE, METHOD
    """
    session.sql(viz_timeseries).collect()

    viz_regional = """
    CREATE OR REPLACE VIEW ALL_METHODS_VIZ_REGIONAL AS
    SELECT
        REGION,
        METHOD,
        SUM(FORECAST_DEMAND) AS TOTAL_FORECAST,
        COUNT(DISTINCT PRODUCT) AS NUM_PRODUCTS,
        COUNT(DISTINCT CUSTOMER_SEGMENT) AS NUM_SEGMENTS
    FROM ALL_METHODS_COMBINED
    GROUP BY REGION, METHOD
    ORDER BY REGION, METHOD
    """
    session.sql(viz_regional).collect()

    print("\n✅ Created visualization views!")
    print("\nYou can now create charts in Snowsight using:")
    print("  • ALL_METHODS_VIZ_TIMESERIES - Weekly forecast trend by method")
    print("  • ALL_METHODS_VIZ_REGIONAL - Regional comparison by method")


def _print_sample_forecasts(session):
    """Print up to ten rows from the first three weeks of ALL_METHODS_COMBINED."""
    # Strict '<' against MIN + 3 weeks keeps exactly three weekly start dates;
    # '<=' would also admit the fourth one.
    sample_data = session.sql("""
        SELECT
            WEEK_START_DATE,
            REGION,
            PRODUCT,
            CUSTOMER_SEGMENT,
            FORECAST_DEMAND,
            METHOD
        FROM ALL_METHODS_COMBINED
        WHERE WEEK_START_DATE < (SELECT MIN(WEEK_START_DATE) + INTERVAL '3 weeks' FROM ALL_METHODS_COMBINED)
        ORDER BY WEEK_START_DATE, REGION, PRODUCT
        LIMIT 10
    """).to_pandas()

    print("\nSample Forecasts (First 3 Weeks):")
    print(sample_data.to_string(index=False))


def _print_validation_checks(session):
    """Print row counts, value ranges and PASS/FAIL flags for ALL_METHODS_COMBINED."""
    # WEEKS_CHECK counts distinct weeks: the raw row count exceeds 52 as soon
    # as there is more than one series, whatever the horizon actually is.
    checks = session.sql("""
        SELECT
            COUNT(*) AS TOTAL_FORECASTS,
            COUNT(DISTINCT WEEK_START_DATE) AS UNIQUE_WEEKS,
            COUNT(DISTINCT REGION) AS UNIQUE_REGIONS,
            COUNT(DISTINCT PRODUCT) AS UNIQUE_PRODUCTS,
            MIN(FORECAST_DEMAND) AS MIN_FORECAST,
            MAX(FORECAST_DEMAND) AS MAX_FORECAST,
            AVG(FORECAST_DEMAND) AS AVG_FORECAST,
            CASE
                WHEN COUNT(DISTINCT WEEK_START_DATE) >= 52 THEN '✅ PASS'
                ELSE '❌ FAIL'
            END AS WEEKS_CHECK,
            CASE
                WHEN MIN(FORECAST_DEMAND) >= 0 THEN '✅ PASS'
                ELSE '❌ FAIL'
            END AS POSITIVE_CHECK
        FROM ALL_METHODS_COMBINED
    """).to_pandas()

    print("\n🔍 Data Quality Checks:")
    for col in checks.columns:
        val = checks[col].values[0]
        print(f"  {col}: {val}")


def main(session: Session):
    """
    Run the full method comparison and build the visualization views.

    Steps:
      1. Switch the session to the HVAC forecast role, warehouse, database
         and schema.
      2. Run every comparison report and create ALL_METHODS_COMBINED.
      3. Create the ALL_METHODS_VIZ_* views, print a data sample and the
         validation checks, and print Snowsight charting instructions.

    If step 2 raises, the error and a reminder of the prerequisite scripts
    are printed and the function returns without attempting step 3, because
    step 3 reads ALL_METHODS_COMBINED and the same forecast tables.

    Args:
        session: Snowpark session to run all statements on.

    Returns:
        The same session object that was passed in.
    """

    print("="*80)
    print("COMPREHENSIVE FORECASTING METHOD COMPARISON")
    print("ThisIsClay Co - HVAC Demand Forecasting")
    print("="*80)

    # Set context
    session.sql("USE ROLE HVAC_FORECAST_ROLE").collect()
    session.sql("USE WAREHOUSE HVAC_FORECAST_WH").collect()
    session.sql("USE DATABASE HVAC_FORECAST_DB").collect()
    session.sql("USE SCHEMA FORECAST_DATA").collect()

    print("\n✓ Connected to Snowflake")

    # Run all comparisons. This is the top-level boundary of the lab script,
    # so any failure is reported to the user rather than propagated.
    try:
        compare_total_forecasts(session)
        compare_by_region(session)
        compare_by_product(session)
        compare_by_customer_segment(session)
        analyze_seasonal_patterns(session)
        create_summary_comparison_table(session)
        method_recommendations(session)
    except Exception as e:
        print(f"\n⚠️  Error during comparison: {e}")
        print("\nMake sure you have run all three forecasting methods:")
        print("  1. 1_cortex_ml_forecasting.py")
        print("  2. 2_xgboost_time_series.py")
        print("  3. 3_snowpark_ml_forecasting.py")
        print("\n" + "="*80 + "\n")
        # The remaining steps depend on the objects that just failed.
        return session

    print("\n" + "="*80)
    print("✅ COMPARISON ANALYSIS COMPLETE!")
    print("="*80)

    print("\n🎉 CONGRATULATIONS!")
    print("You have successfully completed the ThisIsClay Co HVAC Forecasting Lab!")
    print("\nYou now understand:")
    print("  ✓ How to use Snowflake Cortex ML for quick SQL-based forecasting")
    print("  ✓ How to build custom XGBoost models with advanced features")
    print("  ✓ How to leverage Snowpark ML for production ML workflows")
    print("  ✓ When to use each method for different business needs")

    print("\n📚 Next Steps:")
    print("  1. Apply these methods to your own forecasting problems")
    print("  2. Experiment with different features and hyperparameters")
    print("  3. Build automated retraining pipelines")
    print("  4. Integrate forecasts into your business applications")
    print("  5. Explore Snowflake's ML Observability features")

    print("\n" + "="*80 + "\n")

    # ====================================================================================
    # VISUAL VALIDATION: CREATE VIEWS FOR CHARTING
    # ====================================================================================

    print("\n" + "="*80)
    print("📊 CREATING VISUALIZATION VIEWS")
    print("="*80)

    _create_visualization_views(session)

    # Display sample validation data
    print("\n" + "="*80)
    print("📈 VALIDATION: SAMPLE FORECAST DATA")
    print("="*80)

    _print_sample_forecasts(session)

    # Validation checks
    print("\n" + "="*80)
    print("✅ VALIDATION CHECKS - All Methods Comparison")
    print("="*80)

    _print_validation_checks(session)

    print("\n" + "="*80)
    print("🎯 TO VISUALIZE IN SNOWSIGHT:")
    print("="*80)
    # TOTAL_WEEKLY_FORECAST only exists in the time-series view, so that is
    # the object the user has to query for the chart described here.
    print("""
1. Go to Worksheets in Snowsight
2. Run: SELECT * FROM ALL_METHODS_VIZ_TIMESERIES
3. Click 'Chart' button
4. Select 'Line Chart'
5. X-axis: WEEK_START_DATE
6. Y-axis: TOTAL_WEEKLY_FORECAST
    
This will show your 52-week forecast trend! 📈
    """)

    return session

# For Snowflake Notebooks

In [None]:
if __name__ == "__main__":
    # Entry point: fetch the currently active Snowpark session and run the
    # full comparison on it. The result is kept in the module-level name
    # `session`.
    session = snowpark.context.get_active_session()
    main(session)

