# Sales Forecasting Model with Snowpark ML

This notebook trains an XGBoost regression model to predict sales amounts based on:
- Date features (month, day of week, year)
- Region
- Product category
- Units sold

The model is registered in Snowflake's Model Registry and can be called as a tool from the Snowflake Intelligence Agent.

## 1. Setup and Imports

In [None]:
# General-purpose data libraries
import numpy as np
import pandas as pd

# For evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Snowpark and ML imports
from snowflake.snowpark import Session
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import col, month, dayofweek, year
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.preprocessing import OrdinalEncoder
from snowflake.ml.registry import Registry

In [None]:
# Session and configuration.
#
# Get the active Snowpark session (available when running in Snowsight).
# `get_active_session` is imported in the imports cell at the top of the
# notebook so that every dependency is declared in one place.
session = get_active_session()

# IMPORTANT: Update these values based on your lab path
# For Manual path (si_*): DATABASE = 'SI_DB', MODEL_NAME = 'si_sales_forecast'
# For Cortex Code path (coco_*): DATABASE = 'COCO_DB', MODEL_NAME = 'coco_sales_forecast'

DATABASE = 'SI_DB'  # Change to 'COCO_DB' for Cortex Code path
SCHEMA = 'RETAIL'
MODEL_NAME = 'si_sales_forecast'  # Change to 'coco_sales_forecast' for Cortex Code path

# Set database and schema context - REQUIRED for temp table operations.
# Later cells also interpolate DATABASE/SCHEMA into fully-qualified names,
# so the session context and those names stay in agreement.
session.use_database(DATABASE)
session.use_schema(SCHEMA)

# Echo the effective configuration so the reader can verify it before any
# data is loaded or a model is registered.
print(f"Using database: {DATABASE}")
print(f"Using schema: {SCHEMA}")
print(f"Model will be registered as: {MODEL_NAME}")

## 2. Load and Prepare Data

In [None]:
# Source tables: sales rows and the product lookup that carries CATEGORY.
sales_df = session.table(f"{DATABASE}.{SCHEMA}.SALES")
products_df = session.table(f"{DATABASE}.{SCHEMA}.PRODUCTS")

# Left join on PRODUCT_ID so every sales row is kept, then project only the
# columns the rest of the notebook works with.
product_match = sales_df["PRODUCT_ID"] == products_df["PRODUCT_ID"]
sales_with_products = sales_df.join(products_df, on=product_match, how="left")

df = sales_with_products.select(
    sales_df["DATE"],
    sales_df["REGION"],
    products_df["CATEGORY"],
    sales_df["UNITS_SOLD"],
    sales_df["SALES_AMOUNT"],
)

# Quick sanity check: row count and a small preview.
print(f"Total records: {df.count()}")
df.show(5)

In [None]:
# Derive calendar features (month, day of week, year) from the DATE column.
df_features = (
    df.with_column("MONTH", month(col("DATE")))
    .with_column("DAY_OF_WEEK", dayofweek(col("DATE")))
    .with_column("YEAR", year(col("DATE")))
)

# Keep only the modeling columns and discard rows that contain nulls in them.
df_model = df_features.select(
    "REGION", "CATEGORY", "MONTH", "DAY_OF_WEEK", "YEAR", "UNITS_SOLD", "SALES_AMOUNT"
).dropna()

print(f"Records after feature engineering: {df_model.count()}")
df_model.show(5)

## 3. Encode Categorical Features

In [None]:
# Ordinal-encode the two categorical columns; each gets a companion
# "<NAME>_ENCODED" output column.
categorical_cols = ["REGION", "CATEGORY"]
output_cols = [f"{name}_ENCODED" for name in categorical_cols]

encoder = OrdinalEncoder(input_cols=categorical_cols, output_cols=output_cols)

# Fit on the modeling frame, then apply the fitted encoder to that same frame.
encoder.fit(df_model)
df_encoded = encoder.transform(df_model)

df_encoded.show(5)

## 4. Train/Test Split

In [None]:
# Model inputs (encoded categoricals, calendar features, units sold) and the
# regression target.
target_col = "SALES_AMOUNT"
feature_cols = [
    "REGION_ENCODED",
    "CATEGORY_ENCODED",
    "MONTH",
    "DAY_OF_WEEK",
    "YEAR",
    "UNITS_SOLD",
]

# 80/20 random split with a fixed seed.
train_df, test_df = df_encoded.random_split([0.8, 0.2], seed=42)

print(f"Training records: {train_df.count()}")
print(f"Test records: {test_df.count()}")

## 5. Train XGBoost Model

In [None]:
# Hyperparameters for the XGBoost regressor, kept together for easy tuning.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "learning_rate": 0.1,
    "random_state": 42,
}

# The estimator reads `feature_cols`, learns `target_col`, and writes its
# predictions to a PREDICTED_SALES column.
model = XGBRegressor(
    input_cols=feature_cols,
    label_cols=[target_col],
    output_cols=["PREDICTED_SALES"],
    **xgb_params,
)

print("Training XGBoost model...")
model.fit(train_df)
print("Training complete!")

## 6. Evaluate Model

In [None]:
# Score the held-out rows; the result carries a PREDICTED_SALES column.
predictions_df = model.predict(test_df)

# Pull only actual vs. predicted values into pandas for metric computation.
results_pd = predictions_df.select(target_col, "PREDICTED_SALES").to_pandas()
y_true = results_pd[target_col]
y_pred = results_pd["PREDICTED_SALES"]

# Regression metrics: RMSE (square root of MSE), MAE and R2.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print(
    "\n=== Model Performance ===",
    f"RMSE: ${rmse:,.2f}",
    f"MAE:  ${mae:,.2f}",
    f"R2:   {r2:.3f}",
    sep="\n",
)

In [None]:
# Preview a few scored rows: descriptive columns, the actual target, and the
# model's prediction side by side.
preview_columns = ["REGION", "CATEGORY", "MONTH", target_col, "PREDICTED_SALES"]

print("\nSample Predictions:")
predictions_df.select(preview_columns).show(10)

## 7. Register Model in Snowflake Model Registry

In [None]:
# Open the Model Registry in the configured database and schema.
registry = Registry(session=session, database_name=DATABASE, schema_name=SCHEMA)

# Record the evaluation metrics in the version comment so they are visible
# alongside the registered model.
model_comment = f"XGBoost sales forecasting model. RMSE: ${rmse:,.2f}, R2: {r2:.3f}"

# NOTE(review): the version name is fixed to "v1"; re-running this cell
# presumably fails if that version already exists - confirm, and drop the old
# version or choose a new name before re-running.
model_version = registry.log_model(
    model=model,
    model_name=MODEL_NAME,
    version_name="v1",
    comment=model_comment,
)

print(f"Model registered: {MODEL_NAME} v1")
print(f"Full path: {DATABASE}.{SCHEMA}.{MODEL_NAME}")

In [None]:
# Print the name of every model the registry reports, one per line.
print("\nModels in registry:")
for registered_model in registry.models():
    print(f"  - {registered_model.name}")

## 8. Test the Trained Model on a Sample Row

In [None]:
# Smoke-test the trained (in-memory) model on one hand-built feature row.
# The row is produced with a SQL SELECT so it arrives as a Snowpark DataFrame
# with exactly the column names listed in `feature_cols`.
# Note: After registration, you can call the model via SQL like:
# SELECT model_name!PREDICT(...) FROM ...

test_query = """
WITH sample_data AS (
    SELECT 
        1 AS REGION_ENCODED,  -- e.g., 'West'
        2 AS CATEGORY_ENCODED, -- e.g., 'Electronics'
        7 AS MONTH,
        3 AS DAY_OF_WEEK,
        2025 AS YEAR,
        50 AS UNITS_SOLD
)
SELECT * FROM sample_data
"""

# Build the one-row frame, score it, and display the prediction.
sample_df = session.sql(test_query)
prediction = model.predict(sample_df)
prediction.show()

## 9. Call Model via SQL

The registered model can be called directly using SQL. This is the recommended approach for using ML models in Snowflake.

In [None]:
# Invoke the registered model from SQL.
# The model exposes a PREDICT method that can be called using: MODEL_NAME!PREDICT(...)
# The call's result is indexed with :PREDICTED_SALES and cast to FLOAT.

inference_sql = f"""
WITH sample_input AS (
    SELECT 
        1.0::FLOAT AS REGION_ENCODED,
        2.0::FLOAT AS CATEGORY_ENCODED,
        8.0::FLOAT AS MONTH,
        3.0::FLOAT AS DAY_OF_WEEK,
        2025.0::FLOAT AS YEAR,
        100.0::FLOAT AS UNITS_SOLD
)
SELECT {DATABASE}.{SCHEMA}.{MODEL_NAME}!PREDICT(
    REGION_ENCODED, CATEGORY_ENCODED, MONTH, DAY_OF_WEEK, YEAR, UNITS_SOLD
):PREDICTED_SALES::FLOAT AS predicted_sales
FROM sample_input
"""

# The query yields one row; read its PREDICTED_SALES column.
result = session.sql(inference_sql).collect()
predicted_value = result[0]['PREDICTED_SALES']
print(f"Predicted sales (via SQL): ${predicted_value:,.2f}")

In [None]:
# Show how to use the registered model on actual data via SQL.
#
# The model was trained on REGION_ENCODED / CATEGORY_ENCODED values produced
# by the fitted `encoder`. A hand-written CASE mapping is not derived from
# that encoder, so nothing guarantees its codes match what the model saw
# during training; a mismatch would silently produce wrong predictions.
# To keep inference consistent with training, this cell takes a few rows of
# `df_encoded` (the encoder's own output), materializes them into a temporary
# table, and scores that table with the registered model.
BATCH_SAMPLE_TABLE = "SALES_FORECAST_BATCH_SAMPLE"
BATCH_SAMPLE_ROWS = 10

# Temporary table: lives only for this session; "overwrite" keeps the cell
# safe to re-run. It is created in the session's current database/schema,
# which the configuration cell set to DATABASE/SCHEMA.
df_encoded.limit(BATCH_SAMPLE_ROWS).write.mode("overwrite").save_as_table(
    BATCH_SAMPLE_TABLE, table_type="temporary"
)

batch_inference_sql = f"""
SELECT 
    REGION,
    CATEGORY,
    MONTH,
    UNITS_SOLD,
    SALES_AMOUNT AS ACTUAL_SALES,
    {DATABASE}.{SCHEMA}.{MODEL_NAME}!PREDICT(
        REGION_ENCODED, CATEGORY_ENCODED, MONTH, DAY_OF_WEEK, YEAR, UNITS_SOLD
    ):PREDICTED_SALES::FLOAT AS PREDICTED_SALES
FROM {DATABASE}.{SCHEMA}.{BATCH_SAMPLE_TABLE}
"""

print("Batch predictions on sample data:")
session.sql(batch_inference_sql).show()

## Done!

The model is now registered in the Snowflake Model Registry and can be called via SQL.

**How to use the model:**
```sql
SELECT SI_DB.RETAIL.SI_SALES_FORECAST!PREDICT(
    region_encoded, category_encoded, month, day_of_week, year, units_sold
):PREDICTED_SALES::FLOAT AS prediction
FROM your_table
```

**Next Steps:**
1. Go to Snowflake Intelligence
2. Edit your agent
3. Add a new tool of type "Cortex Analyst" pointing to the semantic model
4. The agent can now query sales data and the model can be used for predictions