# Varo ML Models with Snowpark and Feature Store

**Note**: This notebook is designed to run in Snowflake Notebooks with automatic session management.

This notebook demonstrates how to:
1. Connect to Varo's Feature Store
2. Create training datasets with point-in-time features
3. Train ML models using Snowpark ML
4. Deploy models for real-time serving
5. Monitor model performance

## Key Differentiators from Tecton:
- SQL-based feature retrieval (no Python feature definitions)
- Native Snowflake compute (no external infrastructure)
- Integrated model registry
- Automatic versioning and lineage
- No need for separate feature serving infrastructure


In [None]:
# Setup and Imports
# In Snowflake Notebooks, the session is automatically available as 'session'
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from snowflake.ml.modeling.ensemble import RandomForestClassifier, GradientBoostingRegressor
from snowflake.ml.modeling.metrics import accuracy_score, precision_recall_curve, roc_auc_score
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from snowflake.ml.registry import Registry
from snowflake.snowpark import functions as F
from snowflake.snowpark import types as T

# Report the context the notebook session started in
for context_label, read_context in (
    ("Database", session.get_current_database),
    ("Schema", session.get_current_schema),
    ("Warehouse", session.get_current_warehouse),
):
    print(f"Current {context_label}: {read_context()}")

# Point the session at the Feature Store database, schema and warehouse
session.use_database("VARO_INTELLIGENCE")
session.use_schema("FEATURE_STORE")
session.use_warehouse("VARO_FEATURE_WH")

# Confirm the switch by reading the context back from the session
active_database = session.get_current_database()
active_schema = session.get_current_schema()
print(f"\nSwitched to: {active_database}.{active_schema}")


## 1. Create Training Dataset from Feature Store

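Create a point-in-time-correct dataset for fraud detection model training: each transaction is joined only to feature values whose timestamp is at or before the transaction timestamp, so no future information leaks into training.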


In [None]:
# Define fraud labels from historical data
labels_query = """
WITH fraud_labels AS (
    -- DISTINCT collapses the duplicate rows produced when one transaction has
    -- several FRAUD_CONFIRMED compliance events; every selected column
    -- (including is_fraud) is identical across those duplicates.
    SELECT DISTINCT
        t.transaction_id,
        t.customer_id,
        t.transaction_timestamp,
        t.amount,
        t.merchant_category,
        t.is_international,
        -- Create fraud label based on business rules or known fraud cases
        CASE 
            WHEN t.status = 'DECLINED' AND t.fraud_score > 0.7 THEN 1
            WHEN ce.event_type = 'FRAUD_CONFIRMED' THEN 1
            ELSE 0
        END AS is_fraud
    FROM RAW.TRANSACTIONS t
    LEFT JOIN RAW.COMPLIANCE_EVENTS ce 
        ON t.transaction_id = ce.transaction_id 
        AND ce.event_type = 'FRAUD_CONFIRMED'
    WHERE t.transaction_date BETWEEN '2024-01-01' AND '2024-06-30'
        AND t.amount > 10  -- Focus on non-trivial transactions
)
SELECT * FROM fraud_labels
SAMPLE (10000 ROWS)  -- Sample for notebook demo
"""

# Get labels.
# SAMPLE is non-deterministic and Snowpark DataFrames are lazy, so without
# caching every action on labels_df would draw a different set of rows.
# cache_result() materialises the sample once into a session-scoped temp table.
labels_df = session.sql(labels_query).cache_result()

# show() prints the table itself and returns None, so it must be called on its
# own line rather than interpolated into an f-string.
print("Label distribution:")
labels_df.group_by('is_fraud').count().show()


In [None]:
# Call the Feature Store to get point-in-time features
# This replaces Tecton's get_historical_features() method

# Session-scoped snapshot of the sampled labels that the feature query reads.
LABELS_SNAPSHOT_TABLE = "FRAUD_LABELS_SNAPSHOT"

# (CTE name, FEATURE_VALUES.feature_id) for each feature group joined below.
FEATURE_GROUPS = [
    ("customer_features", "customer_transaction_features"),
    ("fraud_features", "fraud_detection_features"),
    ("account_features", "account_features"),
]

# The fraud_labels CTE in labels_query only exists inside that statement, so
# the labels must be persisted before another statement can join to them.
# Writing them to a temporary table also freezes the random sample, which is
# referenced several times in the query below.
labels_df.write.save_as_table(LABELS_SNAPSHOT_TABLE, mode="overwrite", table_type="temporary")


def _latest_feature_cte(cte_name, feature_id):
    """Build a CTE that picks the latest feature row per labelled transaction.

    Args:
        cte_name: Name to give the generated CTE.
        feature_id: FEATURE_VALUES.feature_id of the feature group to read.

    Returns:
        SQL text of one CTE yielding (transaction_id, feature_value), keeping
        for every transaction only the CUSTOMER feature row with the greatest
        feature_timestamp that is <= the transaction timestamp.
    """
    return f"""{cte_name} AS (
    SELECT
        l.transaction_id,
        fv.feature_value
    FROM {LABELS_SNAPSHOT_TABLE} l
    JOIN FEATURE_VALUES fv
        ON l.customer_id = fv.entity_id
        AND fv.entity_type = 'CUSTOMER'
        AND fv.feature_id = '{feature_id}'
        AND fv.feature_timestamp <= l.transaction_timestamp
    -- QUALIFY filters on the window function after the join; it is a clause of
    -- the SELECT and cannot be placed between JOINs.
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY l.transaction_id
        ORDER BY fv.feature_timestamp DESC
    ) = 1
)"""


point_in_time_ctes = ",\n".join(
    _latest_feature_cte(cte_name, feature_id) for cte_name, feature_id in FEATURE_GROUPS
)

# Continuous features are cast to FLOAT: a bare ::NUMBER means NUMBER(38,0) and
# would round values such as risk scores and balances to whole numbers.
feature_query = f"""
WITH {point_in_time_ctes}
SELECT
    l.*,
    -- Get customer features as of transaction time
    cf.feature_value:txn_count_30d::NUMBER as customer_txn_count_30d,
    cf.feature_value:txn_volume_30d::FLOAT as customer_txn_volume_30d,
    cf.feature_value:unique_merchants_30d::NUMBER as customer_unique_merchants_30d,
    cf.feature_value:velocity_1h::FLOAT as customer_velocity_1h,
    cf.feature_value:risk_score::FLOAT as customer_historical_risk,

    -- Get fraud features as of transaction time
    ff.feature_value:unusual_amount::NUMBER as has_unusual_amount,
    ff.feature_value:impossible_travel::NUMBER as impossible_travel_flag,
    ff.feature_value:risky_merchants::NUMBER as risky_merchant_count,

    -- Get account features
    af.feature_value:avg_balance::FLOAT as account_avg_balance,
    af.feature_value:days_since_opened::NUMBER as account_age_days
FROM {LABELS_SNAPSHOT_TABLE} l
-- LEFT JOINs keep transactions that have no feature row yet (features are NULL)
LEFT JOIN customer_features cf ON l.transaction_id = cf.transaction_id
LEFT JOIN fraud_features ff ON l.transaction_id = ff.transaction_id
LEFT JOIN account_features af ON l.transaction_id = af.transaction_id
"""

# Create training dataset with features
training_df = session.sql(feature_query)
print(f"Training dataset shape: {training_df.count()} rows, {len(training_df.columns)} columns")


## 2. Train Fraud Detection Model

Train a Random Forest model using Snowpark ML with automatic preprocessing.


In [None]:
# Model inputs and target.
# Identifier columns (transaction_id, customer_id, timestamp) are deliberately
# left out of both lists.
feature_columns = [
    # transaction-level
    'amount',
    # customer activity
    'customer_txn_count_30d',
    'customer_txn_volume_30d',
    'customer_unique_merchants_30d',
    'customer_velocity_1h',
    'customer_historical_risk',
    # fraud signals
    'has_unusual_amount',
    'impossible_travel_flag',
    'risky_merchant_count',
    # account-level
    'account_avg_balance',
    'account_age_days',
]

categorical_columns = ['merchant_category', 'is_international']
label_column = 'is_fraud'

# 80/20 train/test split with a fixed seed so the split is repeatable
train_df, test_df = training_df.random_split([0.8, 0.2], seed=42)
for split_name, split_df in (("Training", train_df), ("Test", test_df)):
    print(f"{split_name} set: {split_df.count()} rows")


In [None]:
# Train Random Forest model with Snowpark ML

# Unquoted aliases in the feature query come back from Snowflake as upper-case
# identifiers, so resolve the configured names to the real column names once.
numeric_inputs = [name.upper() for name in feature_columns]
categorical_inputs = [name.upper() for name in categorical_columns]
encoded_inputs = [f"{name}_CODE" for name in categorical_inputs]
target_col = label_column.upper()


def prepare_model_frame(df):
    """Select and normalise the model columns so the estimator can consume them.

    Args:
        df: Snowpark DataFrame holding the feature, categorical and label columns.

    Returns:
        A Snowpark DataFrame with only the model columns, where
        - numeric features are cast to DOUBLE and NULLs become 0.0
          (assumes "no feature row before the transaction" can be treated as
          zero activity -- TODO confirm with the feature owners),
        - categorical features are cast to VARCHAR and NULLs become 'UNKNOWN',
        - the label column is passed through unchanged.
    """
    numeric_exprs = [
        F.coalesce(F.col(name).cast(T.DoubleType()), F.lit(0.0)).alias(name)
        for name in numeric_inputs
    ]
    categorical_exprs = [
        F.coalesce(F.col(name).cast(T.StringType()), F.lit("UNKNOWN")).alias(name)
        for name in categorical_inputs
    ]
    return df.select(*numeric_exprs, *categorical_exprs, F.col(target_col))


# The underlying scikit-learn Random Forest only accepts numeric inputs, so the
# categorical columns are ordinal-encoded first. Ordinal (rather than one-hot)
# encoding keeps the classifier's input column list static. Categories not seen
# during fit are encoded as -1 instead of raising at prediction time.
# The variable keeps the name rf_model because later cells register it.
rf_model = Pipeline(
    steps=[
        (
            "encode_categoricals",
            OrdinalEncoder(
                input_cols=categorical_inputs,
                output_cols=encoded_inputs,
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
        (
            "random_forest",
            RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                input_cols=numeric_inputs + encoded_inputs,
                label_cols=[target_col],
            ),
        ),
    ]
)

# Train the model
print("Training Random Forest model...")
rf_model.fit(prepare_model_frame(train_df))
print("Model training completed!")

# Make predictions on the held-out split
predictions = rf_model.predict(prepare_model_frame(test_df))
print(f"Predictions shape: {predictions.count()}")

# With no explicit output_cols the classifier writes its prediction to a column
# named OUTPUT_<label>.
test_accuracy = accuracy_score(
    df=predictions,
    y_true_col_names=[target_col],
    y_pred_col_names=[f"OUTPUT_{target_col}"],
)
print(f"Test accuracy: {test_accuracy:.4f}")


## 3. Register Model in Snowflake Model Registry

Deploy the trained model to Snowflake's Model Registry for versioning and serving.


In [None]:
# Register model in Snowflake Model Registry

# Create registry connection
reg = Registry(session=session)

# Measure accuracy on the held-out predictions instead of logging a made-up
# number. Columns are looked up case-insensitively; the prediction column is
# the label name prefixed with OUTPUT_ (the Snowpark ML default when no
# output_cols are given).
columns_by_name = {name.strip('"').upper(): name for name in predictions.columns}
actual_col = columns_by_name[label_column.upper()]
predicted_col = columns_by_name[f"OUTPUT_{label_column.upper()}"]
holdout_accuracy = float(
    accuracy_score(
        df=predictions,
        y_true_col_names=[actual_col],
        y_pred_col_names=[predicted_col],
    )
)

# Register the model
# NOTE(review): version_name is fixed, so re-running this cell against a
# registry that already holds v1 is expected to fail -- confirm the desired
# versioning scheme before scheduling this notebook.
model_name = "FRAUD_DETECTION_MODEL"
model_version = reg.log_model(
    rf_model,
    model_name=model_name,
    version_name="v1",
    metrics={
        # Computed on the test split, hence the name
        "test_accuracy": holdout_accuracy,
        "feature_count": len(feature_columns) + len(categorical_columns)
    },
    comment="Random Forest fraud detection model trained on Feature Store data"
)

print(f"Model registered: {model_name} version {model_version.version_name}")

# Show model details
model_ref = reg.get_model(model_name)
# versions() is a method returning the list of ModelVersion objects
print(f"Model versions: {[v.version_name for v in model_ref.versions()]}")


## 4. Create UDF for Real-Time Scoring

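Create a SQL user-defined table function (UDTF) that will wrap the model for real-time fraud scoring. The version below wires up the online feature lookup but still returns a placeholder score instead of calling the registered model.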


In [None]:
# Create a SQL table function that is intended to call the model for
# real-time scoring.
# NOTE: the scoring step is still a placeholder constant; the function only
# establishes the interface and the online feature lookup.
# Continuous features are cast to FLOAT because a bare ::NUMBER is NUMBER(38,0)
# and would round scores, volumes and balances to whole numbers.
create_function_sql = """
CREATE OR REPLACE FUNCTION SCORE_TRANSACTION_FRAUD_ML(
    customer_id VARCHAR,
    amount NUMBER,
    merchant_category VARCHAR,
    is_international BOOLEAN
)
RETURNS TABLE (
    fraud_probability NUMBER(5,4),
    risk_level VARCHAR,
    model_version VARCHAR
)
COMMENT = 'PLACEHOLDER fraud scoring interface: returns a constant score until wired to the registered Random Forest model'
AS
$$
    -- Get real-time features from Feature Store
    WITH customer_features AS (
        SELECT 
            entity_id,
            feature_vector
        FROM FEATURE_STORE.ONLINE_FEATURES
        WHERE entity_id = customer_id 
            AND entity_type = 'CUSTOMER'
    ),
    -- Prepare input for model
    model_input AS (
        SELECT
            amount,
            merchant_category,
            is_international,
            -- Extract features from JSON
            feature_vector:customer_txn_count_30d::NUMBER as customer_txn_count_30d,
            feature_vector:customer_txn_volume_30d::FLOAT as customer_txn_volume_30d,
            feature_vector:customer_unique_merchants_30d::NUMBER as customer_unique_merchants_30d,
            feature_vector:customer_velocity_1h::FLOAT as customer_velocity_1h,
            feature_vector:customer_historical_risk::FLOAT as customer_historical_risk,
            feature_vector:has_unusual_amount::NUMBER as has_unusual_amount,
            feature_vector:impossible_travel_flag::NUMBER as impossible_travel_flag,
            feature_vector:risky_merchant_count::NUMBER as risky_merchant_count,
            feature_vector:account_avg_balance::FLOAT as account_avg_balance,
            feature_vector:account_age_days::NUMBER as account_age_days
        FROM customer_features
    )
    -- Call the model (simplified - would use model.predict in production)
    SELECT
        -- This would call the actual ML model
        0.85::NUMBER(5,4) as fraud_probability,  -- Placeholder, cast to the declared return type
        CASE
            WHEN fraud_probability >= 0.7 THEN 'HIGH'
            WHEN fraud_probability >= 0.4 THEN 'MEDIUM'
            ELSE 'LOW'
        END as risk_level,
        'v1' as model_version
    FROM model_input
$$;
"""

# Create the function
session.sql(create_function_sql).collect()
print("Created ML scoring function!")
