# Model Training & Registry with Snowflake ML
## Financial Services ML Pipeline - Native Snowflake Implementation

This notebook demonstrates model training and registration using Snowflake's Model Registry for financial services ML.

## What We'll Build
- **Classification Models**: Conversion prediction, churn prediction
- **Multi-class Classification**: Next best action recommendation
- **Model Comparison**: XGBoost, Random Forest, and Logistic Regression
- **Model Registry**: Version control and lifecycle management
- **Performance Evaluation**: Comprehensive model assessment

## Snowflake ML Features Used
- **Snowpark ML**: Native ML training within Snowflake
- **Model Registry**: Centralized model management and versioning
- **Cross-validation**: Robust model evaluation
- **Feature Engineering**: Automated preprocessing pipelines
- **Model Deployment**: Seamless deployment for inference


In [None]:
# Import required libraries for ML training
import snowflake.snowpark as snowpark
from snowflake.snowpark import Session
from snowflake.snowpark.functions import *
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.ensemble import RandomForestClassifier
from snowflake.ml.modeling.linear_model import LogisticRegression
from snowflake.ml.modeling.preprocessing import StandardScaler, LabelEncoder
from snowflake.ml.modeling.model_selection import train_test_split, cross_validate
from snowflake.ml.modeling.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from snowflake.ml.registry import Registry
import pandas as pd
import numpy as np
from datetime import datetime

# Get the active session through the public Snowpark context API instead of
# the private `snowpark.session._get_active_session()` helper, which is not
# part of the supported interface.
from snowflake.snowpark.context import get_active_session

session = get_active_session()

print("🤖 Snowflake ML Model Training Pipeline")
print(f"Database: {session.get_current_database()}")
print(f"Schema: {session.get_current_schema()}")
print(f"Warehouse: {session.get_current_warehouse()}")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Verify feature store availability: row count of the table itself.
fs_count = session.sql(
    "SELECT COUNT(*) as count FROM feature_store"
).collect()[0]['COUNT']

# Column count of the table. INFORMATION_SCHEMA.COLUMNS spans every schema in
# the current database, so the lookup is pinned to the current schema;
# otherwise a FEATURE_STORE table in another schema would inflate the number.
feature_count = session.sql(
    "SELECT COUNT(*) as feature_count "
    "FROM INFORMATION_SCHEMA.COLUMNS "
    "WHERE TABLE_NAME = 'FEATURE_STORE' "
    "AND TABLE_SCHEMA = CURRENT_SCHEMA()"
).collect()[0]['FEATURE_COUNT']

print(f"\nFeature Store Ready:")
print(f"📊 Training Records: {fs_count:,}")
print(f"🔧 Available Features: {feature_count}")


## Step 1: Data Preparation & Feature Selection


In [None]:
# Prepare training data with feature selection.
#
# Defines the feature lists, builds the training query / Snowpark DataFrame,
# and prints target distributions plus a basic missing-value check. Names
# defined here (numeric_features, categorical_features, training_data_sql,
# training_df, target_stats) are left in the notebook namespace for the
# cells that follow.
print("📋 Preparing training data and selecting features...")

# Define feature sets for different models
numeric_features = [
    'total_events_30d', 'web_visits_30d', 'email_opens_30d', 'email_clicks_30d',
    'engagement_frequency_30d', 'engagement_score_30d', 'days_since_last_activity',
    'age', 'annual_income', 'current_401k_balance', 'years_to_retirement',
    'total_assets_under_management', 'client_tenure_months',
    'income_to_age_ratio', 'assets_to_income_ratio', 'retirement_readiness_score',
    'wealth_growth_potential', 'premium_client_indicator',
    'service_tier_numeric', 'risk_tolerance_numeric', 'investment_experience_numeric',
    'total_lifetime_events', 'education_engagement', 'advisor_meetings_total',
    'web_preference_ratio', 'email_preference_ratio', 'mobile_adoption_score',
    'lifetime_engagement_frequency', 'business_priority_score'
]

categorical_features = [
    'lifecycle_stage', 'age_segment', 'tenure_segment'
]

# A row is usable for training only when all three targets are labelled.
# The same predicate is reused by the summary queries below so the reported
# statistics describe exactly the rows that end up in the training set
# (previously rows with a NULL churn_target were counted as churn negatives).
labelled_rows_filter = (
    "conversion_target IS NOT NULL "
    "AND churn_target IS NOT NULL "
    "AND next_best_action IS NOT NULL"
)

# Load and prepare training data
training_data_sql = f"""
SELECT 
    client_id,
    {', '.join(numeric_features)},
    {', '.join(categorical_features)},
    conversion_target,
    churn_target,
    next_best_action
FROM feature_store
WHERE {labelled_rows_filter}
"""

# Load data as Snowpark DataFrame (lazy: nothing is executed yet)
training_df = session.sql(training_data_sql)

print(f"✅ Training data prepared")
print(f"   🔢 Numeric features: {len(numeric_features)}")
print(f"   📝 Categorical features: {len(categorical_features)}")

# Show data distribution for targets.
# COALESCE keeps the counts at 0 instead of NULL when no labelled rows exist.
print("\n📊 Target variable distributions:")
target_stats = session.sql(f"""
    SELECT 
        COALESCE(SUM(conversion_target), 0) as conversion_positives,
        COUNT(*) - COALESCE(SUM(conversion_target), 0) as conversion_negatives,
        COALESCE(SUM(churn_target), 0) as churn_positives,
        COUNT(*) - COALESCE(SUM(churn_target), 0) as churn_negatives,
        COUNT(DISTINCT next_best_action) as action_classes,
        COUNT(*) as total_samples
    FROM feature_store
    WHERE {labelled_rows_filter}
""").collect()[0]

print(f"Conversion: {target_stats['CONVERSION_POSITIVES']} positive, {target_stats['CONVERSION_NEGATIVES']} negative")
print(f"Churn: {target_stats['CHURN_POSITIVES']} positive, {target_stats['CHURN_NEGATIVES']} negative")
print(f"Next Action: {target_stats['ACTION_CLASSES']} classes, {target_stats['TOTAL_SAMPLES']} total samples")

# Check for missing values in a small sample of key features (the first five
# numeric features); a row counts as "missing" if any of them is NULL.
print("\n🔍 Data quality check:")
key_features = numeric_features[:5]
any_key_feature_missing = " OR ".join(f"{feat} IS NULL" for feat in key_features)
session.sql(f"""
    SELECT 
        COUNT(*) as total_records,
        COUNT(CASE WHEN {any_key_feature_missing} THEN 1 END) as missing_key_features
    FROM feature_store
    WHERE {labelled_rows_filter}
""").show()


## Step 2: Train Conversion Prediction Models


In [None]:
# Train conversion prediction models (XGBoost vs. Random Forest) and keep the
# one with the better F1 score.
#
# Snowpark ML estimators are not called scikit-learn style: they are
# configured with input/label/output column names and fitted on ONE DataFrame
# that holds both features and label, and the metric functions take
# keyword-only `df` / `y_true_col_names` / `y_pred_col_names` arguments.
# Splitting features and label as two separate Snowpark DataFrames would also
# give no guarantee that rows stay aligned, so the split is done on whole rows.
from snowflake.ml.modeling.preprocessing import OrdinalEncoder

print("🎯 Training conversion prediction models...")

# Prepare data for conversion prediction
conversion_features = numeric_features + categorical_features
conversion_df = training_df.select(*conversion_features, 'conversion_target')

# Handle missing values: keep only rows where every model input is present.
# (Filtering just a subset of the features leaves NULLs that tree models
# without native missing-value support cannot be fitted on.)
all_inputs_present = " AND ".join(
    f"{feat} IS NOT NULL" for feat in conversion_features
)
conversion_clean_sql = f"""
CREATE OR REPLACE TEMPORARY TABLE conversion_training AS
SELECT *
FROM ({training_data_sql})
WHERE {all_inputs_present}
"""

session.sql(conversion_clean_sql).collect()
conversion_df_clean = session.table("conversion_training")

print("✅ Data cleaned for conversion prediction")

# Raw feature / label names as used in the SQL above
X_cols = [col for col in conversion_features if col != 'client_id']
y_col = 'conversion_target'

# Unquoted identifiers are stored upper-cased by Snowflake, so the column
# names handed to Snowpark ML are upper-cased to match the table exactly.
label_col = y_col.upper()
prediction_col = "CONVERSION_PREDICTION"
categorical_cols = [col.upper() for col in categorical_features]
encoded_categorical_cols = [f"{col}_ENCODED" for col in categorical_cols]
model_input_cols = [
    col.upper() for col in X_cols if col not in categorical_features
] + encoded_categorical_cols

# Row-wise 80/20 split; features and label stay together in each part.
train_df, test_df = conversion_df_clean.random_split([0.8, 0.2], seed=42)

# The categorical features are strings and must be turned into numbers before
# the tree models can use them. The encoder is fitted on the training split
# only; categories first seen in the test split are mapped to -1.
conversion_encoder = OrdinalEncoder(
    input_cols=categorical_cols,
    output_cols=encoded_categorical_cols,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
train_df = conversion_encoder.fit(train_df).transform(train_df)
test_df = conversion_encoder.transform(test_df)

# Feature-only / label-only views of the two splits, kept under the original
# names for any later cell that refers to them.
X_train, y_train = train_df.select(*model_input_cols), train_df.select(label_col)
X_test, y_test = test_df.select(*model_input_cols), test_df.select(label_col)

print(f"✅ Train/test split completed")


def _fit_and_score(model):
    """Fit `model` on train_df, predict on test_df and score the predictions.

    Returns the prediction DataFrame and an
    (accuracy, precision, recall, f1) tuple.
    NOTE(review): assumes conversion_target is a numeric 0/1 label, as
    suggested by the SUM() applied to it upstream - confirm.
    """
    model.fit(train_df)
    predictions = model.predict(test_df)
    metric_args = {
        "df": predictions,
        "y_true_col_names": label_col,
        "y_pred_col_names": prediction_col,
    }
    scores = tuple(
        metric(**metric_args)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    return predictions, scores


def _print_scores(title, scores):
    """Print the four evaluation metrics under the given title line."""
    accuracy, precision, recall, f1 = scores
    print(title)
    print(f"   📊 Accuracy: {accuracy:.4f}")
    print(f"   📊 Precision: {precision:.4f}")
    print(f"   📊 Recall: {recall:.4f}")
    print(f"   📊 F1-Score: {f1:.4f}")


# Train XGBoost model for conversion prediction
print("\n🌲 Training XGBoost for conversion prediction...")

xgb_conversion = XGBClassifier(
    input_cols=model_input_cols,
    label_cols=[label_col],
    output_cols=[prediction_col],
    max_depth=6,
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

conversion_predictions, xgb_scores = _fit_and_score(xgb_conversion)
conversion_probabilities = xgb_conversion.predict_proba(test_df)
conv_accuracy, conv_precision, conv_recall, conv_f1 = xgb_scores

_print_scores("✅ XGBoost Conversion Model Results:", xgb_scores)

# Train Random Forest for comparison
print("\n🌳 Training Random Forest for conversion prediction...")

rf_conversion = RandomForestClassifier(
    input_cols=model_input_cols,
    label_cols=[label_col],
    output_cols=[prediction_col],
    n_estimators=100,
    max_depth=10,
    random_state=42,
)

rf_predictions, rf_scores = _fit_and_score(rf_conversion)
rf_accuracy, rf_precision, rf_recall, rf_f1 = rf_scores

_print_scores("✅ Random Forest Conversion Model Results:", rf_scores)

# Select best model by F1 score (ties go to XGBoost)
if conv_f1 >= rf_f1:
    best_conversion_model = xgb_conversion
    best_conv_score = conv_f1
    best_conv_name = "XGBoost"
else:
    best_conversion_model = rf_conversion
    best_conv_score = rf_f1
    best_conv_name = "RandomForest"

print(f"\n🏆 Best Conversion Model: {best_conv_name} (F1: {best_conv_score:.4f})")
