In [None]:
# Show the Snowflake logo at the top of the notebook.
import streamlit as st

LOGO_FILE = "Snowflake_Logo.svg"  # relative path to the logo image
LOGO_WIDTH = 300                  # width passed to st.image
st.image(LOGO_FILE, width=LOGO_WIDTH)

# Lereta Intelligence Agent - ML Models

**Training 3 Machine Learning Models for Tax & Flood Intelligence**

This notebook trains 3 ML models for the Lereta Intelligence Agent:
1. **TAX_DELINQUENCY_PREDICTOR** - Predicts property tax delinquency risk
2. **CLIENT_CHURN_PREDICTOR** - Predicts client churn risk
3. **LOAN_RISK_CLASSIFIER** - Classifies loans by risk level (LOW/MEDIUM/HIGH)

---

## Prerequisites
- Database: `LERETA_INTELLIGENCE`
- Schema: `ML_MODELS`
- Feature views created in `LERETA_INTELLIGENCE.ANALYTICS` (V_TAX_DELINQUENCY_FEATURES, V_CLIENT_CHURN_FEATURES, V_LOAN_RISK_FEATURES)
- Packages: `snowflake-ml-python`, `scikit-learn`, `pandas`

In [None]:
# Debug aid: print the entries of the current working directory.
import os

working_dir_entries = os.listdir()  # no argument means the current directory
print(working_dir_entries)

## Setup and Imports

In [None]:
# Import required libraries
import warnings

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from snowflake.ml.modeling.ensemble import RandomForestClassifier
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import OneHotEncoder
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.registry import Registry
from snowflake.snowpark import Session

warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")

In [None]:
# Obtain a Snowpark session via the builder's getOrCreate()
session = Session.builder.getOrCreate()

# Point the session at the database, schema and warehouse used by this notebook
session.use_database("LERETA_INTELLIGENCE")
session.use_schema("ML_MODELS")
session.use_warehouse("LERETA_WH")

print("✅ Session configured")
# Echo what the session reports back, one "<label>: <value>" line per setting
for context_label, read_current in (
    ("Database", session.get_current_database),
    ("Schema", session.get_current_schema),
    ("Warehouse", session.get_current_warehouse),
):
    print(f"{context_label}: {read_current()}")

In [None]:
# Open the Model Registry in the database/schema where the models will be logged
registry_location = {
    "database_name": "LERETA_INTELLIGENCE",
    "schema_name": "ML_MODELS",
}
registry = Registry(session=session, **registry_location)

print("✅ Model Registry initialized")

---
## Model 1: Tax Delinquency Predictor

**Objective**: Predict property tax delinquency risk  
**Labels**: 0=Not Delinquent, 1=Delinquent  
**Algorithm**: Random Forest Classifier  
**Features**: Property type, tax amount, jurisdiction, payment history

In [None]:
# Feature data for the tax delinquency model
tax_df = session.table("LERETA_INTELLIGENCE.ANALYTICS.V_TAX_DELINQUENCY_FEATURES")

# Count once, report, then preview five rows
tax_row_count = tax_df.count()
print(f"✅ Loaded {tax_row_count} records for tax delinquency prediction")
tax_df.show(5)

In [None]:
# 80/20 train/test split with a fixed seed; the record ID column is removed
# from both halves because it is not needed for training
train_tax, test_tax = (
    split.drop("TAX_RECORD_ID")
    for split in tax_df.random_split([0.8, 0.2], seed=42)
)

# Report the size of each half
for split_name, split_df in (("Training set", train_tax), ("Test set", test_tax)):
    print(f"{split_name}: {split_df.count()} records")

In [None]:
# Tax delinquency pipeline, kept deliberately small (3 trees, depth 3, no
# scaling step) with a target of <10s execution
tax_categorical_cols = [
    "PROPERTY_TYPE",
    "FLOOD_ZONE",
    "JURISDICTION_TYPE",
    "LOAN_TYPE",
    "CLIENT_TYPE",
    "LOAN_STATUS",
    "CLIENT_STATUS",
]

# One-hot encode the categorical columns; each output column is named after
# its source column plus an "_ENC" suffix
tax_encoder = OneHotEncoder(
    input_cols=tax_categorical_cols,
    output_cols=[f"{col}_ENC" for col in tax_categorical_cols],
    drop_input_cols=True,
    handle_unknown="ignore",
)

# Random forest reading ACTUAL_DELINQUENT as the label and writing its
# prediction to PREDICTED_DELINQUENT
tax_classifier = RandomForestClassifier(
    label_cols=["ACTUAL_DELINQUENT"],
    output_cols=["PREDICTED_DELINQUENT"],
    n_estimators=3,
    max_depth=3,
    random_state=42,
)

tax_pipeline = Pipeline([
    ("Encoder", tax_encoder),
    ("Classifier", tax_classifier),
])

print("✅ Tax delinquency pipeline created (optimized for speed)")

In [None]:
# Train the tax delinquency prediction model:
# fit tax_pipeline on the training split (train_tax), with a progress message
# before and a confirmation after the call.
print("Training tax delinquency prediction model...")
tax_pipeline.fit(train_tax)
print("✅ Tax delinquency model trained")

In [None]:
# Evaluate the tax delinquency model on the held-out test split.
# accuracy_score and classification_report are imported in the notebook's
# import cell, so this cell does not carry its own mid-notebook import.
test_predictions = tax_pipeline.predict(test_tax)

# Only the true label and the prediction are pulled into pandas for scoring
test_results = test_predictions.select("ACTUAL_DELINQUENT", "PREDICTED_DELINQUENT").to_pandas()
actual_delinquent = test_results['ACTUAL_DELINQUENT']
predicted_delinquent = test_results['PREDICTED_DELINQUENT']

accuracy = accuracy_score(actual_delinquent, predicted_delinquent)

print(f"Test Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(actual_delinquent, predicted_delinquent))

In [None]:
# Delete any existing registration so the model is logged fresh.
# Best-effort: a failure here must not stop the notebook. Only ordinary errors
# are caught (a bare `except:` would also swallow KeyboardInterrupt and
# SystemExit), and the cause is printed so that "model absent" can be told
# apart from, for example, a permissions problem.
try:
    registry.delete_model("TAX_DELINQUENCY_PREDICTOR")
    print("✅ Deleted existing TAX_DELINQUENCY_PREDICTOR")
except Exception as delete_error:
    print(f"No existing model to delete ({type(delete_error).__name__}: {delete_error})")

# Register model in Model Registry
# Drop label column from sample data - model signature should only include features
sample_data = train_tax.drop("ACTUAL_DELINQUENT").limit(100)

registry.log_model(
    model=tax_pipeline,
    model_name="TAX_DELINQUENCY_PREDICTOR",
    target_platforms=['WAREHOUSE'],
    sample_input_data=sample_data,
    comment="Predicts property tax delinquency risk"
)

print("✅ TAX_DELINQUENCY_PREDICTOR registered in Model Registry")

---
## Model 2: Client Churn Predictor

**Objective**: Predict client churn risk  
**Labels**: 0=Low, 1=Medium, 2=High  
**Algorithm**: XGBoost Classifier  
**Features**: Client type, subscription tier, support metrics, revenue

In [None]:
# Feature data for the client churn model
churn_df = session.table("LERETA_INTELLIGENCE.ANALYTICS.V_CLIENT_CHURN_FEATURES")

# Count once, report, then preview five rows
churn_row_count = churn_df.count()
print(f"✅ Loaded {churn_row_count} records for client churn prediction")
churn_df.show(5)

In [None]:
# 80/20 train/test split with a fixed seed; CLIENT_ID is removed from both halves
train_churn, test_churn = (
    split.drop("CLIENT_ID")
    for split in churn_df.random_split([0.8, 0.2], seed=42)
)

# Report the size of each half
for split_name, split_df in (("Training set", train_churn), ("Test set", test_churn)):
    print(f"{split_name}: {split_df.count()} records")

In [None]:
# Client churn pipeline, kept small for speed (3 boosting rounds, depth 3)
churn_categorical_cols = [
    "CLIENT_TYPE",
    "SERVICE_TYPE",
    "SUBSCRIPTION_TIER",
    "BILLING_CYCLE",
]

# One-hot encode the categorical columns; each output column is named after
# its source column plus an "_ENC" suffix
churn_encoder = OneHotEncoder(
    input_cols=churn_categorical_cols,
    output_cols=[f"{col}_ENC" for col in churn_categorical_cols],
    drop_input_cols=True,
    handle_unknown="ignore",
)

# XGBoost classifier reading CHURN_RISK_LABEL as the label and writing its
# prediction to PREDICTED_RISK
churn_classifier = XGBClassifier(
    label_cols=["CHURN_RISK_LABEL"],
    output_cols=["PREDICTED_RISK"],
    n_estimators=3,
    max_depth=3,
    random_state=42,
)

churn_pipeline = Pipeline([
    ("Encoder", churn_encoder),
    ("Classifier", churn_classifier),
])

print("✅ Client churn pipeline created (optimized for speed)")

In [None]:
# Train the client churn prediction model:
# fit churn_pipeline on the training split (train_churn), with a progress
# message before and a confirmation after the call.
print("Training client churn prediction model...")
churn_pipeline.fit(train_churn)
print("✅ Client churn model trained")

In [None]:
# Evaluation skipped - proceed directly to registration
# NOTE(review): unlike the tax delinquency and loan risk models, no test-set
# metrics are computed for the churn model; test_churn is never scored in the
# notebook as shown. The reason for skipping is not recorded here.
print("✅ Skipping evaluation, registering model...")

In [None]:
# Delete any existing registration so the model is logged fresh.
# Best-effort: a failure here must not stop the notebook. Only ordinary errors
# are caught (a bare `except:` would also swallow KeyboardInterrupt and
# SystemExit), and the cause is printed so that "model absent" can be told
# apart from, for example, a permissions problem.
try:
    registry.delete_model("CLIENT_CHURN_PREDICTOR")
    print("✅ Deleted existing CLIENT_CHURN_PREDICTOR")
except Exception as delete_error:
    print(f"No existing model to delete ({type(delete_error).__name__}: {delete_error})")

# Register model
# Drop label column from sample data so the sample contains feature columns only
sample_data = train_churn.drop("CHURN_RISK_LABEL").limit(100)

registry.log_model(
    model=churn_pipeline,
    model_name="CLIENT_CHURN_PREDICTOR",
    target_platforms=['WAREHOUSE'],
    sample_input_data=sample_data,
    comment="Predicts client churn risk (0=Low, 1=Medium, 2=High)"
)

print("✅ CLIENT_CHURN_PREDICTOR registered in Model Registry")

---
## Model 3: Loan Risk Classifier

**Objective**: Classify loan risk level  
**Labels**: LOW, MEDIUM, HIGH  
**Algorithm**: Random Forest Classifier  
**Features**: Loan details, flood zone, tax compliance

In [None]:
# Feature data for the loan risk classifier
risk_df = session.table("LERETA_INTELLIGENCE.ANALYTICS.V_LOAN_RISK_FEATURES")

# Count once, report, then preview five rows
risk_row_count = risk_df.count()
print(f"✅ Loaded {risk_row_count} records for loan risk classification")
risk_df.show(5)

In [None]:
# 80/20 train/test split with a fixed seed; LOAN_ID is removed from both halves
train_risk, test_risk = (
    split.drop("LOAN_ID")
    for split in risk_df.random_split([0.8, 0.2], seed=42)
)

# Report the size of each half
for split_name, split_df in (("Training set", train_risk), ("Test set", test_risk)):
    print(f"{split_name}: {split_df.count()} records")

In [None]:
# Loan risk pipeline, kept small for speed (3 trees, depth 3)
risk_categorical_cols = [
    "LOAN_TYPE",
    "LOAN_STATUS",
    "PROPERTY_TYPE",
    "FLOOD_ZONE",
    "PROPERTY_STATE",
    "JURISDICTION_TYPE",
    "CLIENT_TYPE",
]

# One-hot encode the categorical columns; each output column is named after
# its source column plus an "_ENC" suffix
risk_encoder = OneHotEncoder(
    input_cols=risk_categorical_cols,
    output_cols=[f"{col}_ENC" for col in risk_categorical_cols],
    drop_input_cols=True,
    handle_unknown="ignore",
)

# Random forest reading RISK_LEVEL as the label and writing its prediction
# to PREDICTED_RISK
risk_classifier = RandomForestClassifier(
    label_cols=["RISK_LEVEL"],
    output_cols=["PREDICTED_RISK"],
    n_estimators=3,
    max_depth=3,
    random_state=42,
)

risk_pipeline = Pipeline([
    ("Encoder", risk_encoder),
    ("Classifier", risk_classifier),
])

print("✅ Loan risk pipeline created (optimized for speed)")

In [None]:
# Train the loan risk classification model:
# fit risk_pipeline on the training split (train_risk), with a progress
# message before and a confirmation after the call.
print("Training loan risk classification model...")
risk_pipeline.fit(train_risk)
print("✅ Loan risk model trained")

In [None]:
# Score the loan risk model on the held-out test split
test_predictions = risk_pipeline.predict(test_risk)
test_results = test_predictions.select("RISK_LEVEL", "PREDICTED_RISK").to_pandas()

# Name the label and prediction columns once and reuse them for both metrics
actual_risk = test_results['RISK_LEVEL']
predicted_risk = test_results['PREDICTED_RISK']
accuracy = accuracy_score(actual_risk, predicted_risk)

print(f"Test Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(actual_risk, predicted_risk))

In [None]:
# Delete any existing registration so the model is logged fresh.
# Best-effort: a failure here must not stop the notebook. Only ordinary errors
# are caught (a bare `except:` would also swallow KeyboardInterrupt and
# SystemExit), and the cause is printed so that "model absent" can be told
# apart from, for example, a permissions problem.
try:
    registry.delete_model("LOAN_RISK_CLASSIFIER")
    print("✅ Deleted existing LOAN_RISK_CLASSIFIER")
except Exception as delete_error:
    print(f"No existing model to delete ({type(delete_error).__name__}: {delete_error})")

# Register model
# Drop label column from sample data so the sample contains feature columns only
sample_data = train_risk.drop("RISK_LEVEL").limit(100)

registry.log_model(
    model=risk_pipeline,
    model_name="LOAN_RISK_CLASSIFIER",
    target_platforms=['WAREHOUSE'],
    sample_input_data=sample_data,
    comment="Classifies loans by risk level (LOW/MEDIUM/HIGH)"
)

print("✅ LOAN_RISK_CLASSIFIER registered in Model Registry")

---
## Summary and Verification

In [None]:
# Models this notebook is expected to have registered, each with the one-line
# description printed in the final summary (insertion order = print order)
expected_models = {
    "TAX_DELINQUENCY_PREDICTOR": "Predicts tax delinquency risk",
    "CLIENT_CHURN_PREDICTOR": "Predicts client churn risk",
    "LOAN_RISK_CLASSIFIER": "Classifies loan risk level",
}

# List all registered models
models = session.sql("SHOW MODELS IN SCHEMA ML_MODELS").collect()
registered_names = [model['name'] for model in models]

print("\n" + "="*80)
print("REGISTERED MODELS")
print("="*80)
for registered_name in registered_names:
    print(f"✅ {registered_name}")

# Check the expected names (case-insensitively) against what SHOW MODELS
# returned, so the success banner is printed only when all of them are present
registered_upper = {str(name).upper() for name in registered_names}
missing_models = [name for name in expected_models if name not in registered_upper]

print("\n" + "="*80)
if missing_models:
    print("MODEL TRAINING INCOMPLETE")
    print("="*80)
    print(f"\n⚠️ Not found in schema ML_MODELS: {', '.join(missing_models)}")
    print("Re-run the corresponding training and registration cells before continuing.")
else:
    print("MODEL TRAINING COMPLETE")
    print("="*80)
    print(f"\n{len(expected_models)} ML models successfully trained and registered:")
    for position, (model_name, description) in enumerate(expected_models.items(), start=1):
        print(f"{position}. {model_name} - {description}")
    print("\nNext steps:")
    print("1. Run sql/ml/07_ml_model_wrappers.sql to create SQL functions")
    print("2. Run sql/agent/08_create_ai_agent.sql to configure agent")
    print("3. Test agent with sample questions from docs/questions.md")