In [None]:
import streamlit as st

# Show the Snowflake logo at a fixed width of 300 px.
# NOTE(review): the file name has no directory part, so it is presumably resolved
# against the notebook's working directory — confirm the SVG is deployed there.
logo_file = "Snowflake_Logo.svg"
st.image(logo_file, width=300)

# Lereta Intelligence Agent - ML Models

**Training 3 Machine Learning Models for Tax & Flood Intelligence**

This notebook trains 3 ML models for the Lereta Intelligence Agent:
1. **TAX_DELINQUENCY_PREDICTOR** - Predicts property tax delinquency risk
2. **CLIENT_CHURN_PREDICTOR** - Predicts client churn risk
3. **LOAN_RISK_CLASSIFIER** - Classifies loans by risk level (LOW/MEDIUM/HIGH)

---

## Prerequisites
- Database: `LERETA_INTELLIGENCE`
- Schema: `ML_MODELS`
- Warehouse: `LERETA_WH`
- Feature views created (V_TAX_DELINQUENCY_FEATURES, V_CLIENT_CHURN_FEATURES, V_LOAN_RISK_FEATURES)
- Packages: `snowflake-ml-python`, `scikit-learn`, `pandas`, `streamlit`

In [None]:
import os

# Diagnostic: print the entries of the current working directory.
# NOTE(review): this looks like leftover debugging output — consider removing it
# before the notebook is shared.
working_dir_entries = os.listdir(os.curdir)
print(working_dir_entries)

## Setup and Imports

In [None]:
# Import required libraries
import warnings

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from snowflake.ml.modeling.ensemble import RandomForestClassifier
from snowflake.ml.modeling.linear_model import LogisticRegression
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import OneHotEncoder, StandardScaler
from snowflake.ml.registry import Registry
from snowflake.snowpark import Session

# Install a blanket "ignore" filter: no category, message or module restriction
# is given, so it applies to every warning raised from here on.
warnings.filterwarnings(action="ignore")

print("✅ Libraries imported successfully")

In [None]:
# Reuse the active Snowpark session, or build one if none exists yet.
session = Session.builder.getOrCreate()

# Point the session at the database, schema and warehouse this notebook works in.
session.use_database("LERETA_INTELLIGENCE")
session.use_schema("ML_MODELS")
session.use_warehouse("LERETA_WH")

print("✅ Session configured")

# Echo the context as the session reports it, so a wrong database/schema/warehouse
# is visible immediately.
for context_label, context_value in (
    ("Database", session.get_current_database()),
    ("Schema", session.get_current_schema()),
    ("Warehouse", session.get_current_warehouse()),
):
    print(f"{context_label}: {context_value}")

In [None]:
# Open the Model Registry that every trained pipeline in this notebook is logged to.
registry_location = {
    "database_name": "LERETA_INTELLIGENCE",
    "schema_name": "ML_MODELS",
}
registry = Registry(session=session, **registry_location)

print("✅ Model Registry initialized")

---
## Model 1: Tax Delinquency Predictor

**Objective**: Predict property tax delinquency risk  
**Labels**: 0=Not Delinquent, 1=Delinquent  
**Algorithm**: Random Forest Classifier  
**Features**: Property type, tax amount, jurisdiction, payment history

In [None]:
# Feature view that supplies the tax delinquency training data.
tax_df = session.table("LERETA_INTELLIGENCE.ANALYTICS.V_TAX_DELINQUENCY_FEATURES")

# Report how many rows the view returns, then preview the first five.
tax_row_count = tax_df.count()
print(f"✅ Loaded {tax_row_count} records for tax delinquency prediction")
tax_df.show(5)

In [None]:
# 80/20 train/test split, with seed=42 passed to the splitter.
tax_train_part, tax_test_part = tax_df.random_split([0.8, 0.2], seed=42)

# TAX_RECORD_ID is an identifier column, not a feature: remove it from both parts.
train_tax = tax_train_part.drop("TAX_RECORD_ID")
test_tax = tax_test_part.drop("TAX_RECORD_ID")

for split_name, split_df in (("Training", train_tax), ("Test", test_tax)):
    print(f"{split_name} set: {split_df.count()} records")

In [None]:
# Tax delinquency pipeline: one-hot encode the categorical columns, then fit a
# deliberately small random forest (3 trees, depth 3, no scaling step) so the
# cell stays within the <10s execution target.
tax_categorical_cols = [
    "PROPERTY_TYPE",
    "FLOOD_ZONE",
    "JURISDICTION_TYPE",
    "LOAN_TYPE",
    "CLIENT_TYPE",
    "LOAN_STATUS",
    "CLIENT_STATUS",
]

tax_encoder = OneHotEncoder(
    input_cols=tax_categorical_cols,
    # Each encoded column reuses its source column name with an _ENC suffix.
    output_cols=[f"{col}_ENC" for col in tax_categorical_cols],
    drop_input_cols=True,
    handle_unknown="ignore",
)

tax_classifier = RandomForestClassifier(
    label_cols=["ACTUAL_DELINQUENT"],
    output_cols=["PREDICTED_DELINQUENT"],
    n_estimators=3,
    max_depth=3,
    random_state=42,
)

tax_pipeline = Pipeline([
    ("Encoder", tax_encoder),
    ("Classifier", tax_classifier),
])

print("✅ Tax delinquency pipeline created (optimized for speed)")

In [None]:
# Fit the encoder and classifier steps of tax_pipeline on the training split.
# The return value of fit() is not captured; later cells call
# tax_pipeline.predict and registry.log_model on this same object.
print("Training tax delinquency prediction model...")
tax_pipeline.fit(train_tax)
print("✅ Tax delinquency model trained")

In [None]:
# Score the held-out split and bring only the label and prediction columns
# into pandas for metric computation.
tax_test_predictions = tax_pipeline.predict(test_tax)
tax_test_results = tax_test_predictions.select(
    "ACTUAL_DELINQUENT", "PREDICTED_DELINQUENT"
).to_pandas()

# accuracy_score / classification_report are imported in the notebook's import
# cell, so this cell no longer carries its own mid-notebook import.
tax_y_true = tax_test_results["ACTUAL_DELINQUENT"]
tax_y_pred = tax_test_results["PREDICTED_DELINQUENT"]
tax_accuracy = accuracy_score(tax_y_true, tax_y_pred)

print(f"Test Accuracy: {tax_accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(tax_y_true, tax_y_pred))

In [None]:
# Remove any previously registered model so log_model below registers it fresh.
# Deletion is best-effort: a failure (presumably because the model does not exist
# yet) must not stop the notebook, but the reason is printed instead of hidden.
try:
    registry.delete_model("TAX_DELINQUENCY_PREDICTOR")
    print("✅ Deleted existing TAX_DELINQUENCY_PREDICTOR")
except Exception as exc:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
    print(f"No existing model to delete ({type(exc).__name__}: {exc})")

# Register model in Model Registry.
# The sample input is built from features only (label column dropped) and is
# capped at 100 rows; the model signature should only include features.
sample_data = train_tax.drop("ACTUAL_DELINQUENT").limit(100)

registry.log_model(
    model=tax_pipeline,
    model_name="TAX_DELINQUENCY_PREDICTOR",
    target_platforms=['WAREHOUSE'],
    sample_input_data=sample_data,
    comment="Predicts property tax delinquency risk"
)

print("✅ TAX_DELINQUENCY_PREDICTOR registered in Model Registry")

---
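## Model 2: Campaign ROI Predictor

> **Note:** This section does not match the notebook introduction, which lists Model 2 as `CLIENT_CHURN_PREDICTOR` (feature view `V_CLIENT_CHURN_FEATURES`). The cells below read `HOOTSUITE_INTELLIGENCE.ANALYTICS.V_CAMPAIGN_ROI_FEATURES` and register `CAMPAIGN_ROI_PREDICTOR`. **TODO:** reconcile the two before running this notebook in the Lereta environment.

**Objective**: Predict campaign ROI likelihood  
**Labels**: 0=Low ROI, 1=Medium ROI, 2=High ROI  
**Algorithm**: Logistic Regression  
**Features**: Objective, budget, duration, posts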

In [None]:
# Feature view that supplies the campaign ROI training data.
# NOTE(review): this reads from the HOOTSUITE_INTELLIGENCE database, while the
# session and registry are configured for LERETA_INTELLIGENCE — confirm the
# source is intended.
roi_df = session.table("HOOTSUITE_INTELLIGENCE.ANALYTICS.V_CAMPAIGN_ROI_FEATURES")

# Report how many rows the view returns, then preview the first five.
roi_row_count = roi_df.count()
print(f"✅ Loaded {roi_row_count} records for ROI prediction")
roi_df.show(5)

In [None]:
# 80/20 train/test split, with seed=42 passed to the splitter.
roi_train_part, roi_test_part = roi_df.random_split([0.8, 0.2], seed=42)

# CAMPAIGN_ID is removed from both parts before training.
train_roi = roi_train_part.drop("CAMPAIGN_ID")
test_roi = roi_test_part.drop("CAMPAIGN_ID")

for split_name, split_df in (("Training", train_roi), ("Test", test_roi)):
    print(f"{split_name} set: {split_df.count()} records")

In [None]:
# ROI pipeline: one-hot encode OBJECTIVE, then fit a logistic regression capped at
# 100 iterations (no scaling step) to stay within the <10s execution target.
roi_categorical_cols = ["OBJECTIVE"]

roi_encoder = OneHotEncoder(
    input_cols=roi_categorical_cols,
    # The encoded column reuses the source column name with an _ENC suffix.
    output_cols=[f"{col}_ENC" for col in roi_categorical_cols],
    drop_input_cols=True,
    handle_unknown="ignore",
)

roi_classifier = LogisticRegression(
    label_cols=["ROI_LABEL"],
    output_cols=["PREDICTED_ROI"],
    max_iter=100,
)

roi_pipeline = Pipeline([
    ("Encoder", roi_encoder),
    ("Classifier", roi_classifier),
])

print("✅ ROI prediction pipeline created (optimized for speed)")

In [None]:
# Fit the encoder and classifier steps of roi_pipeline on the training split.
# The return value of fit() is not captured; a later cell passes roi_pipeline
# itself to registry.log_model.
print("Training ROI prediction model...")
roi_pipeline.fit(train_roi)
print("✅ ROI prediction model trained")

In [None]:
# No evaluation is run for the ROI model: this cell only prints a status message,
# and test_roi is never scored.
# NOTE(review): the other two models report test accuracy and a classification
# report — consider adding the same here.
print("✅ Skipping evaluation, registering model...")

In [None]:
# Remove any previously registered model so log_model below registers it fresh.
# Deletion is best-effort: a failure (presumably because the model does not exist
# yet) must not stop the notebook, but the reason is printed instead of hidden.
try:
    registry.delete_model("CAMPAIGN_ROI_PREDICTOR")
    print("✅ Deleted existing CAMPAIGN_ROI_PREDICTOR")
except Exception as exc:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
    print(f"No existing model to delete ({type(exc).__name__}: {exc})")

# Register model.
# The sample input is built from features only (label column dropped) and is
# capped at 100 rows; the model signature should only include features.
sample_data = train_roi.drop("ROI_LABEL").limit(100)

registry.log_model(
    model=roi_pipeline,
    model_name="CAMPAIGN_ROI_PREDICTOR",
    target_platforms=['WAREHOUSE'],
    sample_input_data=sample_data,
    comment="Predicts campaign ROI with 3 outcomes (Low/Medium/High)"
)

print("✅ CAMPAIGN_ROI_PREDICTOR registered in Model Registry")

---
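## Model 3: Ticket Priority Classifier

> **Note:** This section does not match the notebook introduction, which lists Model 3 as `LOAN_RISK_CLASSIFIER` (feature view `V_LOAN_RISK_FEATURES`). The cells below read `HOOTSUITE_INTELLIGENCE.ANALYTICS.V_TICKET_PRIORITY_FEATURES` and register `TICKET_PRIORITY_CLASSIFIER`. **TODO:** reconcile the two before running this notebook in the Lereta environment.

**Objective**: Classify support ticket priority  
**Labels**: 0=Low, 1=Medium, 2=High, 3=Urgent  
**Algorithm**: Random Forest Classifier  
**Features**: Category, issue summary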

In [None]:
# Feature view that supplies the ticket priority training data.
# NOTE(review): this reads from the HOOTSUITE_INTELLIGENCE database, while the
# session and registry are configured for LERETA_INTELLIGENCE — confirm the
# source is intended.
ticket_df = session.table("HOOTSUITE_INTELLIGENCE.ANALYTICS.V_TICKET_PRIORITY_FEATURES")

# Report how many rows the view returns, then preview the first five.
ticket_row_count = ticket_df.count()
print(f"✅ Loaded {ticket_row_count} records for ticket priority classification")
ticket_df.show(5)

In [None]:
# 80/20 train/test split, with seed=42 passed to the splitter.
ticket_train_part, ticket_test_part = ticket_df.random_split([0.8, 0.2], seed=42)

# TICKET_ID is removed from both parts before training.
train_ticket = ticket_train_part.drop("TICKET_ID")
test_ticket = ticket_test_part.drop("TICKET_ID")

for split_name, split_df in (("Training", train_ticket), ("Test", test_ticket)):
    print(f"{split_name} set: {split_df.count()} records")

In [None]:
# Ticket priority pipeline: one-hot encode CATEGORY, then fit a deliberately small
# random forest (3 trees, depth 3, no scaling step) to stay within the <10s
# execution target.
ticket_categorical_cols = ["CATEGORY"]

ticket_encoder = OneHotEncoder(
    input_cols=ticket_categorical_cols,
    # The encoded column reuses the source column name with an _ENC suffix.
    output_cols=[f"{col}_ENC" for col in ticket_categorical_cols],
    drop_input_cols=True,
    handle_unknown="ignore",
)

ticket_classifier = RandomForestClassifier(
    label_cols=["PRIORITY_LABEL"],
    output_cols=["PREDICTED_PRIORITY"],
    n_estimators=3,
    max_depth=3,
    random_state=42,
)

ticket_pipeline = Pipeline([
    ("Encoder", ticket_encoder),
    ("Classifier", ticket_classifier),
])

print("✅ Ticket priority pipeline created (optimized for speed)")

In [None]:
# Fit the encoder and classifier steps of ticket_pipeline on the training split.
# The return value of fit() is not captured; later cells call
# ticket_pipeline.predict and registry.log_model on this same object.
print("Training ticket priority model...")
ticket_pipeline.fit(train_ticket)
print("✅ Ticket priority model trained")

In [None]:
# Score the held-out tickets and bring only the label and prediction columns
# into pandas for metric computation.
test_predictions = ticket_pipeline.predict(test_ticket)
label_and_prediction = test_predictions.select("PRIORITY_LABEL", "PREDICTED_PRIORITY")
test_results = label_and_prediction.to_pandas()

priority_true = test_results["PRIORITY_LABEL"]
priority_pred = test_results["PREDICTED_PRIORITY"]

# accuracy_score / classification_report must already be imported by an
# earlier cell; this cell does not import them itself.
accuracy = accuracy_score(priority_true, priority_pred)

print(f"Test Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(priority_true, priority_pred))

In [None]:
# Remove any previously registered model so log_model below registers it fresh.
# Deletion is best-effort: a failure (presumably because the model does not exist
# yet) must not stop the notebook, but the reason is printed instead of hidden.
try:
    registry.delete_model("TICKET_PRIORITY_CLASSIFIER")
    print("✅ Deleted existing TICKET_PRIORITY_CLASSIFIER")
except Exception as exc:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
    print(f"No existing model to delete ({type(exc).__name__}: {exc})")

# Register model.
# The sample input is built from features only (label column dropped) and is
# capped at 100 rows; the model signature should only include features.
sample_data = train_ticket.drop("PRIORITY_LABEL").limit(100)

registry.log_model(
    model=ticket_pipeline,
    model_name="TICKET_PRIORITY_CLASSIFIER",
    target_platforms=['WAREHOUSE'],
    sample_input_data=sample_data,
    comment="Classifies ticket priority with 4 levels (Low/Medium/High/Urgent)"
)

print("✅ TICKET_PRIORITY_CLASSIFIER registered in Model Registry")

---
## Summary and Verification

In [None]:
# List all registered models. The schema is fully qualified with the same
# database the registry was opened on, so the listing does not depend on the
# session's current database.
models = session.sql("SHOW MODELS IN SCHEMA LERETA_INTELLIGENCE.ML_MODELS").collect()

banner = "=" * 80

print("\n" + banner)
print("REGISTERED MODELS")
print(banner)
for model in models:
    print(f"✅ {model['name']}")

print("\n" + banner)
print("MODEL TRAINING COMPLETE")
print(banner)
print("\n3 ML models successfully trained and registered:")
# Names below are the model_name values this notebook passes to registry.log_model.
print("1. TAX_DELINQUENCY_PREDICTOR - Predicts tax delinquency (2 classes)")
print("2. CAMPAIGN_ROI_PREDICTOR - Predicts ROI (3 classes)")
print("3. TICKET_PRIORITY_CLASSIFIER - Classifies priority (4 classes)")
print("\nNext steps:")
# NOTE(review): these file names carry a hootsuite_ prefix although the notebook is
# titled for the Lereta agent — confirm the correct script names.
print("1. Run hootsuite_07_ml_model_functions.sql to create SQL procedures")
print("2. Run hootsuite_08_intelligence_agent.sql to configure agent")
print("3. Test agent with sample questions from hootsuite_questions.md")