In [None]:
import os

import streamlit as st

# Decorative header logo, resolved relative to the current working directory.
# The image is purely cosmetic, so when the asset is absent the cell reports it
# and moves on instead of raising and aborting a Restart-and-Run-All before any
# model work has started.
if os.path.isfile("Snowflake_Logo.svg"):
    st.image("Snowflake_Logo.svg", width=300)
else:
    print("Snowflake_Logo.svg not found in the working directory; skipping header logo")

# Origence Intelligence Agent - ML Models

**Training 3 Machine Learning Models for Credit Union Lending**

This notebook trains 3 ML models for the Origence Intelligence Agent:
1. **LOAN_DEFAULT_PREDICTOR** - Predicts loan default risk (4 classes)
2. **LOAN_APPROVAL_PREDICTOR** - Predicts loan approval likelihood (3 classes)
3. **FRAUD_DETECTION_MODEL** - Detects fraudulent applications (3 classes)

---

## Prerequisites
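- Database: `ORIGENCE_INTELLIGENCE`
- Schema: `ML_MODELS` (where the trained models are registered)
- Warehouse: `ORIGENCE_WH`
- Feature views created in the `ANALYTICS` schema (`V_LOAN_DEFAULT_FEATURES`, `V_LOAN_APPROVAL_FEATURES`, `V_FRAUD_DETECTION_FEATURES`)
- Packages: `snowflake-ml-python`, `scikit-learn`, `pandas`, `streamlit`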

In [None]:
import os

# Diagnostic cell: print the names of every entry in the current working
# directory (useful for confirming that local assets are present).
print(os.listdir(os.curdir))

## Setup and Imports

In [None]:
# Import required libraries
import warnings

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from snowflake.ml.modeling.ensemble import RandomForestClassifier
from snowflake.ml.modeling.linear_model import LogisticRegression
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import OneHotEncoder, StandardScaler
from snowflake.ml.registry import Registry
from snowflake.snowpark import Session

# Silence every Python warning for the remainder of the kernel session.
# This keeps cell output compact, but it also hides any warning that later
# cells would otherwise surface.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion.
print("✅ Libraries imported successfully")

In [None]:
# Get current session
# Obtained through the Snowpark session builder's getOrCreate().
session = Session.builder.getOrCreate()

# Set context
# Every later cell relies on this database / schema / warehouse being current.
session.use_database("ORIGENCE_INTELLIGENCE")
session.use_schema("ML_MODELS")
session.use_warehouse("ORIGENCE_WH")

# Echo the context actually in effect so a misconfiguration is visible early.
print("✅ Session configured")
print(f"Database: {session.get_current_database()}")
print(f"Schema: {session.get_current_schema()}")
print(f"Warehouse: {session.get_current_warehouse()}")

In [None]:
# Initialize Model Registry
# The registry is bound explicitly to ORIGENCE_INTELLIGENCE.ML_MODELS; the
# registration cells for all three models log into this object.
registry = Registry(
    session=session,
    database_name="ORIGENCE_INTELLIGENCE",
    schema_name="ML_MODELS"
)

print("✅ Model Registry initialized")

---
## Model 1: Loan Default Risk Predictor

**Objective**: Predict likelihood of loan default  
**Labels**: 0=Low Risk, 1=Medium Risk, 2=High Risk, 3=Critical Risk  
**Algorithm**: Random Forest Classifier  
**Features**: Loan amount, term, interest rate, credit score, DTI, LTV, payment history

In [None]:
# Load loan default feature data
# Source is the V_LOAN_DEFAULT_FEATURES view in the ANALYTICS schema.
default_df = session.table("ORIGENCE_INTELLIGENCE.ANALYTICS.V_LOAN_DEFAULT_FEATURES")

# Report the row count, then preview up to 5 rows.
# NOTE(review): the preview prints raw feature rows into the notebook output —
# confirm nothing sensitive is displayed before sharing the notebook.
print(f"✅ Loaded {default_df.count()} records for default prediction")
default_df.show(5)

In [None]:
# 80/20 train/test split with a fixed seed. The LOAN_ID identifier column is
# removed from both partitions because it is not needed for training.
train_default, test_default = (
    partition.drop("LOAN_ID")
    for partition in default_df.random_split([0.8, 0.2], seed=42)
)

# Report the size of each partition.
print(f"Training set: {train_default.count()} records")
print(f"Test set: {test_default.count()} records")

In [None]:
# Speed-oriented default-risk pipeline (target: <10s execution).
# Kept deliberately simple: few trees, shallow depth, no feature scaling.

# Step 1: one-hot encode LOAN_TYPE, dropping the raw column afterwards and
# ignoring categories that were not seen during fitting.
default_encoder_step = (
    "Encoder",
    OneHotEncoder(
        input_cols=["LOAN_TYPE"],
        output_cols=["LOAN_TYPE_ENC"],
        drop_input_cols=True,
        handle_unknown="ignore",
    ),
)

# Step 2: small random forest that writes its prediction to PREDICTED_RISK.
default_classifier_step = (
    "Classifier",
    RandomForestClassifier(
        label_cols=["DEFAULT_RISK_LABEL"],
        output_cols=["PREDICTED_RISK"],
        n_estimators=3,
        max_depth=3,
        random_state=42,
    ),
)

default_pipeline = Pipeline([default_encoder_step, default_classifier_step])

print("✅ Default prediction pipeline created (optimized for speed)")

In [None]:
# Train the default prediction model
# fit() runs the pipeline's steps (encoder, then classifier) on the training
# split; the prints before and after make progress visible in the output.
print("Training default prediction model...")
default_pipeline.fit(train_default)
print("✅ Default prediction model trained")

In [None]:
# Evaluate model on the held-out test set.
# accuracy_score and classification_report are imported in the notebook's
# import cell, so the evaluation cells do not depend on each other.
test_predictions = default_pipeline.predict(test_default)
test_results = test_predictions.select("DEFAULT_RISK_LABEL", "PREDICTED_RISK").to_pandas()

accuracy = accuracy_score(test_results['DEFAULT_RISK_LABEL'], test_results['PREDICTED_RISK'])

print(f"Test Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
# Label codes are passed explicitly so the report always has one row per risk
# level, even when a level is absent from both the test labels and the
# predictions. Assumes DEFAULT_RISK_LABEL holds the numeric codes 0-3
# documented for this model (0=Low, 1=Medium, 2=High, 3=Critical).
print(classification_report(
    test_results['DEFAULT_RISK_LABEL'],
    test_results['PREDICTED_RISK'],
    labels=[0, 1, 2, 3],
    target_names=['Low Risk', 'Medium Risk', 'High Risk', 'Critical Risk']
))

In [None]:
# Register model in Model Registry
# Drop label column from sample data - model signature should only include features
# At most 100 training rows are passed as the signature sample.
sample_data = train_default.drop("DEFAULT_RISK_LABEL").limit(100)

# Log the fitted pipeline under a fixed model name, targeting warehouse execution.
# NOTE(review): no version_name is passed — confirm how re-running this cell
# is expected to version the model.
registry.log_model(
    model=default_pipeline,
    model_name="LOAN_DEFAULT_PREDICTOR",
    target_platforms=['WAREHOUSE'],
    sample_input_data=sample_data,
    comment="Predicts loan default risk with 4 risk levels (Low/Medium/High/Critical)"
)

print("✅ LOAN_DEFAULT_PREDICTOR registered in Model Registry")

---
## Model 2: Loan Approval Predictor

**Objective**: Predict loan application approval likelihood  
**Labels**: 0=Likely Deny, 1=Needs Review, 2=Likely Approve  
**Algorithm**: Logistic Regression  
**Features**: Credit score, income, DTI, employment, LTV, collateral

In [None]:
# Load loan approval feature data
# Source is the V_LOAN_APPROVAL_FEATURES view in the ANALYTICS schema.
approval_df = session.table("ORIGENCE_INTELLIGENCE.ANALYTICS.V_LOAN_APPROVAL_FEATURES")

# Report the row count, then preview up to 5 rows.
# NOTE(review): the preview prints raw feature rows into the notebook output —
# confirm nothing sensitive is displayed before sharing the notebook.
print(f"✅ Loaded {approval_df.count()} records for approval prediction")
approval_df.show(5)

In [None]:
# 80/20 train/test split with a fixed seed. The APPLICATION_ID identifier
# column is removed from both partitions before training.
train_approval, test_approval = (
    partition.drop("APPLICATION_ID")
    for partition in approval_df.random_split([0.8, 0.2], seed=42)
)

# Report the size of each partition.
print(f"Training set: {train_approval.count()} records")
print(f"Test set: {test_approval.count()} records")

In [None]:
# Speed-oriented approval pipeline (target: <10s execution).
# Logistic regression with a capped iteration count and no feature scaling.

# Step 1: one-hot encode LOAN_TYPE, dropping the raw column afterwards and
# ignoring categories that were not seen during fitting.
approval_encoder_step = (
    "Encoder",
    OneHotEncoder(
        input_cols=["LOAN_TYPE"],
        output_cols=["LOAN_TYPE_ENC"],
        drop_input_cols=True,
        handle_unknown="ignore",
    ),
)

# Step 2: classifier that writes its prediction to PREDICTED_APPROVAL.
approval_classifier_step = (
    "Classifier",
    LogisticRegression(
        label_cols=["APPROVAL_LABEL"],
        output_cols=["PREDICTED_APPROVAL"],
        max_iter=100,
    ),
)

approval_pipeline = Pipeline([approval_encoder_step, approval_classifier_step])

print("✅ Approval prediction pipeline created (optimized for speed)")

In [None]:
# Train the approval prediction model
# fit() runs the pipeline's steps (encoder, then classifier) on the training
# split; the prints before and after make progress visible in the output.
print("Training approval prediction model...")
approval_pipeline.fit(train_approval)
print("✅ Approval prediction model trained")

In [None]:
# Evaluate model on the held-out test set
test_predictions = approval_pipeline.predict(test_approval)
test_results = test_predictions.select("APPROVAL_LABEL", "PREDICTED_APPROVAL").to_pandas()

accuracy = accuracy_score(test_results['APPROVAL_LABEL'], test_results['PREDICTED_APPROVAL'])

print(f"Test Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
# Label codes are passed explicitly: with target_names alone,
# classification_report raises when an outcome is absent from both the test
# labels and the predictions. Assumes APPROVAL_LABEL holds the numeric codes
# 0-2 documented for this model (0=Likely Deny, 1=Needs Review, 2=Likely Approve).
print(classification_report(
    test_results['APPROVAL_LABEL'],
    test_results['PREDICTED_APPROVAL'],
    labels=[0, 1, 2],
    target_names=['Likely Deny', 'Needs Review', 'Likely Approve']
))

In [None]:
# Register model
# Drop label column from sample data - model signature should only include features
# At most 100 training rows are passed as the signature sample.
sample_data = train_approval.drop("APPROVAL_LABEL").limit(100)

# Log the fitted pipeline under a fixed model name, targeting warehouse execution.
# NOTE(review): no version_name is passed — confirm how re-running this cell
# is expected to version the model.
registry.log_model(
    model=approval_pipeline,
    model_name="LOAN_APPROVAL_PREDICTOR",
    target_platforms=['WAREHOUSE'],
    sample_input_data=sample_data,
    comment="Predicts loan approval likelihood with 3 outcomes (Deny/Review/Approve)"
)

print("✅ LOAN_APPROVAL_PREDICTOR registered in Model Registry")

---
## Model 3: Fraud Detection Model

**Objective**: Detect potentially fraudulent loan applications  
**Labels**: 0=Clean, 1=Suspicious, 2=High Risk  
**Algorithm**: Random Forest Classifier  
**Features**: Application velocity, income verification, credit anomalies, DTI flags

In [None]:
# Load fraud detection feature data
# Source is the V_FRAUD_DETECTION_FEATURES view in the ANALYTICS schema.
fraud_df = session.table("ORIGENCE_INTELLIGENCE.ANALYTICS.V_FRAUD_DETECTION_FEATURES")

# Report the row count, then preview up to 5 rows.
# NOTE(review): the preview prints raw feature rows into the notebook output —
# confirm nothing sensitive is displayed before sharing the notebook.
print(f"✅ Loaded {fraud_df.count()} records for fraud detection")
fraud_df.show(5)

In [None]:
# 80/20 train/test split with a fixed seed. The APPLICATION_ID identifier
# column is removed from both partitions before training.
train_fraud, test_fraud = (
    partition.drop("APPLICATION_ID")
    for partition in fraud_df.random_split([0.8, 0.2], seed=42)
)

# Report the size of each partition.
print(f"Training set: {train_fraud.count()} records")
print(f"Test set: {test_fraud.count()} records")

In [None]:
# Speed-oriented fraud-detection pipeline (target: <10s execution).
# Kept deliberately simple: few trees, shallow depth, no feature scaling.

# Step 1: one-hot encode LOAN_TYPE, dropping the raw column afterwards and
# ignoring categories that were not seen during fitting.
fraud_encoder_step = (
    "Encoder",
    OneHotEncoder(
        input_cols=["LOAN_TYPE"],
        output_cols=["LOAN_TYPE_ENC"],
        drop_input_cols=True,
        handle_unknown="ignore",
    ),
)

# Step 2: small random forest that writes its prediction to PREDICTED_FRAUD_RISK.
fraud_classifier_step = (
    "Classifier",
    RandomForestClassifier(
        label_cols=["FRAUD_RISK_LABEL"],
        output_cols=["PREDICTED_FRAUD_RISK"],
        n_estimators=3,
        max_depth=3,
        random_state=42,
    ),
)

fraud_pipeline = Pipeline([fraud_encoder_step, fraud_classifier_step])

print("✅ Fraud detection pipeline created (optimized for speed)")

In [None]:
# Train the fraud detection model
# fit() runs the pipeline's steps (encoder, then classifier) on the training
# split; the prints before and after make progress visible in the output.
print("Training fraud detection model...")
fraud_pipeline.fit(train_fraud)
print("✅ Fraud detection model trained")

In [None]:
# Evaluate model on the held-out test set
test_predictions = fraud_pipeline.predict(test_fraud)
test_results = test_predictions.select("FRAUD_RISK_LABEL", "PREDICTED_FRAUD_RISK").to_pandas()

accuracy = accuracy_score(test_results['FRAUD_RISK_LABEL'], test_results['PREDICTED_FRAUD_RISK'])

print(f"Test Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
# Label codes are passed explicitly: with target_names alone,
# classification_report raises when a risk level is absent from both the test
# labels and the predictions. Assumes FRAUD_RISK_LABEL holds the numeric codes
# 0-2 documented for this model (0=Clean, 1=Suspicious, 2=High Risk).
print(classification_report(
    test_results['FRAUD_RISK_LABEL'],
    test_results['PREDICTED_FRAUD_RISK'],
    labels=[0, 1, 2],
    target_names=['Clean', 'Suspicious', 'High Risk']
))

In [None]:
# Register model
# Drop label column from sample data - model signature should only include features
# At most 100 training rows are passed as the signature sample.
sample_data = train_fraud.drop("FRAUD_RISK_LABEL").limit(100)

# Log the fitted pipeline under a fixed model name, targeting warehouse execution.
# NOTE(review): no version_name is passed — confirm how re-running this cell
# is expected to version the model.
registry.log_model(
    model=fraud_pipeline,
    model_name="FRAUD_DETECTION_MODEL",
    target_platforms=['WAREHOUSE'],
    sample_input_data=sample_data,
    comment="Detects fraudulent applications with 3 risk levels (Clean/Suspicious/High Risk)"
)

print("✅ FRAUD_DETECTION_MODEL registered in Model Registry")

---
## Summary and Verification

In [None]:
# List all registered models and verify that the three expected ones exist.
# The schema is fully qualified (matching the registry's database/schema) so
# the listing does not depend on the session's current database.
models = session.sql("SHOW MODELS IN SCHEMA ORIGENCE_INTELLIGENCE.ML_MODELS").collect()

print("\n" + "="*80)
print("REGISTERED MODELS")
print("="*80)
for model in models:
    print(f"✅ {model['name']}")

# Compare case-insensitively against the names used by the registration cells.
expected_models = (
    "LOAN_DEFAULT_PREDICTOR",
    "LOAN_APPROVAL_PREDICTOR",
    "FRAUD_DETECTION_MODEL",
)
registered_names = {str(row['name']).upper() for row in models}
missing_models = [name for name in expected_models if name not in registered_names]

print("\n" + "="*80)
if missing_models:
    # Do not claim success when a model is absent from the registry listing.
    print("MODEL TRAINING INCOMPLETE")
    print("="*80)
    print(f"\n⚠️ Not found in Model Registry: {', '.join(missing_models)}")
    print("Re-run the training and registration cells for the missing models before continuing.")
else:
    print("MODEL TRAINING COMPLETE")
    print("="*80)
    print("\n3 ML models successfully trained and registered:")
    print("1. LOAN_DEFAULT_PREDICTOR - Predicts default risk (4 classes)")
    print("2. LOAN_APPROVAL_PREDICTOR - Predicts approval likelihood (3 classes)")
    print("3. FRAUD_DETECTION_MODEL - Detects fraud risk (3 classes)")
    print("\nNext steps:")
    print("1. Run origence_07_model_wrapper_functions.sql to create SQL procedures")
    print("2. Run origence_08_create_intelligence_agent.sql to configure agent")
    print("3. Test agent with sample questions from origence_questions.md")