# ❄️ End-to-end ML OPs Demo ❄️
In this workflow we will work through the following
- Use Feature Store to track engineered features
    - Store feature definitions in feature store for reproducible computation of ML features
- Train two SnowML Models
    - Xgboost with tree booster
    - Xgboost tuned with distributed grid-search hyperparameter optimization
- Register both models in Snowflake model registry
    - Explore model registry capabilities such as metadata tracking, inference, and explainability
- Set up Model Monitor to track 1 year of predicted and actual loan repayments
    - Compute performance metrics such as F1, Precision, Recall
    - Inspect model drift (i.e. how much has the average predicted repayment rate changed day-to-day)
    - Compare models side-by-side to understand which model should be used in production
    - Identify and understand data issues
- Track data and model lineage throughout
    - View and understand
      - The origin of the data used for computed features
      - The data used for model training
      - The available model versions being monitored

In [None]:
!pip install shap

In [None]:
# Suffix/label reused below for the feature view version and for the dataset, model and monitoring table names
VERSION_NUM = 'gartner_demo'

In [1]:
import pandas as pd
import numpy as np
import sklearn
import math
import pickle
import datetime
import shap

# Snowpark ML
from snowflake.ml.modeling.xgboost import XGBRegressor, XGBClassifier
from snowflake.ml._internal.utils import identifier
from snowflake.ml.registry import Registry

#Snowflake feature store
from snowflake.ml.feature_store import FeatureStore, FeatureView, Entity, CreationMode

# Snowpark session
from snowflake.snowpark import DataFrame
from snowflake.snowpark.functions import col, to_timestamp, min, max, month, dayofmonth, dayofweek, dayofyear, avg, median, lag, sum 
from snowflake.snowpark.types import IntegerType
from snowflake.snowpark import Window




# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()
session

In [None]:
# Load the demo data from the Snowflake table; if reading it fails (e.g. the table does
# not exist yet), upload the bundled CSV to create the table and read it again.
try:
    print("Reading table data...")
    df = session.table("MORTGAGE_LENDING_DEMO_DATA")
    # show() runs a query, so a missing table surfaces inside this try block
    df.show(5)
except Exception:
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt/SystemExit propagate
    print("Table not found! Uploading data to snowflake table")
    df_pandas = pd.read_csv("MORTGAGE_LENDING_DEMO_DATA.csv.zip")
    session.write_pandas(df_pandas, "MORTGAGE_LENDING_DEMO_DATA", auto_create_table=True)
    df = session.table("MORTGAGE_LENDING_DEMO_DATA")
    df.show(5)

In [None]:
df.select(min('TS'), max('TS'))

In [None]:
#Create a dict with keys for feature names and values containing transform code
# Each value is a Snowpark column expression; all of them are applied to df in one with_columns call below.
# Later entries refer to earlier ones by name (e.g. MONTH is derived from TIMESTAMP).

feature_eng_dict = dict()

#Timestamp features
feature_eng_dict["TIMESTAMP"] = to_timestamp("TS")
feature_eng_dict["MONTH"] = month("TIMESTAMP")
feature_eng_dict["DAY_OF_YEAR"] = dayofyear("TIMESTAMP") 
feature_eng_dict["DOTW"] = dayofweek("TIMESTAMP")

#Income and loan features
# The source columns are named *_000s, so they are multiplied by 1000 here
feature_eng_dict["LOAN_AMOUNT"] = col("LOAN_AMOUNT_000s")*1000
feature_eng_dict["INCOME"] = col("APPLICANT_INCOME_000s")*1000
feature_eng_dict["INCOME_LOAN_RATIO"] = col("INCOME")/col("LOAN_AMOUNT")

# Median income per county, and a 0/1 flag for applicants above their county's median
county_window_spec = Window.partition_by("COUNTY_NAME")
feature_eng_dict["MEDIAN_COUNTY_INCOME"] = median("INCOME").over(county_window_spec)
feature_eng_dict["HIGH_INCOME_FLAG"] = (col("INCOME")>col("MEDIAN_COUNTY_INCOME")).astype(IntegerType())

# NOTE(review): rows_between(-30, 0) is a row-based frame (current row plus the 30 preceding rows
# when ordered by DAY_OF_YEAR), not a 30-calendar-day range — confirm this matches the feature name.
day_window_spec = Window.order_by("DAY_OF_YEAR").rows_between(-30,0)
feature_eng_dict["AVG_THIRTY_DAY_LOAN_AMOUNT"] =  avg("LOAN_AMOUNT").over(day_window_spec)

# Add every engineered column to df, using the dict keys as column names
df = df.with_columns(feature_eng_dict.keys(), feature_eng_dict.values())
df.show(3)

In [None]:
# explain() prints the query plan itself and returns nothing, so wrapping it in print()
# only adds a stray "None" line to the output.
df.explain()

In [None]:
# Open the feature store (created if it does not exist) in the session's current database,
# using the current schema name as the feature store name and the current warehouse as its default.
fs = FeatureStore(
    session=session, 
    database=session.get_current_database(), 
    name=session.get_current_schema(), 
    default_warehouse=session.get_current_warehouse(),
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST
)

In [None]:
fs.list_entities()

In [None]:
#First try to retrieve an existing entity definition, if not define a new one and register
try:
    #retrieve existing entity
    loan_id_entity = fs.get_entity('LOAN_ENTITY') 
    print('Retrieved existing entity')
except Exception:
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt/SystemExit propagate
    #define new entity, keyed on LOAN_ID
    loan_id_entity = Entity(
        name = "LOAN_ENTITY",
        join_keys = ["LOAN_ID"],
        desc = "Features defined on a per loan level - expanded FE")
    #register
    fs.register_entity(loan_id_entity)
    print("Registered new entity")

In [None]:
# Keep only the join key (LOAN_ID) plus the engineered columns (which include TIMESTAMP);
# this frame is what the feature view is defined on.
engineered_cols = list(feature_eng_dict)
feature_df = df.select(["LOAN_ID", *engineered_cols])
feature_df.show(5)

In [None]:
feature_df.explain()

In [None]:
#define and register feature view
# The view is keyed by the loan entity, built from feature_df, uses TIMESTAMP as its
# timestamp column and is given a refresh frequency of "1 day".
loan_fv = FeatureView(
    name="Mortgage_Feature_View",
    entities=[loan_id_entity],
    feature_df=feature_df,
    timestamp_col="TIMESTAMP",
    refresh_freq="1 day")

# Register under the notebook's version label; overwrite=True replaces an existing registration of that version
loan_fv = fs.register_feature_view(loan_fv, version=VERSION_NUM, overwrite=True)

In [None]:
fs.list_feature_views().limit(3)

In [None]:
import streamlit as st

# Look up the current organization, account, database and schema (first column of the first row of each query)
org_name = session.sql('SELECT CURRENT_ORGANIZATION_NAME()').collect()[0][0]
account_name = session.sql('SELECT CURRENT_ACCOUNT_NAME()').collect()[0][0]
db_name = session.sql('SELECT CURRENT_DATABASE()').collect()[0][0]
schema_name = session.sql('SELECT CURRENT_SCHEMA()').collect()[0][0]

# Render a link to this feature store's page in the Snowflake web app
st.write(f'https://app.snowflake.com/{org_name}/{account_name}/#/features/database/{db_name}/store/{schema_name}')

In [None]:
# Generate a versioned dataset: the spine supplies the loan IDs, timestamps, LOAN_PURPOSE_NAME and
# the MORTGAGERESPONSE label; the feature view's columns are fetched for those spine rows.
ds = fs.generate_dataset(
    name=f"MORTGAGE_DATASET_EXTENDED_FEATURES_{VERSION_NUM}",
    spine_df=df.select("LOAN_ID", "TIMESTAMP", "LOAN_PURPOSE_NAME","MORTGAGERESPONSE"), #only need the features used to fetch rest of feature view
    features=[loan_fv],
    spine_timestamp_col="TIMESTAMP",
    spine_label_cols=["MORTGAGERESPONSE"]
)

In [None]:
ds_sp = ds.read.to_snowpark_dataframe()
ds_sp.show(5)

In [None]:
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.snowpark.types import StringType

# Every string-typed column in the dataset is treated as a categorical to one-hot encode
string_fields = [field.name for field in ds_sp.schema if field.datatype == StringType()]
OHE_COLS = ds_sp.select(string_fields).columns
# Suffixed names are built here but not passed to the encoder below
OHE_POST_COLS = [f"{name}_OHE" for name in OHE_COLS]


# Encode categoricals to numeric columns, dropping the original string columns
snowml_ohe = snowml.OneHotEncoder(input_cols=OHE_COLS, output_cols=OHE_COLS, drop_input_cols=True)
snowml_ohe.fit(ds_sp)
ds_sp_ohe = snowml_ohe.transform(ds_sp)
ds_sp_ohe.columns

In [None]:
train, test = ds_sp_ohe.random_split(weights=[0.70, 0.30], seed=0)

In [None]:
train = train.fillna(0)
test = test.fillna(0)

In [None]:
#save train and test data out to temporary snowflake tables
# mode="overwrite" keeps this cell re-runnable: without an explicit mode the write fails
# when the temporary table already exists from an earlier run in the same session.
train.write.save_as_table(table_name = "train_temp", mode="overwrite", table_type="temporary")
test.write.save_as_table(table_name = "test_temp", mode="overwrite", table_type="temporary")

In [None]:
train_pd = train.to_pandas()
test_pd = test.to_pandas()

In [None]:
from snowflake.ml.modeling.xgboost import XGBClassifier

# Baseline classifier: every column except the timestamp, ID and label is an input feature,
# MORTGAGERESPONSE is the label, and predictions are written to MORTGAGE_PREDICTION.
snow_xgb_base = XGBClassifier(
    input_cols=train.drop(["TIMESTAMP", "LOAN_ID", "MORTGAGERESPONSE"]).columns,
    label_cols=train.select("MORTGAGERESPONSE").columns,
    output_cols="MORTGAGE_PREDICTION",
    max_depth=50,
    n_estimators=3,
    learning_rate = 0.75,
    booster = 'gbtree')

In [None]:
snow_xgb_base.fit(train)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
# Score the pandas copy of the training data with the underlying xgboost model,
# passing only the feature columns (as a raw array)
train_preds_base = snow_xgb_base.to_xgboost().predict(train_pd.drop(["TIMESTAMP", "LOAN_ID", "MORTGAGERESPONSE"],axis=1).values)

# Training-set F1, precision and recall of the base model against the true labels
f1_base_train = f1_score(train_pd.MORTGAGERESPONSE, train_preds_base)
precision_base_train = precision_score(train_pd.MORTGAGERESPONSE, train_preds_base)
recall_base_train = recall_score(train_pd.MORTGAGERESPONSE, train_preds_base)

print(f'F1: {f1_base_train} \nPrecision {precision_base_train} \nRecall: {recall_base_train}')

In [None]:
# KPI: average loan loss per model / data set.
## Sum the loan amounts of the false positives (rows the model predicts as 1 whose actual
## MORTGAGERESPONSE is 0) and average that total over every row in the data set.
def compute_financial_losses(model, df):
    """Return the false-positive loan amount averaged over all rows, formatted as "$<value>".

    model: object whose to_xgboost() returns something with a predict() method.
    df: pandas DataFrame with TIMESTAMP, LOAN_ID, MORTGAGERESPONSE and LOAN_AMOUNT
        columns plus the model's feature columns. It is copied, not modified.
    """
    scored = df.copy()
    # Everything except the timestamp, ID and label is fed to the model as a raw array
    feature_matrix = scored.drop(["TIMESTAMP", "LOAN_ID", "MORTGAGERESPONSE"], axis=1).values
    scored['prediction'] = model.to_xgboost().predict(feature_matrix)
    is_false_positive = (scored.MORTGAGERESPONSE == 0) & (scored.prediction == 1)
    total_loss = scored.loc[is_false_positive, "LOAN_AMOUNT"].sum()
    return f"${np.around(total_loss / scored.shape[0], 2)}"


# Apply the loss KPI to the base model on the pandas train and test sets
base_train_losses = compute_financial_losses(snow_xgb_base, train_pd)
base_test_losses = compute_financial_losses(snow_xgb_base, test_pd)
print(f"Average loss for train set - {base_train_losses}")
print(f"Average loss for test set - {base_test_losses}")

# Model Registry

In [None]:
#Create a snowflake model registry object 
from snowflake.ml.registry import Registry
from snowflake.ml._internal.utils import identifier
from snowflake.ml.model import model_signature

# Current database/schema names, passed through the library's _get_unescaped_name helper
# (presumably strips identifier quoting — TODO confirm; note this is a private, underscore-prefixed API)
db = identifier._get_unescaped_name(session.get_current_database())
schema = identifier._get_unescaped_name(session.get_current_schema())


# Define model name
model_name = f"MORTGAGE_LENDING_MLOPS_{VERSION_NUM}"

# Create a registry to log the model to, with the enable_monitoring option turned on
model_registry = Registry(session=session, 
                          database_name=db, 
                          schema_name=schema,
                          options={"enable_monitoring": True})

In [None]:
#Deploy the base model to the model registry
base_version_name = 'XGB_BASE'

# Reuse the version if it is already registered; otherwise log it and attach the training metrics.
try:
    mv_base = model_registry.get_model(model_name).version(base_version_name)
    print("Found existing model version!")
except Exception:
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt/SystemExit propagate
    print("Logging new model version...")
    mv_base = model_registry.log_model(
        model_name=model_name,
        model=snow_xgb_base, 
        version_name=base_version_name,
        comment = """ML model for predicting loan approval likelihood.
                    This model was trained using Snowflake ML xgboost classifier.
                    Hyperparameters used were:
                    max_depth=50, n_estimators=3, learning_rate = 0.75, algorithm = gbtree.
                    Special considerations and limitations: None.
                    """,
    )
    # Training metrics are only recorded when the version is newly logged
    mv_base.set_metric(metric_name="Train_F1_Score", value=f1_base_train)
    mv_base.set_metric(metric_name="Train_Precision_Score", value=precision_base_train)
    mv_base.set_metric(metric_name="Train_Recall_score", value=recall_base_train)

In [None]:
CREATE OR REPLACE TAG DEV;

In [None]:
# Set the DEV tag on the model to the base version's name, and give the model a description
m = model_registry.get_model(model_name)
m.set_tag("DEV", base_version_name)
m.comment = "Loan approval prediction models."

In [None]:
model_registry.show_models()

In [None]:
model_registry.get_model(model_name).show_versions()

In [None]:
print(mv_base)
print(mv_base.show_metrics())

In [None]:
mv_base.show_functions()

In [None]:
reg_preds = mv_base.run(test, function_name = "predict")
reg_preds.show(10)

In [None]:
# Pull the labels and registry-served predictions into pandas and compute test-set metrics
preds_pd = reg_preds.select("MORTGAGERESPONSE", "MORTGAGE_PREDICTION").to_pandas()
f1_base_test = f1_score(preds_pd.MORTGAGERESPONSE, preds_pd.MORTGAGE_PREDICTION)
precision_base_test = precision_score(preds_pd.MORTGAGERESPONSE, preds_pd.MORTGAGE_PREDICTION)
recall_base_test = recall_score(preds_pd.MORTGAGERESPONSE, preds_pd.MORTGAGE_PREDICTION)

#log metrics to model registry model
mv_base.set_metric(metric_name="Test_F1_Score", value=f1_base_test)
mv_base.set_metric(metric_name="Test_Precision_Score", value=precision_base_test)
mv_base.set_metric(metric_name="Test_Recall_score", value=recall_base_test)

print(f'F1: {f1_base_test} \nPrecision {precision_base_test} \nRecall: {recall_base_test}')

# Oh no! Our model's performance seems to have dropped off significantly from training to our test set. 
## This is evidence that our model is overfit - can we fix this with Distributed Hyperparameter Optimization??

In [None]:
from snowflake.ml.modeling.model_selection.grid_search_cv import GridSearchCV

#Define our hyperparameter grid (3 x 2 x 3 = 18 combinations)
# NOTE(review): this comment previously said max_depth of 100 is skipped as the likely culprit
# for overfitting, but the grid below still includes 100 — confirm which is intended.
hyper_param_grid = dict(
            max_depth= [10, 50, 100],
            learning_rate = [0.5, 0.75],
            n_estimators= [1,5,10]
    
        )

#Define the grid search model: 5-fold cross-validation scored by F1, using the same
#input/label/output columns as the base model; refit=True is passed so a final best model is available
grid_search_model = GridSearchCV(estimator=XGBClassifier(), 
                   param_grid=hyper_param_grid, 
                   cv=5, 
                   input_cols=train.drop(["TIMESTAMP", "LOAN_ID", "MORTGAGERESPONSE"]).columns,
                   label_cols=train.select("MORTGAGERESPONSE").columns,
                   output_cols="MORTGAGE_PREDICTION",
                   scoring="f1",
                   refit=True)
#Train the model
grid_search_model.fit(train)

#Print out the best params
grid_search_model.to_sklearn().best_params_

In [None]:
# Map each pandas column name to the Snowpark column name at the same position,
# so the pandas frames can be renamed before prediction
rename_dict = dict(zip(train_pd.columns, train.columns))

# Score the training features with the best estimator found by the grid search
best_estimator = grid_search_model.to_sklearn().best_estimator_
train_features = train_pd.drop(["TIMESTAMP", "LOAN_ID", "MORTGAGERESPONSE"], axis=1).rename(columns=rename_dict)
xgb_opt_preds = best_estimator.predict(train_features)

# Training-set performance of the tuned model
true_train_labels = train_pd.MORTGAGERESPONSE
f1_opt_train = f1_score(true_train_labels, xgb_opt_preds)
precision_opt_train = precision_score(true_train_labels, xgb_opt_preds)
recall_opt_train = recall_score(true_train_labels, xgb_opt_preds)

print(f'F1: {f1_opt_train} \nPrecision {precision_opt_train} \nRecall: {recall_opt_train}')

In [None]:
#Generate test predictions with the grid search's best estimator (feature columns only, renamed via rename_dict)
xgb_opt_preds_test = grid_search_model.to_sklearn().best_estimator_.predict(test_pd.drop(["TIMESTAMP", "LOAN_ID", "MORTGAGERESPONSE"],axis=1).rename(columns=rename_dict))

#Generate performance metrics on test data
f1_opt_test = f1_score(test_pd.MORTGAGERESPONSE, xgb_opt_preds_test)
precision_opt_test = precision_score(test_pd.MORTGAGERESPONSE, xgb_opt_preds_test)
recall_opt_test = recall_score(test_pd.MORTGAGERESPONSE, xgb_opt_preds_test)

print(f'F1: {f1_opt_test} \nPrecision {precision_opt_test} \nRecall: {recall_opt_test}')

# Here we see the HPO model has a more modest train accuracy than our base model - but the performance doesn't drop off during testing

In [None]:
#Log the optimized model to the model registry
optimized_version_name = 'XGB_Optimized'

# Reuse the version if it is already registered; otherwise log it and attach train/test metrics.
try:
    mv_opt = model_registry.get_model(model_name).version(optimized_version_name)
    print("Found existing model version!")
except Exception:
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt/SystemExit propagate
    print("Logging new model version...")
    mv_opt = model_registry.log_model(
        model_name=model_name,
        model=grid_search_model, 
        version_name=optimized_version_name,
        comment = """ML model for predicting loan approval likelihood.
                    This model was optimized using GridSearch Hyper parameter optimization. 
                    It generalizes better on new data than the base model. 
                    Special considerations and limitations: None.
                    """,
    )
    # Metrics are only recorded when the version is newly logged
    mv_opt.set_metric(metric_name="Train_F1_Score", value=f1_opt_train)
    mv_opt.set_metric(metric_name="Train_Precision_Score", value=precision_opt_train)
    mv_opt.set_metric(metric_name="Train_Recall_score", value=recall_opt_train)

    mv_opt.set_metric(metric_name="Test_F1_Score", value=f1_opt_test)
    mv_opt.set_metric(metric_name="Test_Precision_Score", value=precision_opt_test)
    mv_opt.set_metric(metric_name="Test_Recall_score", value=recall_opt_test)
In [None]:
#Here we see the BASE version is our default version
model_registry.get_model(model_name).default

In [None]:
#Now we'll set the optimized model to be the default model version going forward
model_registry.get_model(model_name).default = optimized_version_name

In [None]:
#Now we see our optimized version we have now recently promoted to our DEFAULT model version
model_registry.get_model(model_name).default

In [None]:
CREATE OR REPLACE TAG "PROD";

In [None]:
m = model_registry.get_model(model_name)
m.set_tag("PROD", optimized_version_name)

In [None]:
from snowflake import snowpark
from snowflake.snowpark.types import StringType, IntegerType
from snowflake.snowpark.functions import col, to_timestamp, min, max, month, dayofmonth, dayofweek, dayofyear, avg, median, lag, sum 


def compute_loan_losses(session: snowpark.Session, table_name: str, modelname: str, modelversion: str) -> str:
    """Compute the average false-positive loan loss for a model version and log it as a metric.

    Runs the model version's ``predict`` function over ``table_name``, sums LOAN_AMOUNT
    for rows predicted 1 whose actual MORTGAGERESPONSE is 0, and divides by the table's
    row count.

    Args:
        session: Snowpark session used for the registry and the table read.
        table_name: Table to score; the text before its first underscore becomes the
            metric-name suffix (e.g. ``train_temp`` -> ``avg_loan_losses_train``).
        modelname: Name of the registered model to load.
        modelversion: Version of that model to run.

    Returns:
        The average loss rounded to 2 decimals and formatted as ``"$<value>"``; this
        same string is stored as the metric value.
    """
    from snowflake.snowpark.functions import col, sum 

    reg = Registry(session=session)
    # Use the `modelname` argument; the original read the notebook-level `model_name`
    # global, so the parameter passed to the procedure was silently ignored.
    mv = reg.get_model(modelname).version(modelversion)

    # Read the input table and score it
    df = session.table(table_name)
    results = mv.run(df, function_name="predict")

    # False positives: predicted 1 but the actual response is 0
    false_positives = results.filter((col("MORTGAGERESPONSE") == 0) & (col("MORTGAGE_PREDICTION") == 1))

    # SUM over zero rows comes back as NULL/None, so fall back to 0 when there are no false positives
    total_loss = false_positives.select(sum(col("LOAN_AMOUNT"))).collect()[0][0] or 0
    row_count = df.count()
    # Guard against an empty input table instead of dividing by zero
    avg_losses = round(total_loss / row_count, 2) if row_count else 0
    avg_losses = f"${avg_losses}"

    # split('_')[0] keeps the whole name when it has no underscore; slicing with
    # find('_') == -1 would have dropped the last character instead.
    mv.set_metric(metric_name=f"avg_loan_losses_{table_name.split('_')[0]}", value=avg_losses)
    return avg_losses

# Register the stored procedure
# The permanent procedure is uploaded to @ML_STAGE, so make sure the stage exists first;
# elsewhere in this notebook the stage is only created in a later cell.
session.sql("CREATE STAGE IF NOT EXISTS ML_STAGE").collect()
session.sproc.register(
    func=compute_loan_losses,
    name="compute_loan_losses_sproc",
    replace=True,
    is_permanent=True,
    stage_location="@ML_STAGE",
    packages=['snowflake-snowpark-python', 'snowflake-ml-python'],
    return_type=StringType()
)

In [None]:
SHOW PROCEDURES like '%COMPUTE_LOAN_LOSSES%'

In [None]:
CALL compute_loan_losses_sproc('train_temp','{{model_name}}', '{{base_version_name}}');
CALL compute_loan_losses_sproc('test_temp','{{model_name}}', '{{base_version_name}}');
CALL compute_loan_losses_sproc('train_temp','{{model_name}}', '{{optimized_version_name}}');
CALL compute_loan_losses_sproc('test_temp','{{model_name}}', '{{optimized_version_name}}');

In [None]:
mv_base.show_metrics()

In [None]:
mv_opt.show_metrics()

## Explainability

In [None]:
# Take a reproducible 1000-row sample of the renamed test data and run the base model's "explain" function on it
test_pd_sample=test_pd.rename(columns=rename_dict).sample(n=1000, random_state = 100).reset_index(drop=True)
shap_pd = mv_base.run(test_pd_sample, function_name="explain")

In [None]:
import shap 
# Keep only the model input columns so they line up with the explanation values
just_input_vals = test_pd_sample.drop(["LOAN_ID","MORTGAGERESPONSE", "TIMESTAMP"], axis=1)

# Summary plot of the explanation values against the corresponding input values
shap.summary_plot(np.array(shap_pd), just_input_vals, feature_names = just_input_vals.columns)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Join inputs and explanation values row-by-row on the index
all_shap = test_pd_sample.merge(shap_pd, right_index=True, left_index=True, how='outer')
# Restrict to incomes strictly between 0 and 200,000, then plot INCOME against its explanation value
# with a LOWESS trend line on top of the scatter
income_0_to_200k = all_shap[(all_shap.INCOME>0) & (all_shap.INCOME<200000)]
sns.scatterplot(data = income_0_to_200k, x ="INCOME", y = '"INCOME_explanation"', color='darkblue', s=30, alpha=0.75, edgecolor='w',)
sns.regplot(data = income_0_to_200k, x ="INCOME", y = '"INCOME_explanation"', scatter=False, color='red', line_kws={"lw":2},ci =100, lowess=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): the variable name says 2M, but the filter keeps loan amounts strictly between
# 200,000 and 1,000,000 — confirm which upper bound is intended.
loan_200k_to_2M = all_shap[(all_shap.LOAN_AMOUNT>200000) & (all_shap.LOAN_AMOUNT<1000000)]
# Plot LOAN_AMOUNT against its explanation value with a LOWESS trend line
sns.scatterplot(data = loan_200k_to_2M, x ="LOAN_AMOUNT", y = '"LOAN_AMOUNT_explanation"', color='darkblue', s=30, alpha=0.75, edgecolor='w',)
sns.regplot(data = loan_200k_to_2M, x ="LOAN_AMOUNT", y = '"LOAN_AMOUNT_explanation"', scatter=False, color='red', line_kws={"lw":2},ci =100, lowess=True)

In [None]:
#Loan types of home purchases are more likely to be approved versus other types of loans
sns.boxplot(data = all_shap, x ='"LOAN_PURPOSE_NAME_Home purchase"', y = '"LOAN_PURPOSE_NAME_Home purchase_explanation"',  palette='Set1', width=0.75, linewidth=1, fliersize=2)

# Distributed model training
## For demonstration's sake - below we have an example doing distributed model training
### Snowflake will set up a ray cluster on all available nodes in your compute pool (CPU or GPU) and execute the distributed training job

In [None]:
from snowflake.ml.modeling.distributors.xgboost.xgboost_estimator import XGBEstimator, XGBScalingConfig
from snowflake.ml.data.data_connector import DataConnector
# Wrap the Snowpark training frame in a DataConnector for the distributed estimator
dc = DataConnector.from_dataframe(train)

#Specify Scaling Config (GPU usage is requested here)
scaling_config = XGBScalingConfig(use_gpu=True)

#Define distributed xgb estimator
dist_gpu_xgb = XGBEstimator(
    params = {"booster": "gbtree",
              "n_estimators":10,},
    scaling_config = scaling_config)

# Fit on the same feature columns used by the other models, with MORTGAGERESPONSE as the label
dist_gpu_xgb.fit(dc,
                 input_cols = train.drop(["TIMESTAMP", "LOAN_ID", "MORTGAGERESPONSE"]).columns,
                 label_col = "MORTGAGERESPONSE")

# Model Monitoring setup

In [None]:
# Persist the train and test sets as tables (replacing any existing ones); the inference
# procedure and model monitors below read from these version-suffixed tables.
train.write.save_as_table(f"DEMO_MORTGAGE_LENDING_TRAIN_{VERSION_NUM}", mode="overwrite")
test.write.save_as_table(f"DEMO_MORTGAGE_LENDING_TEST_{VERSION_NUM}", mode="overwrite")

In [None]:
session.sql("CREATE stage IF NOT EXISTS ML_STAGE").collect()

In [None]:
from snowflake import snowpark
from snowflake.ml.registry import Registry
import joblib
import os
import logging
from snowflake.ml.modeling.pipeline import Pipeline
import snowflake.ml.modeling.preprocessing as pp
from snowflake.snowpark.types import StringType, IntegerType
import snowflake.snowpark.functions as F


def demo_inference_sproc(session: snowpark.Session, table_name: str, modelname: str, modelversion: str) -> str:
    """Score a table with a registered model version and write the predictions back to it.

    Runs the model version's ``predict`` function over ``table_name``, renames the
    MORTGAGE_PREDICTION output to ``<modelversion>_PREDICTION``, joins it back onto the
    input rows by LOAN_ID and overwrites ``table_name`` with the result.

    Args:
        session: Snowpark session used for the registry and table access.
        table_name: Table to read, score and overwrite.
        modelname: Name of the registered model to load.
        modelversion: Version of that model to run; also used as the prediction column prefix.

    Returns:
        The string ``"Success"`` once the table has been written.
    """
    reg = Registry(session=session)
    # Use the `modelname` argument; the original read the notebook-level `model_name`
    # global, so the parameter passed to the procedure was silently ignored.
    mv = reg.get_model(modelname).version(modelversion)

    pred_col = f'{modelversion}_PREDICTION'

    # Read the input table to a dataframe
    df = session.table(table_name)

    # Perform prediction using the model, keeping only the key and the renamed prediction column
    results = (
        mv.run(df, function_name="predict")
        .select("LOAN_ID", "MORTGAGE_PREDICTION")
        .withColumnRenamed("MORTGAGE_PREDICTION", pred_col)
    )

    # Attach the prediction column to the original rows
    final = df.join(results, on="LOAN_ID", how="full")
    # Write results back to Snowflake table
    final.write.save_as_table(table_name, mode='overwrite',enable_schema_evolution=True)

    return "Success"

# Register the stored procedure
# Registered as a permanent procedure named model_inference_sproc (replacing any existing one),
# staged in @ML_STAGE and returning a string.
session.sproc.register(
    func=demo_inference_sproc,
    name="model_inference_sproc",
    replace=True,
    is_permanent=True,
    stage_location="@ML_STAGE",
    packages=['joblib', 'snowflake-snowpark-python', 'snowflake-ml-python'],
    return_type=StringType()
)


In [None]:
CALL model_inference_sproc('DEMO_MORTGAGE_LENDING_TRAIN_{{VERSION_NUM}}','{{model_name}}', '{{base_version_name}}');

In [None]:
CALL model_inference_sproc('DEMO_MORTGAGE_LENDING_TEST_{{VERSION_NUM}}','{{model_name}}', '{{base_version_name}}');

In [None]:
CALL model_inference_sproc('DEMO_MORTGAGE_LENDING_TRAIN_{{VERSION_NUM}}','{{model_name}}', '{{optimized_version_name}}');

In [None]:
CALL model_inference_sproc('DEMO_MORTGAGE_LENDING_TEST_{{VERSION_NUM}}','{{model_name}}', '{{optimized_version_name}}');

In [None]:
select * FROM DEMO_MORTGAGE_LENDING_TRAIN_{{VERSION_NUM}} limit 5

In [None]:
# from snowflake.ml.monitoring.entities.model_monitor_config import ModelMonitorConfig, ModelMonitorSourceConfig
# # snowflake/ml/monitoring/entities/model_monitor_config.py

# # Set up source/baseline table config for base model
# base_source_config = ModelMonitorSourceConfig(
#     baseline = "DEMO_MORTGAGE_LENDING_TRAIN",
#     source="DEMO_MORTGAGE_LENDING_TEST",
#     timestamp_column="TIMESTAMP",
#     prediction_score_columns=["XGB_BASE_PREDICTION"],
#     actual_score_columns=["MORTGAGERESPONSE"],
#     id_columns=["LOAN_ID"]
# )

# # Set up model config for tree booster
# base_monitor_config = ModelMonitorConfig(
#     model_version=mv_base,
#     model_function_name="predict",
#     background_compute_warehouse_name="ML_WH"
# )

# # Set up source/baseline table config for opt model
# opt_source_config = ModelMonitorSourceConfig(
#     baseline = "DEMO_MORTGAGE_LENDING_TRAIN",
#     source="DEMO_MORTGAGE_LENDING_TEST",
#     timestamp_column="TIMESTAMP",
#     prediction_score_columns=["XGB_OPTIMIZED_PREDICTION"],
#     actual_score_columns=["MORTGAGERESPONSE"],
#     id_columns=["LOAN_ID"]
# )

# # Set up model config for linear booster
# opt_monitor_config = ModelMonitorConfig(
#     model_version=mv_opt,
#     model_function_name="predict",
#     background_compute_warehouse_name="ML_WH"
# )

In [None]:
# # Add a new ModelMonitor
# model_monitor = model_registry.add_monitor(
#     name="GB_TREE_MORTGAGE_LENDING_MODEL_MONITOR", 
#     source_config=tree_source_config,
#     model_monitor_config=tree_monitor_config,
# )


# model_monitor = model_registry.add_monitor(
#     name="GB_MORTGAGE_LENDING_MODEL_MONITOR", 
#     source_config=linear_source_config,
#     model_monitor_config=linear_monitor_config,
# )

In [None]:
-- Monitor the base model version: compares XGB_BASE_PREDICTION against the actual
-- MORTGAGERESPONSE in the TEST table (source), with the TRAIN table as baseline,
-- keyed by LOAN_ID and aggregated per day on the TIMESTAMP column.
CREATE OR REPLACE MODEL MONITOR MORTGAGE_LENDING_BASE_MODEL_MONITOR
WITH
    MODEL={{model_name}}
    VERSION={{base_version_name}}
    FUNCTION=predict
    SOURCE=DEMO_MORTGAGE_LENDING_TEST_{{VERSION_NUM}}
    BASELINE=DEMO_MORTGAGE_LENDING_TRAIN_{{VERSION_NUM}}
    TIMESTAMP_COLUMN=TIMESTAMP
    PREDICTION_CLASS_COLUMNS=(XGB_BASE_PREDICTION)  
    ACTUAL_CLASS_COLUMNS=(MORTGAGERESPONSE)
    ID_COLUMNS=(LOAN_ID)
    WAREHOUSE=SMALL
    REFRESH_INTERVAL='1 min'
    AGGREGATION_WINDOW='1 day';

In [None]:
-- Monitor the optimized model version: compares XGB_OPTIMIZED_PREDICTION against the actual
-- MORTGAGERESPONSE in the TEST table (source), with the TRAIN table as baseline,
-- keyed by LOAN_ID and aggregated per day on the TIMESTAMP column.
CREATE OR REPLACE MODEL MONITOR MORTGAGE_LENDING_OPTIMIZED_MODEL_MONITOR
WITH
    MODEL={{model_name}}
    VERSION={{optimized_version_name}}
    FUNCTION=predict
    SOURCE=DEMO_MORTGAGE_LENDING_TEST_{{VERSION_NUM}}
    BASELINE=DEMO_MORTGAGE_LENDING_TRAIN_{{VERSION_NUM}}
    TIMESTAMP_COLUMN=TIMESTAMP
    PREDICTION_CLASS_COLUMNS=(XGB_OPTIMIZED_PREDICTION)  
    ACTUAL_CLASS_COLUMNS=(MORTGAGERESPONSE)
    ID_COLUMNS=(LOAN_ID)
    WAREHOUSE=SMALL
    REFRESH_INTERVAL='1 min'
    AGGREGATION_WINDOW='1 day';

# Debug Monitoring Findings [WIP]

In [None]:
# debug_df = session.table("DEMO_MORTGAGE_LENDING_TEST").to_pandas()
# low_accuracy_period= debug_df[(debug_df.TIMESTAMP>datetime.datetime(2024,9,1)) & (debug_df.TIMESTAMP<datetime.datetime(2024,9,30))]
# f1_score(low_accuracy_period.MORTGAGERESPONSE, low_accuracy_period.GB_TREE_PREDICTION)

# SPCS Deployment setup 
## We will now create a container service to serve the model for inference

In [None]:
# Names and sizing for the SPCS image repository, compute pool and prediction service
image_repo_name = "MORTGAGE_LENDING_IMAGE_REPO_LLM"
cp_name = "MORTGAGE_LENDING_INFERENCE_CP"
num_spcs_nodes = '3'
spcs_instance_family = 'CPU_X64_L'
service_name = 'MORTGAGE_LENDING_PREDICTION_SERVICE'

# Fully qualify the repository and service names with the current database and schema
# (double quotes are stripped from both before building the dotted names)
current_database = session.get_current_database().replace('"', '')
current_schema = session.get_current_schema().replace('"', '')
extended_image_repo_name = f"{current_database}.{current_schema}.{image_repo_name}"
extended_service_name = f'{current_database}.{current_schema}.{service_name}'

In [None]:
# session.sql(f"alter compute pool if exists {cp_name} stop all").collect()
# session.sql(f"drop compute pool if exists {cp_name}").collect()
# session.sql(f"create compute pool {cp_name} min_nodes={num_spcs_nodes} max_nodes={num_spcs_nodes} instance_family={spcs_instance_family} auto_resume=True auto_suspend_secs=300").collect()
# session.sql(f"describe compute pool {cp_name}").show()

In [None]:
# session.sql(f"create image repository if not exists {extended_image_repo_name}").collect()

In [None]:
# mv_opt.create_service(
#     service_name=extended_service_name,
#     service_compute_pool=cp_name,
#     image_repo=extended_image_repo_name,
#     ingress_enabled=True,
#     max_instances=int(num_spcs_nodes),
#     build_external_access_integration="ALLOW_ALL_INTEGRATION"
# )

In [None]:
# Use the model this notebook registered (model_name) rather than a hard-coded name from a
# different run, which fails when that model is not present in the current schema.
mv_container = model_registry.get_model(model_name).default
# List any services deployed for the default model version
mv_container.list_services()

In [None]:
# mv_container.run(test, function_name = "predict", service_name = "MORTGAGE_LENDING_PREDICTION_SERVICE")

In [None]:
# feature_columns = [feature.name for feature in mv_opt.show_functions()[1]["signature"].inputs]
# feature_columns

# test.limit(1).select(*feature_columns).show()

# input_features = test.limit(1).select(*feature_columns).to_pandas().values.tolist()[0]
# data = {"data": [[0, *input_features]]}
# print(data)

## Conclusion

#### 🛠️ Snowflake Feature Store tracks feature definitions and maintains lineage of sources and destinations 🛠️
#### 🚀 Snowflake Model Registry gives users a secure and flexible framework to deploy, track, and monitor models 🚀
#### 🔮 All model versions logged in the Model Registry can be accessed for inference, explainability, lineage tracking, visibility and more 🔮
