# ❄️ End-to-end ML Demo ❄️

In this workflow we will work through the following elements of a typical tabular machine learning pipeline.

### 1. Use Feature Store to track engineered features
* Store feature definitions in the Feature Store for reproducible computation of ML features
      
### 2. Train two Models using the Snowflake ML APIs
* Baseline XGBoost
* XGBoost with optimal hyper-parameters identified via Snowflake ML distributed HPO methods

### 3. Register both models in Snowflake model registry
* Explore model registry capabilities such as **metadata tracking, inference, and explainability**
* Compare model metrics on train/test set to identify any issues of model performance or overfitting
* Tag the best performing model version as 'default' version
### 4. Set up Model Monitor to track 1 year of predicted and actual loan repayments
* **Compute performance metrics** such as F1, Precision, Recall
* **Inspect model drift** (i.e. how much has the average predicted repayment rate changed day-to-day)
* **Compare models** side-by-side to understand which model should be used in production
* Identify and understand **data issues**

### 5. Track data and model lineage throughout
* View and understand
  * The **origin of the data** used for computed features
  * The **data used** for model training
  * The **available model versions** being monitored

In [None]:
!pip install snowflake-ml-python==1.18.0

In [None]:
# Update this VERSION_NUM to version your features, models etc!
# It is used as the feature view version and as a suffix on the model and table names.
VERSION_NUM = '0'
# Snowflake database, schema, warehouse and role that the session is pointed at.
DB = "E2E_SNOW_MLOPS_DB" 
SCHEMA = "MLOPS_SCHEMA" 
COMPUTE_WAREHOUSE = "E2E_SNOW_MLOPS_WH" 
ROLE = "E2E_SNOW_MLOPS_ROLE"

In [1]:
import pandas as pd
import numpy as np
import sklearn
import math
import pickle
import shap
from datetime import datetime
from xgboost import XGBClassifier

from versioning import version_featureview, version_data

# Snowpark ML
from snowflake.ml.registry import Registry
from snowflake.ml.modeling.tune import get_tuner_context
from snowflake.ml.modeling import tune
from entities import search_algorithm

#Snowflake feature store
from snowflake.ml.feature_store import FeatureStore, FeatureView, Entity, CreationMode

# Snowpark session
from snowflake.snowpark import DataFrame
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col, to_timestamp, min, max, month, dayofweek, dayofyear, avg, date_add, sql_expr
from snowflake.snowpark.types import IntegerType, StringType
from snowflake.snowpark import Window

#setup snowpark session
from snowflake.snowpark.context import get_active_session
# Grab the notebook's active Snowpark session, then point it at the role,
# warehouse, database and schema configured in the constants cell.
session = get_active_session()
session.use_role(ROLE)
session.use_warehouse(COMPUTE_WAREHOUSE)
session.use_database(DB)
session.use_schema(SCHEMA)


In [None]:
# Reference the raw mortgage lending demo table as a Snowpark DataFrame and preview 5 rows
df = session.table("MORTGAGE_LENDING_DEMO_DATA")
df.show(5)

## Observe Snowflake Snowpark table properties

In [None]:
# Show the earliest and latest values of the TS column. Note: min/max here are the
# Snowpark column functions imported at the top of the notebook, not the Python builtins.
df.select(min('TS'), max('TS')).show()

In [None]:
# Get current date and time (naive, to match the naive timestamp parsed below)
current_time = datetime.now()

# Latest timestamp present in the source table, parsed from its text form.
# str() of a timestamp omits the fractional part when the microseconds are zero,
# so choose the format that matches the text instead of always requiring ".%f"
# (which would raise ValueError for a whole-second timestamp).
max_ts_text = str(df.select(max("TS")).collect()[0][0])
ts_format = "%Y-%m-%d %H:%M:%S.%f" if "." in max_ts_text else "%Y-%m-%d %H:%M:%S"
df_max_time = datetime.strptime(max_ts_text, ts_format)

# Find delta between latest existing timestamp and today's date.
# NOTE: the name `timedelta` is kept because the feature engineering cell reads timedelta.days.
timedelta = current_time - df_max_time

## Feature Engineering with Snowpark APIs

In [None]:
# Map each engineered feature name to the Snowpark column expression that computes it.
# Later entries refer by name ("TIMESTAMP", "INCOME", "LOAN_AMOUNT") to columns
# defined by earlier entries, so the entry order is kept as-is.
feature_eng_dict = {
    # Timestamp features: shift TS forward by (timedelta.days - 1) days,
    # then derive calendar parts from the shifted value
    "TIMESTAMP": date_add(to_timestamp("TS"), timedelta.days-1),
    "MONTH": month("TIMESTAMP"),
    "DAY_OF_YEAR": dayofyear("TIMESTAMP"),
    "DOTW": dayofweek("TIMESTAMP"),
    # Income and loan features: scale the *_000s source columns by 1000
    "LOAN_AMOUNT": col("LOAN_AMOUNT_000s")*1000,
    "INCOME": (col("APPLICANT_INCOME_000s")*1000).astype(IntegerType()),
    "INCOME_LOAN_RATIO": col("INCOME")/col("LOAN_AMOUNT"),
}

# Add every engineered column to the raw table in a single call and preview the result
df_eng = df.with_columns(list(feature_eng_dict), list(feature_eng_dict.values()))
df_eng.show(3)

In [None]:
# County/year level income: truncate each TIMESTAMP to the start of its year,
# then average INCOME per (COUNTY_NAME, truncated TIMESTAMP).
avg_income = (
    df_eng
    .with_column("TIMESTAMP", F.date_trunc("YEAR", "TIMESTAMP"))
    .group_by("COUNTY_NAME", "TIMESTAMP")
    .agg(F.mean("INCOME").alias("YEAR_AVG_INCOME"))
)
avg_income.show()

In [None]:
# Build one-hot encoded features: one 0/1 integer column per distinct value
# of each categorical column.
cat_cols = ["LOAN_PURPOSE_NAME"]

ohe_dict = {}
for cat_col in cat_cols:
    # Distinct category values are collected to the client so they can name the new columns
    distinct_rows = df_eng.select(cat_col).distinct().collect()
    for row in distinct_rows:
        level = row[cat_col]
        # Column name: <source column>_<value with spaces replaced by underscores, upper-cased>
        feature_name = f"{cat_col}_{level.replace(' ','_').upper()}"
        ohe_dict[feature_name] = (col(cat_col) == level).astype(IntegerType())

# Keep only the join key, the timestamp and the indicator columns
ohe_df = (
    df_eng
    .with_columns(list(ohe_dict), list(ohe_dict.values()))
    .select(["LOAN_ID", "TIMESTAMP", *ohe_dict])
)
ohe_df.show()

## Create a Snowflake Feature Store

In [None]:
# Open the feature store in database DB; it is named after SCHEMA and uses
# COMPUTE_WAREHOUSE as its default warehouse. CREATE_IF_NOT_EXIST asks for the
# store to be created when it is not already present.
fs = FeatureStore(
    session=session, 
    database=DB, 
    name=SCHEMA, 
    default_warehouse=COMPUTE_WAREHOUSE,
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST
)

In [None]:
# List the entities currently registered in the feature store (shown as the cell output)
fs.list_entities()

## Feature Store configuration
- create/register entities of interest

In [None]:
# First try to retrieve an existing entity definition; if that fails, define a new one and register it
try:
    # retrieve existing entity
    loan_id_entity = fs.get_entity('LOAN_ENTITY')
    print('Retrieved existing entity')
except Exception:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, so interrupting the cell does not trigger a registration.
    # define new entity keyed on LOAN_ID
    loan_id_entity = Entity(
        name = "LOAN_ENTITY",
        join_keys = ["LOAN_ID"],
        desc = "Features defined on a per loan level")
    # register
    fs.register_entity(loan_id_entity)
    print("Registered new entity")


We can define the dataframe via the use of Snowpark APIs, and use that dataframe (or a function that returns a dataframe) as the feature view definition, below.

In [None]:
# Create a dataframe with just the ID, timestamp, and engineered features. We will use this to define our feature view
feature_df = df_eng.select(["LOAN_ID"]+list(feature_eng_dict.keys()))
feature_df.show(5)

# define the feature view on the loan entity, using TIMESTAMP as the
# time column and a refresh frequency of "1 day"
loan_fv = FeatureView(
    name="Mortgage_Feature_View",
    entities=[loan_id_entity],
    feature_df=feature_df,
    timestamp_col="TIMESTAMP",
    refresh_freq="1 day")

# add feature level descriptions
loan_fv = loan_fv.attach_feature_desc(
    {
        "MONTH": "Month of loan",
        "DAY_OF_YEAR": "Day of calendar year of loan",
        "DOTW": "Day of the week of loan",
        "LOAN_AMOUNT": "Loan amount in $USD",
        "INCOME": "Household income in $USD",
        # The feature is computed as INCOME / LOAN_AMOUNT, so describe it that way round
        "INCOME_LOAN_RATIO": "Ratio of INCOME/LOAN_AMOUNT",
    }
)

# register under VERSION_NUM, replacing any feature view already registered with that version
loan_fv = fs.register_feature_view(loan_fv, version=VERSION_NUM,overwrite=True)

# alternatively, use version hashing
#version = version_featureview(loan_fv)
#loan_fv = fs.register_feature_view(loan_fv, version=version)

In [None]:
# define and register a feature view for the one-hot encoded categories
# (same loan entity and TIMESTAMP column as the main feature view; no refresh_freq is passed)
cat_fv = FeatureView(
    name="Mortgage_Feature_View_CATEGORIES",
    entities=[loan_id_entity],
    feature_df=ohe_df,
    timestamp_col="TIMESTAMP",
)

# register under VERSION_NUM, replacing any feature view already registered with that version
cat_fv = fs.register_feature_view(cat_fv, version=VERSION_NUM,overwrite=True)

# alternatively, use version hashing
#version = version_featureview(cat_fv)
#cat_fv = fs.register_feature_view(cat_fv, version=version)

In [None]:
# First try to retrieve an existing entity definition; if that fails, define a new one and register it
try:
    # retrieve existing entity
    year_entity = fs.get_entity('COUNTY')
    print('Retrieved existing entity')
except Exception:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, so interrupting the cell does not trigger a registration.
    # define new entity keyed on COUNTY_NAME
    # (the variable is called year_entity because the yearly-income feature view uses it)
    year_entity = Entity(
        name = "COUNTY",
        join_keys = ["COUNTY_NAME"],
        desc = "Features defined on a county level")
    # register
    fs.register_entity(year_entity)
    print("Registered new entity")

In [None]:
# define and register the feature view holding the per-county, per-year average income
# (entity key COUNTY_NAME; TIMESTAMP here is the year-truncated timestamp from avg_income)
year_fv = FeatureView(
    name="Mortgage_Feature_View_Years",
    entities=[year_entity],
    feature_df=avg_income,
    timestamp_col="TIMESTAMP",
)

# register under VERSION_NUM, replacing any feature view already registered with that version
year_fv = fs.register_feature_view(year_fv, version=VERSION_NUM,overwrite=True)

# alternatively, use version hashing
#version = version_featureview(year_fv)
#year_fv = fs.register_feature_view(year_fv, version=version)

## Retrieve a Dataset from the featureview

Snowflake Datasets are immutable, file-based objects that exist within your Snowpark session. 

They can be written to persistent Snowflake objects as needed. 

In [None]:
# Spine: just the join keys (LOAN_ID, COUNTY_NAME), the timestamp and the label,
# restricted to rows whose TIMESTAMP falls in month 10. The feature store
# supplies the remaining feature columns.
spine_df = (
    df_eng
    .select("LOAN_ID", "TIMESTAMP", "MORTGAGERESPONSE", "COUNTY_NAME")
    .filter(month("TIMESTAMP") == 10)
)

# Join all 3 feature views onto the spine using the spine's timestamp column,
# keeping MORTGAGERESPONSE as the label
ds = fs.generate_dataset(
    name="MORTGAGE_DATASET_EXTENDED_FEATURES",
    spine_df=spine_df,
    features=[loan_fv, cat_fv, year_fv],
    spine_timestamp_col="TIMESTAMP",
    spine_label_cols=["MORTGAGERESPONSE"],
)

In [None]:
# split data: read the dataset back as a Snowpark DataFrame, then make a
# 70/30 train/test split with a fixed seed

ds_sp = ds.read.to_snowpark_dataframe()

train, test = ds_sp.random_split(weights=[0.70, 0.30], seed=0)

In [None]:
# Baseline XGBoost classifier configuration
xgb_base = XGBClassifier(
    booster='gbtree',
    learning_rate=0.75,
    n_estimators=3,
    max_depth=50,
)

# Pull the training split into pandas, then separate the label from the features.
# The ID, timestamp, label and county columns are not model inputs.
train_pd = train.to_pandas()
y_train_pd = train_pd["MORTGAGERESPONSE"]
X_train_pd = train_pd.drop(columns=["TIMESTAMP", "LOAN_ID", "MORTGAGERESPONSE", "COUNTY_NAME"])

# train model
xgb_base.fit(X_train_pd, y_train_pd)


In [None]:
# Create a snowflake model registry object
from snowflake.ml.registry import Registry

# Define model name (suffixed with VERSION_NUM so each notebook version gets its own model)
model_name = f"MORTGAGE_LENDING_MLOPS_{VERSION_NUM}"

# Create a registry in DB.SCHEMA to log the model to; the "enable_monitoring"
# option is switched on here
model_registry = Registry(session=session, 
                          database_name=DB, 
                          schema_name=SCHEMA,
                          options={"enable_monitoring": True})

In [None]:
# Log the base model to the model registry (if not already there)
from sklearn.metrics import f1_score, precision_score, recall_score

base_version_name = 'XGB_BASE'

try:
    # Check for existing model
    mv_base = model_registry.get_model(model_name).version(base_version_name)
    print("Found existing model version!")
except Exception:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate instead of being treated as "model version not found".
    print("Logging new model version...")

    # Train-set metrics for the baseline model. These names were previously used
    # without ever being computed, which raised NameError right after logging.
    # Assumes MORTGAGERESPONSE is a binary label (sklearn's default average='binary').
    train_preds_base = xgb_base.predict(X_train_pd)
    f1_base_train = float(f1_score(y_train_pd, train_preds_base))
    precision_base_train = float(precision_score(y_train_pd, train_preds_base))
    recall_base_train = float(recall_score(y_train_pd, train_preds_base))

    # Log model to registry
    mv_base = model_registry.log_model(
        model_name=model_name,
        model=xgb_base, 
        version_name=base_version_name,
        sample_input_data = train.drop(["TIMESTAMP", "LOAN_ID", "MORTGAGERESPONSE","COUNTY_NAME"]).limit(100), #using snowpark df to maintain lineage
        comment = f"""ML model for predicting loan approval likelihood.
                    This model was trained using XGBoost classifier.
                    Hyperparameters used were:
                    max_depth={xgb_base.max_depth}, 
                    n_estimators={xgb_base.n_estimators}, 
                    learning_rate = {xgb_base.learning_rate}, 
                    algorithm = {xgb_base.booster}
                    """,
        target_platforms= ["WAREHOUSE", "SNOWPARK_CONTAINER_SERVICES"],
        options= {"enable_explainability": True}

    )
    
    # set metrics on the newly logged version
    mv_base.set_metric(metric_name="Train_F1_Score", value=f1_base_train)
    mv_base.set_metric(metric_name="Train_Precision_Score", value=precision_base_train)
    mv_base.set_metric(metric_name="Train_Recall_score", value=recall_base_train)

In [None]:
# Run the registered base model version's predict function over the test split and
# rename its "output_feature_0" column to MORTGAGE_PREDICTION, then preview 10 rows
reg_preds = mv_base.run(test, function_name = "predict").rename(col('"output_feature_0"'), "MORTGAGE_PREDICTION")
reg_preds.show(10)

In [None]:
# Persist the train and test splits as tables suffixed with VERSION_NUM,
# overwriting tables of the same name from an earlier run
train.write.save_as_table(f"DEMO_MORTGAGE_LENDING_TRAIN_{VERSION_NUM}", mode="overwrite")
test.write.save_as_table(f"DEMO_MORTGAGE_LENDING_TEST_{VERSION_NUM}", mode="overwrite")

# Stage used as the stage_location of the permanent inference stored procedure
session.sql("CREATE stage IF NOT EXISTS ML_STAGE").collect()

from snowflake import snowpark

def demo_inference_sproc(session: snowpark.Session, table_name: str, modelname: str, modelversion: str) -> str:
    """Score a table with a registered model version and write the predictions back to it.

    Args:
        session: Snowpark session the procedure runs in.
        table_name: Table to read, score and then overwrite with the scored result.
        modelname: Name of the model to fetch from the registry.
        modelversion: Version of that model to run; also used as the prefix of the
            prediction column (``<modelversion>_PREDICTION``).

    Returns:
        The string "Success" once the table has been rewritten.
    """
    reg = Registry(session=session)
    # Look the model up by the `modelname` argument. The original read the
    # notebook-level `model_name` global here, silently ignoring the parameter.
    m = reg.get_model(modelname)
    mv = m.version(modelversion)

    input_table_name = table_name
    pred_col = f'{modelversion}_PREDICTION'

    # Read the input table to a dataframe
    df = session.table(input_table_name)

    # Keep only the join key and the model output, renamed to the prediction column
    results = (
        mv.run(df, function_name="predict")
        .select("LOAN_ID", '"output_feature_0"')
        .withColumnRenamed('"output_feature_0"', pred_col)
    )

    # Attach the predictions to the original rows by LOAN_ID
    final = df.join(results, on="LOAN_ID", how="full")

    # Write results back over the input table
    final.write.save_as_table(table_name, mode='overwrite', enable_schema_evolution=True)

    return "Success"

# Register the stored procedure
# Register demo_inference_sproc as the permanent stored procedure
# "model_inference_sproc" (replacing any existing one), stored on @ML_STAGE,
# with the listed packages and a string return type.
session.sproc.register(
    func=demo_inference_sproc,
    name="model_inference_sproc",
    replace=True,
    is_permanent=True,
    stage_location="@ML_STAGE",
    packages=['joblib', 'snowflake-snowpark-python', 'snowflake-ml-python'],
    return_type=StringType()
)

In [None]:
-- Score the saved train and test tables in place with the base model version.
-- The {{...}} placeholders are presumably filled from the notebook's Python variables
-- of the same name (VERSION_NUM, model_name, base_version_name).
CALL model_inference_sproc('DEMO_MORTGAGE_LENDING_TRAIN_{{VERSION_NUM}}','{{model_name}}', '{{base_version_name}}');
CALL model_inference_sproc('DEMO_MORTGAGE_LENDING_TEST_{{VERSION_NUM}}','{{model_name}}', '{{base_version_name}}');

In [None]:
-- Model monitor for the base model version: the scored TEST table is the source and
-- the scored TRAIN table is the baseline.
-- NOTE: XGB_BASE_PREDICTION is the column the inference procedure writes as
-- '<modelversion>_PREDICTION'; it is hard-coded here, so it must be updated by hand
-- if base_version_name is changed from 'XGB_BASE'.
CREATE OR REPLACE MODEL MONITOR MORTGAGE_LENDING_BASE_MODEL_MONITOR
WITH
    MODEL={{model_name}}
    VERSION={{base_version_name}}
    FUNCTION=predict
    SOURCE=DEMO_MORTGAGE_LENDING_TEST_{{VERSION_NUM}}
    BASELINE=DEMO_MORTGAGE_LENDING_TRAIN_{{VERSION_NUM}}
    TIMESTAMP_COLUMN=TIMESTAMP
    PREDICTION_CLASS_COLUMNS=(XGB_BASE_PREDICTION)  
    ACTUAL_CLASS_COLUMNS=(MORTGAGERESPONSE)
    ID_COLUMNS=(LOAN_ID)
    WAREHOUSE={{COMPUTE_WAREHOUSE}}
    REFRESH_INTERVAL='12 hours'
    AGGREGATION_WINDOW='1 day';

## Conclusion 

#### 🛠️ Snowflake Feature Store tracks feature definitions and maintains lineage of sources and destinations 🛠️
#### 🚀 Snowflake Model Registry gives users a secure and flexible framework to log models, tag candidates for production, and run inference and explainability jobs 🚀
#### 📈 ML observability in Snowflake allows users to monitor model performance over time and detect model, feature, and concept drift 📈
#### 🔮 All models logged in the Model Registry can be accessed for inference, explainability, lineage tracking, visibility and more 🔮