## Imports

In [1]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.preprocessing import KBinsDiscretizer, OrdinalEncoder, OneHotEncoder
from snowflake.ml.modeling.impute import SimpleImputer

import json

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Snowflake connection parameters from a local JSON file and open a session.
# NOTE(review): this is an absolute, user-specific path; consider making it configurable.
# The context manager closes the credentials file as soon as it has been parsed.
with open('/Users/skhara/Documents/GitHub/creds.json') as creds_file:
    connection_parameters = json.load(creds_file)
session = Session.builder.configs(connection_parameters).create()

# 1. Snowpark for ML

## 1.1 Prepare Data 

In [3]:
# Make DEMO_DB.PUBLIC the session's current database and schema, so the
# unqualified table names used below are looked up there.
session.use_database('DEMO_DB')
session.use_schema('PUBLIC')

In [4]:
# Reference the two source tables as Snowpark DataFrames.
application_record_sdf = session.table('APPLICATION_RECORD')
credit_record_sdf = session.table('CREDIT_RECORD')

# Compute both row counts first, then report them together.
application_rows = application_record_sdf.count()
credit_rows = credit_record_sdf.count()
print('Application table size\t: ', application_rows,
      '\nCredit table size\t: ', credit_rows)

Application table size	:  438557 
Credit table size	:  1048575


In [5]:
# Preview five credit records as a pandas DataFrame.
credit_record_sdf.limit(5).to_pandas()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [6]:
# Derive a per-ID TARGET flag: 1 (high-risk) when the ID has at least one record
# whose STATUS is '2', '3', '4' or '5', otherwise 0 (low-risk).
is_late = F.col('STATUS').in_(['2', '3','4','5'])
late_count = F.sum(F.iff(is_late, 1, 0)).as_("CNT_LATE")
target_flag = F.when(F.col('CNT_LATE') > 0, 1).otherwise(0)

# CNT_LATE is only an intermediate column and is dropped once TARGET exists.
credit_record_sdf = (
    credit_record_sdf
    .group_by('ID')
    .agg(late_count)
    .with_column('TARGET', target_flag)
    .drop("CNT_LATE")
)

In [7]:
# Join Credit Record data with Application Record Data.
# Inner join on ID: only IDs present in both frames are kept.
joined_sdf = application_record_sdf.join(credit_record_sdf, using_columns='ID', join_type='inner')

In [8]:
# Duplicate removal: use drop_duplicates on ID to remove rows that repeat an ID,
# then display the remaining row count.
joined_sdf = joined_sdf.drop_duplicates('ID')
joined_sdf.count()

36457

In [9]:
# Keep only the columns used for modeling. Note that the TARGET label is
# carried in the numerical list alongside the numeric features.
cols_numerical = ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'CNT_FAM_MEMBERS', 'TARGET']
cols_categorical = ['CODE_GENDER', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
joined_sdf = joined_sdf.select(cols_numerical + cols_categorical)

In [10]:
# One-hot encode the categorical columns; the original input columns are dropped.
my_ohe_encoder = OneHotEncoder(
    input_cols=cols_categorical,
    output_cols=cols_categorical,
    drop_input_cols=True,
)
# Fit on the joined frame, then transform that same frame.
my_ohe_encoder.fit(joined_sdf)
prepared_sdf = my_ohe_encoder.transform(joined_sdf)

In [11]:
# Cleaning column names to make it easier for future referencing
import re

# Each name is reduced to ASCII letters, digits and underscores and then upper-cased.
cols = prepared_sdf.columns
for old_col in cols:
    new_col = re.sub(r'[^a-zA-Z0-9_]', '', old_col).upper()
    if new_col == old_col:
        # Name is already clean: skip the no-op rename.
        continue
    prepared_sdf = prepared_sdf.rename(col(old_col), new_col)

In [12]:
# 80/20 random split with a fixed seed; each part is saved as a Snowflake table,
# replacing any existing table of the same name.
snowdf_train, snowdf_test = prepared_sdf.random_split([0.8, 0.2], seed=82)
for split_sdf, table_name in (
    (snowdf_train, "CREDIT_DEFAULT_TRAIN"),
    (snowdf_test, "CREDIT_DEFAULT_TEST"),
):
    split_sdf.write.mode("overwrite").save_as_table(table_name)

## 1.2 ML Modeling

In [13]:
# Re-read the train and test sets from the tables saved by the split step.
snowdf_train = session.table('CREDIT_DEFAULT_TRAIN')
snowdf_test = session.table('CREDIT_DEFAULT_TEST')

In [14]:
# Prepare Data for modeling
target_col = 'TARGET'
# Every column of the training table except the label is a model input.
feature_cols = list(snowdf_train.columns)
feature_cols.remove(target_col)

## 1.3 Train and Score Multiple Models

In [15]:
from snowflake.ml.modeling.metrics import f1_score, accuracy_score, confusion_matrix

def calculate_test_metrics(session, model, test_df, target_col='TARGET', prediction_col='PREDICTION'):
    """Score ``test_df`` with ``model`` and compute classification metrics.

    Args:
        session: Active Snowpark session. Its current database and schema are
            switched to DEMO_DB.PUBLIC before scoring.
        model: Fitted model whose ``predict(test_df)`` result contains
            ``prediction_col``.
        test_df: DataFrame holding the model inputs and the ``target_col`` label.
        target_col: Name of the true-label column. Defaults to 'TARGET'.
        prediction_col: Name of the predicted-label column. Defaults to
            'PREDICTION'.

    Returns:
        Tuple ``(accuracy, f1, cm)``: the accuracy score, the F1 score and the
        confusion matrix of ``prediction_col`` against ``target_col``.
    """
    # Scoring runs against DEMO_DB.PUBLIC regardless of the session's previous context.
    session.use_database('DEMO_DB')
    session.use_schema('PUBLIC')

    scored_sdf = model.predict(test_df)

    f1 = f1_score(df=scored_sdf,
                  y_true_col_names=target_col,
                  y_pred_col_names=prediction_col)

    accuracy = accuracy_score(df=scored_sdf,
                              y_true_col_names=target_col,
                              y_pred_col_names=prediction_col)

    # confusion_matrix takes singular `*_col_name` keywords, unlike the two scores above.
    cm = confusion_matrix(df=scored_sdf,
                          y_true_col_name=target_col,
                          y_pred_col_name=prediction_col)

    return accuracy, f1, cm

In [10]:
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.model_selection import GridSearchCV

# Manual grid: one classifier is fitted per (n_estimators, learning_rate) pair,
# iterating learning rates within each n_estimators value.
hyperparameter_grid = [
    (n_estimators, learning_rate)
    for n_estimators in [50,100,150]
    for learning_rate in [0.1, 0.2]
]

models = []
for n_estimators, learning_rate in hyperparameter_grid:
    model = XGBClassifier(
        input_cols=feature_cols,
        label_cols=target_col,
        output_cols='PREDICTION',
        n_estimators=n_estimators,
        learning_rate=learning_rate,
    )
    model.fit(snowdf_train)
    models.append(model)

In [None]:
# from snowflake.ml.modeling.xgboost import XGBRegressor
# from snowflake.ml.modeling.model_selection import GridSearchCV

# grid_search = GridSearchCV(
#     estimator=XGBRegressor(),
#     param_grid={
#         "n_estimators":[100, 200, 300],
#         "learning_rate":[0.1, 0.2],
#     },
#     n_jobs = -1,
#     scoring="neg_mean_absolute_percentage_error",
#     input_cols=feature_cols,
#     label_cols=target_col,
#     output_cols='PREDICTION'
# )

# # Train
# grid_search.fit(snowdf_train)

In [14]:
# Evaluate whatever classifier is currently bound to `model` on the test table.
# NOTE(review): presumably this is the last model fitted by the training loop — confirm.
# The displayed tuple is (accuracy, f1, confusion matrix).
calculate_test_metrics(session, model, snowdf_test)

(0.984944,
 0.036036036036036036,
 array([[7.992e+03, 8.000e+00],
        [1.000e+02, 2.000e+00]]))

# 2. MLOps

## 2.1 Open/Create Model Registry
A model registry must be created before it can be used. Creating it adds a new database to the current account, so the active role needs permission to create databases. Once the registry exists, it can be opened directly without creating it again.

In [5]:
import importlib
from snowflake.ml.registry import model_registry

In [7]:
# Switch the session to BAIN_TF_TASK, the database name used for the model registry below.
session.use_database('BAIN_TF_TASK')

In [9]:
registry_name = 'BAIN_TF_TASK'
schema_name = 'ML_TASK'

# The create call and the ModelRegistry handle take the same location arguments.
registry_location = dict(
    session=session,
    database_name=registry_name,
    schema_name=schema_name,
)

model_registry.create_model_registry(**registry_location)
registry = model_registry.ModelRegistry(**registry_location)



In [20]:
# Show the registry's model listing as a pandas DataFrame.
registry.list_models().to_pandas()



Unnamed: 0,CREATION_CONTEXT,CREATION_ENVIRONMENT_SPEC,CREATION_ROLE,CREATION_TIME,ID,INPUT_SPEC,NAME,OUTPUT_SPEC,RUNTIME_ENVIRONMENT_SPEC,TYPE,URI,VERSION,ARTIFACT_IDS,DESCRIPTION,METRICS,TAGS,REGISTRATION_TIMESTAMP
0,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 11:57:45.421000-07:00,12350cf050d511ee9dd80a72b796458d,,CREDIT_DEFAULT_PREDICTION_TEST_08_02_2023,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v2,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.985648,\n ""cm"": [\n [\n...",,2023-09-11 11:57:46.896000-07:00
1,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 11:59:58.735000-07:00,6265f68a50d511ee9dd80a72b796458d,,CREDIT_DEFAULT_PREDICTION_TEST_08_02_2023,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v3,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.985789,\n ""cm"": [\n [\n...",,2023-09-11 11:59:59.821000-07:00
2,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 12:01:06.114000-07:00,8b3355b250d511ee9dd80a72b796458d,,CREDIT_DEFAULT_PREDICTION_TEST_08_02_2023,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v4,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.985789,\n ""cm"": [\n [\n...",,2023-09-11 12:01:07.148000-07:00
3,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 12:02:18.877000-07:00,b67b862c50d511ee9dd80a72b796458d,,CREDIT_DEFAULT_PREDICTION_TEST_08_02_2023,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v5,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.985367,\n ""cm"": [\n [\n...",,2023-09-11 12:02:19.954000-07:00
4,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 12:03:29.279000-07:00,dfd0b72c50d511ee9dd80a72b796458d,,CREDIT_DEFAULT_PREDICTION_TEST_08_02_2023,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v6,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.985789,\n ""cm"": [\n [\n...",,2023-09-11 12:03:31.840000-07:00
5,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 12:04:34.346000-07:00,073ec6d250d611ee9dd80a72b796458d,,CREDIT_DEFAULT_PREDICTION_TEST_08_02_2023,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v7,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.984944,\n ""cm"": [\n [\n...",,2023-09-11 12:04:35.738000-07:00
6,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 12:27:22.101000-07:00,364b1ec850d911ee9dd80a72b796458d,,XGB_MODEL,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v1,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.985789,\n ""cm"": [\n [\n...",,2023-09-11 12:27:23.197000-07:00
7,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 12:30:29.961000-07:00,a62e746050d911ee9dd80a72b796458d,,XGB_MODEL,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v4,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.985789,\n ""cm"": [\n [\n...",,2023-09-11 12:30:31.165000-07:00
8,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 12:31:35.169000-07:00,cd2c1bee50d911ee9dd80a72b796458d,,XGB_MODEL,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v5,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.984944,\n ""cm"": [\n [\n...",,2023-09-11 12:31:36.366000-07:00
9,,"{\n ""python"": ""3.9.16""\n}","""ACCOUNTADMIN""",2023-09-11 12:26:17.240000-07:00,0fc308ba50d911ee9dd80a72b796458d,,XGB_MODEL,,,snowml,sfc://MODEL_REGISTRY_CREDIT.CREDIT_DEFAULT.SNO...,v0,,"""This is the demo XGBClassifier trained using ...","{\n ""accuracy"": 0.985648,\n ""cm"": [\n [\n...",,2023-09-11 12:26:18.674000-07:00


## 2.2 Log registry

In [30]:
# Load the train and test datasets from their Snowflake tables.
# These are resolved against the session's current database and schema.
snowdf_train = session.table('CREDIT_DEFAULT_TRAIN')
snowdf_test = session.table('CREDIT_DEFAULT_TEST')

In [37]:
def log_snowml_xgb_model(session, registry, model, train_df, model_version, f1, accuracy, cm, model_name="XGB_MODEL"):
    """Log ``model`` to the registry and attach its metrics and a description.

    Args:
        session: Active Snowpark session. Its current database and schema are
            switched to DEMO_DB.PUBLIC before sample data is read.
        registry: Model registry object used to log the model.
        model: Fitted model to log.
        train_df: DataFrame from which a small sample of the ``feature_cols``
            columns is taken as sample input. ``feature_cols`` is read from
            the notebook's global scope.
        model_version: Version string to register the model under.
        f1: F1 score stored as the 'f1' metric.
        accuracy: Accuracy stored as the 'accuracy' metric.
        cm: Confusion matrix stored as the 'cm' metric.
        model_name: Registry name for the model. Defaults to "XGB_MODEL".

    Returns:
        The ``ModelReference`` for the logged model version.
    """
    from snowflake.ml.registry import model_registry

    session.use_database('DEMO_DB')
    session.use_schema('PUBLIC')
    # Only ten rows are passed as sample input, so fetch just those.
    sample_input = train_df.select(feature_cols).limit(10).to_pandas()

    registry.log_model(model_name=model_name, model_version=model_version, model=model, sample_input_data=sample_input)

    registry_model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
    registry_model.set_metric(metric_name='f1', metric_value=f1)
    registry_model.set_metric(metric_name='accuracy', metric_value=accuracy)
    registry_model.set_metric(metric_name='cm', metric_value=cm)
    registry_model.set_model_description(description="This is the demo XGBClassifier trained using SnowflakeML to predict whether an applicant will default or not")
    return registry_model

In [None]:
# Evaluate every trained model on the test set and log it under version v0, v1, ...
# calculate_test_metrics returns (accuracy, f1, cm), while log_snowml_xgb_model
# takes f1 before accuracy, hence the reordering in the call.
for modelv, model in enumerate(models):
    MODEL_VERSION = "v{}".format(modelv)
    accuracy, f1, cm = calculate_test_metrics(session, model, snowdf_test)
    registry_model = log_snowml_xgb_model(session, registry, model, snowdf_train, MODEL_VERSION, f1, accuracy, cm)

In [None]:
# Show the registry listing transposed, so each registry field is a row.
registry.list_models().to_pandas().T

## 2.3 Find Best Model

In [None]:
# Rank the registered versions of one model by the 'f1' value in their METRICS, best first.
f1_metric = F.parse_json("METRICS")['f1']
ranked_models_sdf = (
    registry.list_models()
    .filter(F.col("NAME") == "CREDIT_DEFAULT_PREDICTION_TEST_08_02_2023")
    .sort(f1_metric, ascending=False)
)
df = ranked_models_sdf.to_pandas()[['NAME','VERSION','METRICS']]
df

In [None]:
# get best fitting model by querying registry
# The version lookup and the ModelReference must name the same model. Previously the
# reference used "CREDIT_DEFAULT_PREDICTION_TEST1" while the version came from
# "CREDIT_DEFAULT_PREDICTION_TEST_08_02_2023".
# NOTE(review): log_snowml_xgb_model registers models as "XGB_MODEL" — change this
# name if those are the versions that should be ranked.
best_model_name = "CREDIT_DEFAULT_PREDICTION_TEST_08_02_2023"
ranked_versions = (
    registry.list_models()
    .filter(F.col("NAME") == best_model_name)
    .sort(F.parse_json("METRICS")['f1'], ascending=False)
    .collect()
)
if not ranked_versions:
    # Fail with a clear message instead of an IndexError on an empty result.
    raise ValueError("No registered versions found for model {!r}".format(best_model_name))
best_model_version = ranked_versions[0]['VERSION']
best_model = model_registry.ModelReference(registry=registry, model_name=best_model_name, model_version=best_model_version)
print("Best model version based on F1: {}".format(best_model_version))

## 2.4 Deploy Best Model
Now, we want to deploy our best-fit model to a permanent Python UDF so that it can be integrated into a continuous data pipeline. We'll use the Snowpark model registry's deployment framework to do exactly that with just one line of code:

In [None]:
# Drop the FRAUD_DETECTION_XGB_DEMO_UDF(OBJECT) function if it exists.
# NOTE(review): the deploy cell uses deployment_name
# "FRAUD_DETECTION_XGB_DEMO_UDF_MLREGISTRY", which is a different name — confirm
# this drop targets the intended function.
session.sql("drop function if exists FRAUD_DETECTION_XGB_DEMO_UDF(OBJECT)").collect()

In [None]:
# Deployment options: the stage location for the permanent UDF plus the
# local-snowml and relaxed-version flags.
deploy_options = {
    "_use_local_snowml": True,
    "permanent_udf_stage_location": "@ML_MODELS/fraud_detection",
    "relax_version": True,
}

# Deploy the selected model's `predict` method as a permanent deployment.
best_model.deploy(
    deployment_name="FRAUD_DETECTION_XGB_DEMO_UDF_MLREGISTRY",
    target_method="predict",
    permanent=True,
    options=deploy_options,
)