In [None]:
# !conda install snowflake-ml-python --y

In [6]:
# %pip install snowflake-ml-python --index-url https://repo.anaconda.com/pkgs/snowflake

## Imports

In [5]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType, FloatType, StringType,StructType, StructField, DecimalType

import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.registry import registry

import json

import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [7]:
# Locate the Snowflake credentials JSON. Set SNOWFLAKE_CREDS_PATH to override the
# location (e.g. f'{my_dir}/creds.json'); otherwise the original path is used.
my_dir = os.getcwd()
creds_path = os.environ.get('SNOWFLAKE_CREDS_PATH', '/Users/skhara/Documents/Code/creds.json')

# Read the connection parameters inside a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open(creds_path) as creds_file:
    connection_parameters = json.load(creds_file)

# Open the Snowpark session used by every cell below.
session = Session.builder.configs(connection_parameters).create()

In [11]:
# Select the warehouse, database, and schema that the session uses for the statements that follow.
session.use_warehouse('SSK_RESEARCH') # Pick a known Warehouse
session.use_database('DEMO_DB') # Pick a DB
session.use_schema('PUBLIC') # Pick a Schema

# 0.0 Load Data

In [12]:
# Read both source CSV files locally.
df_credit = pd.read_csv("credit_record.csv")
df_application = pd.read_csv("application_record.csv")

# Upload each frame to its own table using the same write options:
# create the table if needed, keep identifiers unquoted, and replace existing data.
write_options = dict(auto_create_table=True, quote_identifiers=False, overwrite=True)
session.write_pandas(df_credit, 'CREDIT_RECORD', **write_options)
session.write_pandas(df_application, 'APPLICATION_RECORD', **write_options)

<snowflake.snowpark.table.Table at 0x16710a1f0>

## 1.0 Prepare Data 

In [13]:
# Snowpark DataFrames backed by the two uploaded tables
application_record_sdf = session.table('APPLICATION_RECORD')
credit_record_sdf = session.table('CREDIT_RECORD')

# Row counts are computed first, then reported in a single print call.
n_application_rows = application_record_sdf.count()
n_credit_rows = credit_record_sdf.count()
print('Application table size\t: ', n_application_rows, '\nCredit table size\t: ', n_credit_rows)

Application table size	:  438557 
Credit table size	:  1048575


In [14]:
# Preview the first five credit records as a pandas DataFrame.
credit_record_sdf.limit(5).to_pandas()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [15]:
# Derive one TARGET value per ID: 1 (high-risk) when at least one of the ID's rows
# has STATUS in {'2', '3', '4', '5'}, otherwise 0 (low-risk).
late_statuses = ['2', '3','4','5']
late_row_flag = F.iff(F.col('STATUS').in_(late_statuses), 1, 0)

# Count the flagged rows per ID ...
late_counts_sdf = credit_record_sdf.group_by('ID').agg(F.sum(late_row_flag).as_("CNT_LATE"))

# ... then turn the count into the binary label and drop the helper column.
risk_label = F.when(F.col('CNT_LATE') > 0, 1).otherwise(0)
credit_record_sdf = late_counts_sdf.with_column('TARGET', risk_label).drop("CNT_LATE")

In [16]:
# Join Credit Record data with Application Record Data.
# Inner join on ID: only IDs present in both DataFrames are kept.
joined_sdf = application_record_sdf.join(credit_record_sdf, using_columns='ID', join_type='inner')

In [17]:
# Duplicate removal: drop_duplicates is restricted to the ID column, so rows are
# compared on ID only.
# NOTE(review): which row survives when several share an ID is not controlled here.
joined_sdf = joined_sdf.drop_duplicates('ID')

In [18]:
# Number of rows in joined_sdf at this point in the notebook.
joined_sdf.count()

36457

In [19]:
# Columns kept for modeling, grouped by how they are treated downstream
cols_numerical = ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'CNT_FAM_MEMBERS']
cols_categorical = ['CODE_GENDER', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
target = ['TARGET']

# Project onto numerical + categorical + target, in that order.
selected_columns = cols_numerical + cols_categorical + target
joined_sdf = joined_sdf.select(selected_columns)

In [20]:
# Preview ten rows of the selected modeling columns as a pandas DataFrame.
joined_sdf.limit(10).to_pandas()

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_EMPLOYED,FLAG_MOBIL,CNT_FAM_MEMBERS,CODE_GENDER,NAME_HOUSING_TYPE,OCCUPATION_TYPE,TARGET
0,94500.0,365243,1,1,F,House / apartment,,0
1,90000.0,-735,1,4,M,House / apartment,Laborers,0
2,166500.0,-1046,1,3,M,House / apartment,Core staff,0
3,675000.0,-1175,1,4,F,House / apartment,HR staff,0
4,123750.0,-599,1,2,F,House / apartment,Cleaning staff,0
5,135000.0,-95,1,1,F,House / apartment,Sales staff,0
6,225000.0,-1111,1,2,M,House / apartment,,0
7,90000.0,365243,1,1,F,House / apartment,,0
8,180000.0,-643,1,2,M,House / apartment,High skill tech staff,0
9,67500.0,-8691,1,2,F,With parents,Medicine staff,0


In [21]:
# One-hot encoder for the categorical columns; the source columns are dropped
# once their encoded counterparts have been produced.
ohe_step = snowml.OneHotEncoder(
    input_cols=cols_categorical,
    output_cols=cols_categorical,
    drop_input_cols=True,
)

# Single-step pipeline wrapping the encoder
preprocessing_pipeline = Pipeline(steps=[("OHE", ohe_step)])

# Fit on the joined data, apply it to the same data, and preview five rows.
fitted_pipeline = preprocessing_pipeline.fit(joined_sdf)
transformed_df = fitted_pipeline.transform(joined_sdf)
transformed_df.limit(5).to_pandas()

Unnamed: 0,CODE_GENDER_F,CODE_GENDER_M,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,...,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,OCCUPATION_TYPE_None,AMT_INCOME_TOTAL,DAYS_EMPLOYED,FLAG_MOBIL,CNT_FAM_MEMBERS,TARGET
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,135000.0,-777,1,2,0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,157500.0,-367,1,3,0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,180000.0,-4354,1,2,0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,171000.0,-8405,1,1,0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,315000.0,-806,1,2,0


In [22]:
# List the column names of the transformed DataFrame (encoded columns plus the columns passed through unchanged).
transformed_df.columns

['CODE_GENDER_F',
 'CODE_GENDER_M',
 '"NAME_HOUSING_TYPE_Co-op apartment"',
 '"NAME_HOUSING_TYPE_House / apartment"',
 '"NAME_HOUSING_TYPE_Municipal apartment"',
 '"NAME_HOUSING_TYPE_Office apartment"',
 '"NAME_HOUSING_TYPE_Rented apartment"',
 '"NAME_HOUSING_TYPE_With parents"',
 '"OCCUPATION_TYPE_Accountants"',
 '"OCCUPATION_TYPE_Cleaning staff"',
 '"OCCUPATION_TYPE_Cooking staff"',
 '"OCCUPATION_TYPE_Core staff"',
 '"OCCUPATION_TYPE_Drivers"',
 '"OCCUPATION_TYPE_HR staff"',
 '"OCCUPATION_TYPE_High skill tech staff"',
 '"OCCUPATION_TYPE_IT staff"',
 '"OCCUPATION_TYPE_Laborers"',
 '"OCCUPATION_TYPE_Low-skill Laborers"',
 '"OCCUPATION_TYPE_Managers"',
 '"OCCUPATION_TYPE_Medicine staff"',
 '"OCCUPATION_TYPE_Private service staff"',
 '"OCCUPATION_TYPE_Realty agents"',
 '"OCCUPATION_TYPE_Sales staff"',
 '"OCCUPATION_TYPE_Secretaries"',
 '"OCCUPATION_TYPE_Security staff"',
 '"OCCUPATION_TYPE_Waiters/barmen staff"',
 '"OCCUPATION_TYPE_None"',
 'AMT_INCOME_TOTAL',
 'DAYS_EMPLOYED',
 'FLAG_MO

In [23]:
# Normalise column names for easier referencing later: strip every character
# that is not a letter, digit, or underscore, then upper-case the result.
import re

non_identifier_chars = re.compile(r'[^a-zA-Z0-9_]')

# The column list is read once, before any rename is applied.
for original_name in transformed_df.columns:
    cleaned_name = non_identifier_chars.sub('', original_name).upper()
    transformed_df = transformed_df.rename(col(original_name), cleaned_name)

In [24]:
# 80/20 random split with a fixed seed so the partition is repeatable
snowdf_train, snowdf_test = transformed_df.random_split([0.8, 0.2], seed=82)

# Persist each partition to its own Snowflake table, replacing earlier contents.
for split_sdf, table_name in (
    (snowdf_train, "CREDIT_DEFAULT_TRAIN"),
    (snowdf_test, "CREDIT_DEFAULT_TEST"),
):
    split_sdf.write.mode("overwrite").save_as_table(table_name)

## 2.0 ML Modeling

In [25]:
# Resize the SSK_RESEARCH warehouse to LARGE, then re-select it for the session.
# NOTE(review): nothing later in the visible notebook scales the warehouse back down — confirm that is intended.
session.sql('ALTER WAREHOUSE SSK_RESEARCH SET WAREHOUSE_SIZE = "LARGE"').collect()
session.use_warehouse('SSK_RESEARCH')

In [26]:
# Prepare data for modeling: reload the persisted training table and separate
# the label column from the feature columns.
target_col = 'TARGET'
snowdf_train = session.table('DEMO_DB.PUBLIC.CREDIT_DEFAULT_TRAIN')

# Every column except the label is used as a feature.
feature_cols = list(snowdf_train.columns)
feature_cols.remove(target_col)

In [27]:
from snowflake.ml.modeling.xgboost import XGBClassifier

# Classifier configured with the feature columns as inputs, TARGET as the label,
# and PREDICTION as the output column; random_state is fixed for repeatability.
xgbmodel = XGBClassifier(
    input_cols=feature_cols,
    label_cols=target_col,
    output_cols='PREDICTION',
    random_state=123,
)

# Fit on the training table (the fitted estimator is the cell's displayed value).
xgbmodel.fit(snowdf_train)

The version of package 'snowflake-snowpark-python' in the local environment is 1.12.1, which does not fit the criteria for the requirement 'snowflake-snowpark-python'. Your UDF might not work when the package version is different between the server and your local environment.


<snowflake.ml.modeling.xgboost.xgb_classifier.XGBClassifier at 0x1674ef6a0>

In [28]:
# Reload the held-out table and add the model's PREDICTION column to it.
snowdf_test = session.table('CREDIT_DEFAULT_TEST')
scored_sdf = xgbmodel.predict(snowdf_test)

# Report how many rows are in the test set.
n_test_rows = snowdf_test.count()
print(n_test_rows)

7112


In [29]:
from snowflake.ml.modeling.metrics import f1_score, accuracy_score

# Both metrics compare the same truth/prediction columns of the scored frame.
metric_inputs = dict(
    df=scored_sdf,
    y_true_col_names='TARGET',
    y_pred_col_names='PREDICTION',
)
F1 = f1_score(**metric_inputs)
ACCURACY = accuracy_score(**metric_inputs)

# NOTE: in the recorded run accuracy is high while F1 is very low, so read both together.
print(f'F1 Score: {F1} \nAccuracy Score: {ACCURACY}')

DataFrame.flatten() is deprecated since 0.7.0. Use `DataFrame.join_table_function()` instead.


F1 Score: 0.028368794326241134 
Accuracy Score: 0.980737


### Now, let's use Snowpark ML's Distributed GridSearchCV() function to find optimal model parameters

In [None]:
from snowflake.ml.modeling.xgboost import XGBClassifier

# Hyper-parameter search for the credit-default classifier.
# Candidates are ranked by F1, a classification metric that matches the F1 score
# reported for the baseline model. A regression metric such as MAPE is not
# meaningful for a 0/1 label.
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid={
        "n_estimators":[100, 200],
        "learning_rate":[0.3],
        "max_depth": [1,3]
    },
    n_jobs = 1,
    scoring="f1",
    input_cols=feature_cols,
    label_cols=target_col,
    output_cols='PREDICTION'
)

# Train: fits every parameter combination on the training table
grid_search.fit(snowdf_train)

In [30]:
# Location of the model registry
db, schema = 'DEMO_DB', 'CREDIT_APPROVAL'

# Open the registry at that location and list the models it already contains.
native_registry = registry.Registry(
    session=session,
    database_name=db,
    schema_name=schema,
)
native_registry.show_models()

Unnamed: 0,created_on,name,database_name,schema_name,comment,owner,default_version_name,versions
0,2024-01-23 18:49:21.897000-08:00,DEMO_CREDIT_XGB,DEMO_DB,CREDIT_APPROVAL,,ACCOUNTADMIN,V0,"[""V0"",""V1"",""V2_2024_02_14"",""V3_2024_02_14"",""V4..."
1,2024-03-07 12:44:09.373000-08:00,TEST_DELETE_LATER,DEMO_DB,CREDIT_APPROVAL,,ACCOUNTADMIN,V1_2024_03_07,"[""V1_2024_03_07""]"


In [31]:
# Define the model name and a date-stamped version name (V1_YYYY_MM_DD).
# pd.Timestamp.now() is used because the `pd.datetime` alias no longer exists in
# current pandas releases.
model_name = "DEMO_CREDIT_XGB"
model_version = f"V1_{pd.Timestamp.now().strftime('%Y_%m_%d')}"
# model_version = "V0"

# Log the fitted XGBClassifier (xgbmodel) to the registry under that name and version.
model_ver = native_registry.log_model(
    model_name= model_name,
    version_name= model_version,
    model= xgbmodel
)

In [32]:
# Reload the held-out table and look the logged model up again by name.
snowdf_test = session.table('CREDIT_DEFAULT_TEST')
model_name = "DEMO_CREDIT_XGB"
# model_version = 'V0'

# Resolve the version chosen when the model was logged, then list the functions it exposes.
registered_model = native_registry.get_model(model_name)
model_ver = registered_model.version(model_version)
model_ver.show_functions()


[{'name': 'PREDICT',
  'target_method': 'predict',
  'signature': ModelSignature(
                      inputs=[
                          FeatureSpec(dtype=DataType.DOUBLE, name='CODE_GENDER_F'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='CODE_GENDER_M'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='NAME_HOUSING_TYPE_COOPAPARTMENT'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='NAME_HOUSING_TYPE_HOUSEAPARTMENT'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='NAME_HOUSING_TYPE_MUNICIPALAPARTMENT'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='NAME_HOUSING_TYPE_OFFICEAPARTMENT'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='NAME_HOUSING_TYPE_RENTEDAPARTMENT'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='NAME_HOUSING_TYPE_WITHPARENTS'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='OCCUPATION_TYPE_ACCOUNTANTS'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='OCCUPATION_TYPE_CLEANINGSTAFF'),
  		FeatureSpec(dtype=DataType.DOUBLE, name='OCCUPATION_TYPE_COOKINGSTAFF'),
  		FeatureSpec(dtype=

In [33]:
# Invoke the registered model version's "predict" function on a Snowpark DataFrame.
# NOTE(review): this scores snowdf_train although snowdf_test was reloaded in the previous cell — confirm which set is intended.
result_sdf2 = model_ver.run(snowdf_train, function_name="predict")

In [34]:
# Persist the scored rows to the RESULTS table, replacing any existing contents.
result_sdf2.write.mode("overwrite").save_as_table("RESULTS")

# Want to Deploy and Schedule Your Code?

In [None]:
from snowflake.core import Root
from snowflake.core.task import StoredProcedureCall, Task
from snowflake.core.task.dagv1 import DAGOperation, DAG, DAGTask
from datetime import date, timedelta

In [None]:
# Root handle built from the existing session; used below to reach databases, schemas, and tasks.
api_root = Root(session)

In [None]:
def func_step1(session:Session):
    """Build the modeling DataFrame from APPLICATION_RECORD and CREDIT_RECORD.

    Switches the session to DEMO_DB.PUBLIC, derives a per-ID TARGET flag
    (1 = high-risk, 0 = low-risk) from the credit STATUS values, inner-joins it
    to the application records on ID, de-duplicates on ID, and keeps only the
    columns used for modeling.

    Args:
        session: Snowpark session used to read the source tables.

    Returns:
        Snowpark DataFrame holding the numerical columns (including TARGET)
        followed by the categorical columns.
    """
    session.use_database('DEMO_DB')
    session.use_schema('PUBLIC')

    # Source tables
    applications = session.table('APPLICATION_RECORD')
    credit_history = session.table('CREDIT_RECORD')

    # Per ID, count rows whose STATUS is one of '2'..'5'; any such row marks the ID high-risk.
    late_row_flag = F.iff(F.col('STATUS').in_(['2', '3','4','5']), 1, 0)
    late_counts = credit_history.group_by('ID').agg(F.sum(late_row_flag).as_("CNT_LATE"))
    risk_label = F.when(F.col('CNT_LATE') > 0, 1).otherwise(0)
    risk_by_id = late_counts.with_column('TARGET', risk_label).drop("CNT_LATE")

    # Attach the label to the application data, comparing rows on ID only when de-duplicating.
    labelled = applications.join(risk_by_id, using_columns='ID', join_type='inner')
    labelled = labelled.drop_duplicates('ID')

    # Keep only the columns used for modeling.
    cols_numerical = ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'CNT_FAM_MEMBERS', 'TARGET']
    cols_categorical = ['CODE_GENDER', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
    return labelled[cols_numerical+cols_categorical]

In [None]:
def func_data_preprocess_pipeline(session:Session) -> str:
    """Prepare the credit-default data and save it as PIPE_CREDIT_DEFAULT.

    Builds the joined modeling DataFrame with ``func_step1``, one-hot encodes the
    categorical columns, normalises the resulting column names (non-alphanumeric
    characters removed, upper-cased), and overwrites the PIPE_CREDIT_DEFAULT table.

    Args:
        session: Snowpark session the procedure runs with.

    Returns:
        A fixed success message.
    """
    import re

    import snowflake.snowpark.functions as F
    from snowflake.ml.modeling.preprocessing import OneHotEncoder

    # Declared locally so the procedure does not rely on notebook-level globals.
    cols_categorical = ['CODE_GENDER', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']

    # func_step1 needs the live session instance, not the Session class.
    joined_sdf = func_step1(session)

    # Perform One-Hot-Encoding for categorical columns
    my_ohe_encoder = OneHotEncoder(input_cols=cols_categorical, output_cols=cols_categorical, drop_input_cols=True)
    prepared_sdf = my_ohe_encoder.fit(joined_sdf).transform(joined_sdf)

    # Clean the column names: keep only [A-Za-z0-9_] and upper-case the result.
    # The column list is read once, before any rename is applied.
    for old_col in prepared_sdf.columns:
        new_col = re.sub(r'[^a-zA-Z0-9_]', '', old_col).upper()
        prepared_sdf = prepared_sdf.rename(F.col(old_col), new_col)

    # Save the data as table in Snowflake
    prepared_sdf.write.mode("overwrite").save_as_table("PIPE_CREDIT_DEFAULT")

    return "Preprocessing pipeline ran successfully"

In [None]:
def func_inference_pipeline(session:Session) -> str:
    """Score PIPE_CREDIT_DEFAULT with the default version of DEMO_CREDIT_XGB.

    Opens the model registry in DEMO_DB.CREDIT_APPROVAL, runs the default model
    version's "predict" function on the preprocessed table, and overwrites the
    SCORED_CREDIT_DEFAULT table with the result.

    Args:
        session: Snowpark session the procedure runs with.

    Returns:
        A fixed success message.
    """
    # Imported under an alias so the module name is not shadowed by the registry instance.
    from snowflake.ml.registry import registry as ml_registry

    # The registry location is pinned locally so the procedure does not depend on
    # whatever values notebook-level variables hold when it is registered.
    registry_database = 'DEMO_DB'
    registry_schema = 'CREDIT_APPROVAL'
    model_registry = ml_registry.Registry(
        session=session,
        database_name=registry_database,
        schema_name=registry_schema,
    )

    # Get data
    snowdf_test = session.table('PIPE_CREDIT_DEFAULT')
    model_name = "DEMO_CREDIT_XGB"

    # Score with the model's default version and persist the output.
    model_ver = model_registry.get_model(model_name).default
    result_sdf = model_ver.run(snowdf_test, function_name="predict")
    result_sdf.write.mode("overwrite").save_as_table("SCORED_CREDIT_DEFAULT")
    return "Inference pipeline ran successfully"

#### Setup Tasks

In [None]:
# Task collection of DEMO_DB.PUBLIC. The schema resource gets its own name so the
# string variable `schema` ('CREDIT_APPROVAL', the model-registry schema) defined
# earlier in the notebook is not overwritten.
public_schema = api_root.databases['DEMO_DB'].schemas['PUBLIC']
tasks = public_schema.tasks

In [None]:
# Task 1: run the preprocessing procedure on SSK_RESEARCH with a one-day schedule.
preprocess_call = StoredProcedureCall(
    func_data_preprocess_pipeline,
    stage_location="@ML_MODELS",
    packages=["snowflake-snowpark-python","snowflake-ml-python", "regex"],
)
task1_entity = Task(
    "preprocess_pipeline",
    definition=preprocess_call,
    warehouse='SSK_RESEARCH',
    schedule=timedelta(days=1),
)

# Create the task, replacing any existing task of the same name.
task1 = tasks.create(task1_entity, mode="orReplace")

In [None]:
# Task 2: run the inference procedure after the preprocessing task.
# The procedure imports snowflake.ml.registry, so snowflake-ml-python must be
# listed alongside snowflake-snowpark-python in the procedure's packages.
task2_entity = Task(
    "inference_pipeline",
    definition=StoredProcedureCall(
        func_inference_pipeline,
        stage_location="@ML_MODELS",
        packages=["snowflake-snowpark-python", "snowflake-ml-python"],
    ),
    warehouse='SSK_RESEARCH')

# No schedule of its own: it is chained to the preprocessing task as its predecessor.
task2_entity.predecessors = ["DEMO_DB.PUBLIC.PREPROCESS_PIPELINE"]
task2 = tasks.create(task2_entity, mode="orReplace")

In [None]:
# Resume the dependent task (task2) first, then the scheduled task (task1).
# NOTE(review): presumably the child must be resumed before the root task starts firing — confirm against the Snowflake task-graph documentation.
task2.resume()
task1.resume()