## Imports

In [None]:
!conda install snowflake-ml-python --y

In [None]:
!conda install snowflake-snowpark-python --y

In [1]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType, FloatType, StringType,StructType, StructField, DecimalType

import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.registry import registry

import json

import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# !pip install snowflake-ml-python --index-url https://repo.anaconda.com/pkgs/snowflake

In [34]:
# Locate the credentials file without a machine-specific absolute path:
# honour SNOWFLAKE_CREDS_PATH when set, otherwise expect creds.json in the
# notebook's working directory.
my_dir = os.getcwd()
creds_path = os.environ.get('SNOWFLAKE_CREDS_PATH', os.path.join(my_dir, 'creds.json'))

# The context manager closes the file handle as soon as the JSON is parsed.
with open(creds_path) as creds_file:
    connection_parameters = json.load(creds_file)

# Open the Snowpark session used by every cell below.
session = Session.builder.configs(connection_parameters).create()

# <font color='red'>Snowpark with Big Data</font>

In [4]:
# Run the queries of this section on the ML_WORKLOADS warehouse.
session.use_warehouse('ML_WORKLOADS')

In [5]:
# Bind the fully qualified TRIPS table to a Snowpark Table object.
snow_df = session.table('CITIBIKEML_JACK.DEMO.TRIPS')

In [6]:
# Bare expression: the notebook shows the Table object's repr, not its rows.
snow_df

<snowflake.snowpark.table.Table at 0x16c5bb640>

In [7]:
%%time
# Pull only the first 5 rows into pandas for a quick preview.
snow_df.limit(5).to_pandas()

CPU times: user 12.7 ms, sys: 5.48 ms, total: 18.2 ms
Wall time: 3.5 s


Unnamed: 0,START_STATION_NAME,START_STATION_ID,END_STATION_NAME,END_STATION_ID,START_STATION_LATITUDE,START_STATION_LONGITUDE,END_STATION_LATITUDE,END_STATION_LONGITUDE,USERTYPE,STARTTIME,STOPTIME
0,Cleveland Pl & Spring St,151,Great Jones St,229,40.722103786686034,-73.99724900722504,40.72743423,-73.99379025,Subscriber,2015-12-18 20:55:35,2015-12-18 20:58:07
1,Cleveland Pl & Spring St,151,W 17 St & 8 Ave,116,40.722103786686034,-73.99724900722504,40.74177603,-74.00149746,Subscriber,2015-12-20 14:19:38,2015-12-20 14:33:54
2,Cleveland Pl & Spring St,151,6 Ave & Canal St,377,40.722103786686034,-73.99724900722504,40.72243797,-74.00566443,Subscriber,2015-12-20 19:00:11,2015-12-20 19:04:43
3,Cleveland Pl & Spring St,151,Allen St & Rivington St,401,40.722103786686034,-73.99724900722504,40.72019576,-73.98997825,Subscriber,2015-12-29 17:33:42,2015-12-29 17:39:57
4,Cleveland Pl & Spring St,151,Washington St & Gansevoort St,405,40.722103786686034,-73.99724900722504,40.739323,-74.008119,Subscriber,2015-12-30 13:56:20,2015-12-30 14:11:19


In [8]:
%%time
# Total number of rows in the TRIPS table.
print('Size of the Snowpark DF: ', snow_df.count())

Size of the Snowpark DF:  103835123
CPU times: user 3.51 ms, sys: 1.79 ms, total: 5.29 ms
Wall time: 2.37 s


In [None]:
# # Retrieve the last query ID directly using Snowpark's session object
# # This uses the LAST_QUERY_ID() function which gives the ID of the last query executed in the session
# last_query_id = session.sql("SELECT LAST_QUERY_ID()").collect()[0][0]
# print(f"The last query ID is: {last_query_id}")

In [13]:
%%time
pandas_df = snow_df.with_column("date", F.to_date("STARTTIME")).group_by("date").count().sort("date").to_pandas()

CPU times: user 10.3 ms, sys: 2.41 ms, total: 12.7 ms
Wall time: 437 ms


In [12]:
# Number of rows in the daily aggregate (one row per distinct date).
len(pandas_df)

2641

In [None]:
# Line chart of the daily trip counts on a 15x8 inch canvas.
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.lineplot(data=pandas_df, x='DATE', y='COUNT', ax=ax)

#  
#  
# 
 
# <font color='red'>Snowpark for ML</font>

## 1.0 Prepare Data 

In [14]:
# Resize the SSK_RESEARCH warehouse to LARGE, then make it the session's
# active warehouse for the data-preparation steps.
session.sql('ALTER WAREHOUSE SSK_RESEARCH SET WAREHOUSE_SIZE = "LARGE"').collect()
session.use_warehouse('SSK_RESEARCH')

In [15]:
# Make DEMO_DB.PUBLIC the session's current database and schema, so the
# unqualified table names used below are looked up there.
session.use_database('DEMO_DB')
session.use_schema('PUBLIC')

In [16]:
# Bind the two source tables and report how many rows each one holds.
application_record_sdf = session.table('APPLICATION_RECORD')
credit_record_sdf = session.table('CREDIT_RECORD')

n_application_rows = application_record_sdf.count()
n_credit_rows = credit_record_sdf.count()
print('Application table size\t: ', n_application_rows,
      '\nCredit table size\t: ', n_credit_rows)

Application table size	:  438557 
Credit table size	:  1048575


In [18]:
# Preview the first 5 credit-record rows.
credit_record_sdf.limit(5).to_pandas()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [19]:
# Collapse the credit history to one row per ID with a binary TARGET:
# 1 (high-risk) when the ID has at least one STATUS in {'2','3','4','5'},
# otherwise 0 (low-risk). CNT_LATE is only a helper column and is dropped.
late_flag = F.iff(F.col('STATUS').in_(['2', '3', '4', '5']), 1, 0)
credit_record_sdf = (
    credit_record_sdf
    .group_by('ID')
    .agg(F.sum(late_flag).as_("CNT_LATE"))
    .with_column('TARGET', F.when(F.col('CNT_LATE') > 0, 1).otherwise(0))
    .drop("CNT_LATE")
)

In [20]:
# Inner-join the application records to the per-ID TARGET labels on ID;
# IDs present in only one of the two inputs are dropped by the inner join.
joined_sdf = application_record_sdf.join(credit_record_sdf, using_columns='ID', join_type='inner')

In [21]:
# Duplicate removal: de-duplicate on the ID column with drop_duplicates.
joined_sdf = joined_sdf.drop_duplicates('ID')

In [22]:
# Row count after the join and the de-duplication.
joined_sdf.count()

36457

In [23]:
# Restrict the frame to the modeling columns: numeric features, categorical
# features (one-hot encoded later) and the label.
cols_numerical = ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'CNT_FAM_MEMBERS']
cols_categorical = ['CODE_GENDER', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
target = ['TARGET']
joined_sdf = joined_sdf.select(cols_numerical + cols_categorical + target)

In [24]:
# Preview the first 10 rows of the selected modeling columns.
joined_sdf.limit(10).to_pandas()

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_EMPLOYED,FLAG_MOBIL,CNT_FAM_MEMBERS,CODE_GENDER,NAME_HOUSING_TYPE,OCCUPATION_TYPE,TARGET
0,90000.0,365243,1,2,F,House / apartment,,0
1,135000.0,-5507,1,2,F,House / apartment,,0
2,135000.0,-397,1,3,F,House / apartment,Accountants,0
3,216000.0,-1457,1,3,M,Rented apartment,Laborers,0
4,225000.0,-5064,1,2,M,House / apartment,Laborers,0
5,180000.0,-1892,1,3,M,House / apartment,Laborers,0
6,90000.0,-1508,1,2,F,House / apartment,Medicine staff,0
7,360000.0,-4023,1,2,F,House / apartment,Managers,0
8,121500.0,365243,1,3,F,House / apartment,,0
9,202500.0,-261,1,3,F,House / apartment,Medicine staff,0


In [25]:
# One-hot encode the categorical columns; the encoder writes its outputs under
# the same base names and drops the original input columns.
one_hot_encoder = snowml.OneHotEncoder(
    input_cols=cols_categorical,
    output_cols=cols_categorical,
    drop_input_cols=True,
)

# Single-step pipeline wrapping the encoder.
preprocessing_pipeline = Pipeline(steps=[("OHE", one_hot_encoder)])

# Fit on the joined data, apply the transform to the same data, and preview.
preprocessing_pipeline.fit(joined_sdf)
transformed_df = preprocessing_pipeline.transform(joined_sdf)
transformed_df.limit(5).to_pandas()

Unnamed: 0,CODE_GENDER_F,CODE_GENDER_M,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,...,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,OCCUPATION_TYPE_None,AMT_INCOME_TOTAL,DAYS_EMPLOYED,FLAG_MOBIL,CNT_FAM_MEMBERS,TARGET
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,450000.0,-2470,1,2,0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,135000.0,-777,1,2,0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,270000.0,-2202,1,2,0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,180000.0,-4354,1,2,0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,180000.0,-8923,1,3,0


In [26]:
# List the column names after encoding; names containing spaces or
# punctuation are returned wrapped in double quotes.
transformed_df.columns

['CODE_GENDER_F',
 'CODE_GENDER_M',
 '"NAME_HOUSING_TYPE_Co-op apartment"',
 '"NAME_HOUSING_TYPE_House / apartment"',
 '"NAME_HOUSING_TYPE_Municipal apartment"',
 '"NAME_HOUSING_TYPE_Office apartment"',
 '"NAME_HOUSING_TYPE_Rented apartment"',
 '"NAME_HOUSING_TYPE_With parents"',
 '"OCCUPATION_TYPE_Accountants"',
 '"OCCUPATION_TYPE_Cleaning staff"',
 '"OCCUPATION_TYPE_Cooking staff"',
 '"OCCUPATION_TYPE_Core staff"',
 '"OCCUPATION_TYPE_Drivers"',
 '"OCCUPATION_TYPE_HR staff"',
 '"OCCUPATION_TYPE_High skill tech staff"',
 '"OCCUPATION_TYPE_IT staff"',
 '"OCCUPATION_TYPE_Laborers"',
 '"OCCUPATION_TYPE_Low-skill Laborers"',
 '"OCCUPATION_TYPE_Managers"',
 '"OCCUPATION_TYPE_Medicine staff"',
 '"OCCUPATION_TYPE_Private service staff"',
 '"OCCUPATION_TYPE_Realty agents"',
 '"OCCUPATION_TYPE_Sales staff"',
 '"OCCUPATION_TYPE_Secretaries"',
 '"OCCUPATION_TYPE_Security staff"',
 '"OCCUPATION_TYPE_Waiters/barmen staff"',
 '"OCCUPATION_TYPE_None"',
 'AMT_INCOME_TOTAL',
 'DAYS_EMPLOYED',
 'FLAG_MO

In [27]:
# Normalise column names for easier referencing: strip every character that is
# not a letter, digit or underscore (quotes, spaces, '/', '-'), then upper-case.
import re

cols = transformed_df.columns
for original_name in cols:
    cleaned_name = re.sub(r'[^a-zA-Z0-9_]', '', original_name).upper()
    transformed_df = transformed_df.rename(col(original_name), cleaned_name)

In [28]:
# 80/20 random split with a fixed seed; each part is saved as a Snowflake
# table, replacing any existing table of the same name.
snowdf_train, snowdf_test = transformed_df.random_split([0.8, 0.2], seed=82)
for split_sdf, table_name in (
    (snowdf_train, "CREDIT_DEFAULT_TRAIN"),
    (snowdf_test, "CREDIT_DEFAULT_TEST"),
):
    split_sdf.write.mode("overwrite").save_as_table(table_name)

## 2.0 ML Modeling

In [35]:
# Resize the SSK_RESEARCH warehouse to LARGE, then make it the session's
# active warehouse for model training.
session.sql('ALTER WAREHOUSE SSK_RESEARCH SET WAREHOUSE_SIZE = "LARGE"').collect()
session.use_warehouse('SSK_RESEARCH')

In [36]:
# Reload the persisted training table; every column except the label is used
# as a model feature.
target_col = 'TARGET'
snowdf_train = session.table('DEMO_DB.PUBLIC.CREDIT_DEFAULT_TRAIN')
feature_cols = snowdf_train.columns
feature_cols.remove(target_col)

In [37]:
# Configure an XGBoost classifier over the feature columns, with TARGET as the
# label and predictions written to a PREDICTION column, then fit it.
from snowflake.ml.modeling.xgboost import XGBClassifier

xgbmodel = XGBClassifier(
    random_state=123,
    input_cols=feature_cols,
    label_cols=target_col,
    output_cols='PREDICTION',
)
xgbmodel.fit(snowdf_train)

<snowflake.ml.modeling.xgboost.xgb_classifier.XGBClassifier at 0x322837070>

In [None]:
# Score the persisted test table with the fitted xgbmodel; the scored frame
# is kept in scored_sdf for the metric cell.
snowdf_test = session.table('CREDIT_DEFAULT_TEST')
scored_sdf = xgbmodel.predict(snowdf_test)
# Number of rows in the test table.
print(snowdf_test.count())

In [None]:
# Compare the PREDICTION column against TARGET on the scored test set.
from snowflake.ml.modeling.metrics import f1_score, accuracy_score

metric_args = dict(
    df=scored_sdf,
    y_true_col_names='TARGET',
    y_pred_col_names='PREDICTION',
)
F1 = f1_score(**metric_args)
ACCURACY = accuracy_score(**metric_args)
print(f'F1 Score: {F1} \nAccuracy Score: {ACCURACY}')

### Now, let's use Snowpark ML's Distributed GridSearchCV() function to find optimal model parameters

In [None]:
from snowflake.ml.modeling.xgboost import XGBClassifier

# Grid search over 2 x 1 x 2 = 4 XGBoost configurations.
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid={
        "n_estimators": [100, 200],
        "learning_rate": [0.3],
        "max_depth": [1, 3],
    },
    n_jobs=1,
    # TARGET is a binary class label, so candidates are ranked with a
    # classification metric. The previous "neg_mean_absolute_percentage_error"
    # is a regression metric that divides by the true value, which is
    # meaningless for 0/1 labels. F1 matches the metric reported earlier.
    scoring="f1",
    input_cols=feature_cols,
    label_cols=target_col,
    output_cols='PREDICTION',
)

# Train every candidate on the training table.
grid_search.fit(snowdf_train)

In [None]:
# Location of the model registry (database and schema names).
db = 'DEMO_DB'
schema = 'CREDIT_APPROVAL'

# Open the registry at that location and list the models it already holds.
native_registry = registry.Registry(session=session, database_name=db, schema_name=schema)
native_registry.show_models()

In [None]:
# Model name and a date-stamped version label (V1_YYYY_MM_DD).
# pd.datetime was removed in pandas 2.0 and raises AttributeError there;
# pd.Timestamp.now() produces the same date stamp.
model_name = "TEST_DELETE_LATER"
model_version = f"V1_{pd.Timestamp.now().strftime('%Y_%m_%d')}"

# Log the fitted xgbmodel to the registry under that name and version.
model_ver = native_registry.log_model(
    model_name=model_name,
    version_name=model_version,
    model=xgbmodel,
)

In [None]:
# Same name/version labels as used when logging the model.
# pd.datetime was removed in pandas 2.0; pd.Timestamp.now() gives the same stamp.
model_name = "TEST_DELETE_LATER"
model_version = f"V1_{pd.Timestamp.now().strftime('%Y_%m_%d')}"

# List every version registered under this model name.
native_registry.get_model(model_name).show_versions()

In [None]:
# Reload the test table and look up version V0 of the DEMO_CREDIT_XGB model.
snowdf_test = session.table('CREDIT_DEFAULT_TEST')
model_name = "DEMO_CREDIT_XGB"
model_version = 'V0'

model_ver = native_registry.get_model(model_name).version(model_version)
# List the functions this model version exposes.
model_ver.show_functions()

In [None]:
# Score the held-out test table with the default version of the registered model.
snowdf_test = session.table('CREDIT_DEFAULT_TEST')
model_name = "TEST_DELETE_LATER"

# `.default` selects the model's default version, so no explicit version label
# is needed here (the previous unused `model_version = 'V0'` was misleading).
model_ver = native_registry.get_model(model_name).default
# Run inference on the test table loaded above; the original passed
# snowdf_train here, leaving the freshly loaded snowdf_test unused.
result_sdf2 = model_ver.run(snowdf_test, function_name="predict")

In [None]:
# Persist the scored rows to the RESULTS table, replacing it if it exists.
result_sdf2.write.mode("overwrite").save_as_table("RESULTS")

# Want to Deploy and Schedule Your Code?

In [None]:
from snowflake.core import Root
from snowflake.core.task import StoredProcedureCall, Task
from snowflake.core.task.dagv1 import DAGOperation, DAG, DAGTask
from datetime import date, timedelta

In [None]:
# Wrap the session in a snowflake.core Root; later cells reach databases,
# schemas and tasks through it.
api_root = Root(session)

In [None]:
def func_step1(session:Session):
    """Build the labelled modeling frame from the raw DEMO_DB.PUBLIC tables.

    Switches the session to DEMO_DB.PUBLIC, derives a per-ID binary TARGET
    from CREDIT_RECORD (1 when the ID has at least one STATUS in
    {'2','3','4','5'}, else 0), inner-joins it to APPLICATION_RECORD on ID,
    de-duplicates on ID and keeps only the modeling columns.

    Args:
        session: Active Snowpark session.

    Returns:
        Snowpark DataFrame with the numeric columns (including TARGET)
        followed by the categorical columns.
    """
    session.use_database('DEMO_DB')
    session.use_schema('PUBLIC')

    applications = session.table('APPLICATION_RECORD')
    credit_history = session.table('CREDIT_RECORD')

    # TARGET is 1 for high-risk and 0 for low-risk; CNT_LATE is a helper
    # column that is dropped once TARGET has been derived.
    late_flag = F.iff(F.col('STATUS').in_(['2', '3', '4', '5']), 1, 0)
    labels = (
        credit_history
        .group_by('ID')
        .agg(F.sum(late_flag).as_("CNT_LATE"))
        .with_column('TARGET', F.when(F.col('CNT_LATE') > 0, 1).otherwise(0))
        .drop("CNT_LATE")
    )

    # Attach the labels to the applications, then de-duplicate on ID.
    labelled = applications.join(labels, using_columns='ID', join_type='inner')
    labelled = labelled.drop_duplicates('ID')

    # Keep only the columns used for modeling.
    cols_numerical = ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'CNT_FAM_MEMBERS', 'TARGET']
    cols_categorical = ['CODE_GENDER', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
    return labelled[cols_numerical + cols_categorical]

In [None]:
def func_data_preprocess_pipeline(session:Session) -> str:
    """Prepare the credit data and persist it as PIPE_CREDIT_DEFAULT.

    Builds the labelled frame with func_step1, one-hot encodes the
    categorical columns, normalises the resulting column names and overwrites
    the PIPE_CREDIT_DEFAULT table with the result.

    Args:
        session: Active Snowpark session (supplied by Snowflake when this
            runs as a stored procedure).

    Returns:
        A fixed success message.
    """
    import re
    import snowflake.snowpark.functions as F
    from snowflake.ml.modeling.preprocessing import OneHotEncoder

    # Defined locally so the procedure does not depend on a notebook-level
    # global that may be missing or different when the task runs.
    cols_categorical = ['CODE_GENDER', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']

    # Pass the live session instance; the original passed the Session class
    # itself, which fails on the first session.use_database(...) call.
    joined_sdf = func_step1(session)

    # Perform one-hot encoding for the categorical columns.
    my_ohe_encoder = OneHotEncoder(input_cols=cols_categorical, output_cols=cols_categorical, drop_input_cols=True)
    prepared_sdf = my_ohe_encoder.fit(joined_sdf).transform(joined_sdf)

    # Strip every character that is not a letter, digit or underscore from the
    # column names and upper-case them.
    for old_col in prepared_sdf.columns:
        new_col = re.sub(r'[^a-zA-Z0-9_]', '', old_col).upper()
        # F.col comes from the local import instead of the notebook-global `col`.
        prepared_sdf = prepared_sdf.rename(F.col(old_col), new_col)

    # Save the data as a table in Snowflake.
    prepared_sdf.write.mode("overwrite").save_as_table("PIPE_CREDIT_DEFAULT")

    return "Preprocessing pipeline ran successfully"

In [None]:
def func_inference_pipeline(session:Session) -> str:
    """Score PIPE_CREDIT_DEFAULT with the registered model and save the result.

    Loads the default version of DEMO_CREDIT_XGB from the model registry in
    DEMO_DB.CREDIT_APPROVAL, runs its "predict" function on the
    PIPE_CREDIT_DEFAULT table and overwrites SCORED_CREDIT_DEFAULT with the
    output.

    Args:
        session: Active Snowpark session (supplied by Snowflake when this
            runs as a stored procedure).

    Returns:
        A fixed success message.
    """
    from snowflake.ml.registry import registry as ml_registry

    # Registry location is pinned locally so the procedure does not depend on
    # the notebook globals `db` / `schema`, whose values can change between
    # cells (and need not be plain strings by the time the task is created).
    registry_db = 'DEMO_DB'
    registry_schema = 'CREDIT_APPROVAL'
    model_registry = ml_registry.Registry(
        session=session, database_name=registry_db, schema_name=registry_schema
    )

    # Get data
    snowdf_test = session.table('PIPE_CREDIT_DEFAULT')
    model_name = "DEMO_CREDIT_XGB"
    model_ver = model_registry.get_model(model_name).default
    result_sdf = model_ver.run(snowdf_test, function_name="predict")
    result_sdf.write.mode("overwrite").save_as_table("SCORED_CREDIT_DEFAULT")
    return "Inference pipeline ran successfully"

#### Setup Tasks

In [None]:
# Task collection of DEMO_DB.PUBLIC. The schema resource gets its own name:
# `schema` already holds the registry schema name (a string) used by the
# registry cells, and rebinding it here would silently change their meaning.
public_schema = api_root.databases['DEMO_DB'].schemas['PUBLIC']
tasks = public_schema.tasks

In [None]:
# Task 1: run the preprocessing stored procedure once a day.
task1_entity = Task(
    "preprocess_pipeline",
    definition=StoredProcedureCall(
        func_data_preprocess_pipeline,
        stage_location="@ML_MODELS",
        # The procedure only uses the standard-library `re` module, so the
        # third-party "regex" package is not requested.
        packages=["snowflake-snowpark-python", "snowflake-ml-python"],
    ),
    warehouse='SSK_RESEARCH',
    schedule=timedelta(days=1))

# Create the task, replacing an existing one of the same name.
task1 = tasks.create(task1_entity, mode="orReplace")

In [None]:
# Task 2: run the inference stored procedure after the preprocessing task.
task2_entity = Task(
    "inference_pipeline",
    definition=StoredProcedureCall(
        func_inference_pipeline,
        stage_location="@ML_MODELS",
        # func_inference_pipeline imports snowflake.ml.registry, so the ML
        # package must be available to the procedure as well.
        packages=["snowflake-snowpark-python", "snowflake-ml-python"],
    ),
    warehouse='SSK_RESEARCH')

# No schedule of its own: it is triggered by its predecessor task.
task2_entity.predecessors = ["DEMO_DB.PUBLIC.PREPROCESS_PIPELINE"]
# Create the task, replacing an existing one of the same name.
task2 = tasks.create(task2_entity, mode="orReplace")

In [None]:
# Resume the dependent task (task2) first and the scheduled task (task1) last.
# NOTE(review): this order looks deliberate (dependents enabled before the
# scheduled root starts firing) - confirm against the Snowflake task docs.
task2.resume()
task1.resume()