## Imports

In [None]:
import configparser

In [None]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType, FloatType, StringType,StructType, StructField, DecimalType

import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.registry import registry

import json

import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [None]:
# !pip install snowflake-ml-python --index-url https://repo.anaconda.com/pkgs/snowflake

In [None]:
# Directory the notebook was started from; only needed by the relative-path
# alternative kept below.
my_dir = os.getcwd()
# connection_parameters = json.load(open(f'{my_dir}/creds.json'))
# Read the connection settings through a context manager so the file handle
# is closed as soon as the JSON has been parsed.
with open('/Users/skhara/Documents/Code/creds.json') as creds_file:
    connection_parameters = json.load(creds_file)
# Open the Snowpark session used by every later cell.
session = Session.builder.configs(connection_parameters).create()

# <font color='red'>Snowpark with Big Data</font>

In [None]:
# Make ML_WORKLOADS the active warehouse for the queries in this section.
session.use_warehouse('ML_WORKLOADS')

In [None]:
# Reference the trips table as a Snowpark DataFrame.
snow_df = session.table('CITIBIKEML_JACK.DEMO.TRIPS')

In [None]:
%%time
# Preview: bring only the first five rows into pandas.
snow_df.limit(5).to_pandas()

In [None]:
%%time
print('Size of the Snowpark DF: ', snow_df.count())

In [None]:
# Ask Snowflake, via LAST_QUERY_ID(), for the ID of the query this session
# executed most recently.
query_id_rows = session.sql("SELECT LAST_QUERY_ID()").collect()
# The result is a single row with a single column.
last_query_id = query_id_rows[0][0]
print(f"The last query ID is: {last_query_id}")

In [None]:
%%time
pandas_df = snow_df.with_column("date", F.to_date("STARTTIME")).group_by("date").count().sort("date").to_pandas()

In [None]:
# Display the per-date trip counts.
pandas_df

In [None]:
# Draw the per-date trip counts as a line chart on a 15x8 figure.
plt.figure(figsize=(15, 8))
ax = sns.lineplot(data=pandas_df, x='DATE', y='COUNT')

#  
#  
# 
 
# <font color='red'>Snowpark for ML</font>

## 1.0 Prepare Data 

In [None]:
# Set the SSK_RESEARCH warehouse size to LARGE, then make it the session's
# active warehouse for the data-preparation steps.
session.sql('ALTER WAREHOUSE SSK_RESEARCH SET WAREHOUSE_SIZE = "LARGE"').collect()
session.use_warehouse('SSK_RESEARCH')

In [None]:
# Unqualified table names below resolve against DEMO_DB.PUBLIC.
session.use_database('DEMO_DB')
session.use_schema('PUBLIC')

In [None]:
# Reference the two source tables as Snowpark DataFrames.
application_record_sdf = session.table('APPLICATION_RECORD')
credit_record_sdf = session.table('CREDIT_RECORD')

# Report how many rows each table holds.
application_rows = application_record_sdf.count()
credit_rows = credit_record_sdf.count()
print('Application table size\t: ', application_rows,
      '\nCredit table size\t: ', credit_rows)

In [None]:
# Preview the first five credit-record rows in pandas.
credit_record_sdf.limit(5).to_pandas()

In [None]:
# Collapse the credit history to one row per ID with a binary TARGET:
# 1 (high-risk) when any record has STATUS '2', '3', '4' or '5', else 0 (low-risk).
late_flag = F.iff(F.col('STATUS').in_(['2', '3','4','5']), 1, 0)
credit_record_sdf = (
    credit_record_sdf.group_by('ID')
    .agg(F.sum(late_flag).as_("CNT_LATE"))
    .with_column('TARGET', F.when(F.col('CNT_LATE') > 0, 1).otherwise(0))
    # CNT_LATE is only an intermediate for deriving TARGET.
    .drop("CNT_LATE")
)

In [None]:
# Join Credit Record data with Application Record Data.
# Inner join on ID: only IDs present in both tables are kept.
joined_sdf = application_record_sdf.join(credit_record_sdf, using_columns='ID', join_type='inner')

In [None]:
# Duplicate removal: drop_duplicates on ID leaves a single row per ID.
joined_sdf = joined_sdf.drop_duplicates('ID')

In [None]:
# Row count after the join and duplicate removal.
joined_sdf.count()

In [None]:
# Restrict the data to the columns used for modeling:
# numeric features, categorical features and the label.
cols_numerical = [
    'AMT_INCOME_TOTAL',
    'DAYS_EMPLOYED',
    'FLAG_MOBIL',
    'CNT_FAM_MEMBERS',
]
cols_categorical = [
    'CODE_GENDER',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
]
target = ['TARGET']
selected_columns = [*cols_numerical, *cols_categorical, *target]
joined_sdf = joined_sdf[selected_columns]

In [None]:
# Preview the first ten rows of the modeling columns in pandas.
joined_sdf.limit(10).to_pandas()

In [None]:
# Build the preprocessing pipeline.
# It contains only the one-hot encoder: the pipeline is used with
# fit(...).transform(...) to produce encoded features, and the classifier is
# defined and trained separately in the ML Modeling section. A classifier step
# here would reference XGBClassifier, feature_cols and target_col before any
# of them exist.
preprocessing_pipeline = Pipeline(
    steps=[
        (
            "OHE",
            snowml.OneHotEncoder(
                input_cols=cols_categorical,
                output_cols=cols_categorical,
                drop_input_cols=True,
            ),
        ),
    ]
)

# Fit the encoder on the joined data and apply it to the same data.
transformed_df = preprocessing_pipeline.fit(joined_sdf).transform(joined_sdf)
transformed_df.limit(5).to_pandas()

In [None]:
# Inspect the column names produced by the encoder.
transformed_df.columns

In [None]:
# Normalise column names so they are easy to reference later:
# strip every character that is not a letter, digit or underscore, then upper-case.
import re

non_identifier_chars = re.compile(r'[^a-zA-Z0-9_]')
cols = transformed_df.columns
for old_col in cols:
    new_col = non_identifier_chars.sub('', old_col).upper()
    transformed_df = transformed_df.rename(col(old_col), new_col)

In [None]:
# Split 80/20 with a fixed seed, then persist each part as a Snowflake table
# (replacing any existing table of the same name).
snowdf_train, snowdf_test = transformed_df.random_split([0.8, 0.2], seed=82)
for split_sdf, table_name in (
    (snowdf_train, "CREDIT_DEFAULT_TRAIN"),
    (snowdf_test, "CREDIT_DEFAULT_TEST"),
):
    split_sdf.write.mode("overwrite").save_as_table(table_name)

## 2.0 ML Modeling

In [None]:
# Set the SSK_RESEARCH warehouse size to LARGE and select it for the modeling steps.
session.sql('ALTER WAREHOUSE SSK_RESEARCH SET WAREHOUSE_SIZE = "LARGE"').collect()
session.use_warehouse('SSK_RESEARCH')

In [None]:
# Prepare data for modeling: every column of the training table except the
# label is treated as a feature.
target_col = 'TARGET'
snowdf_train = session.table('DEMO_DB.PUBLIC.CREDIT_DEFAULT_TRAIN')
feature_cols = list(snowdf_train.columns)
feature_cols.remove(target_col)

In [None]:
# Configure an XGBoost classifier over the feature columns and train it on
# the training table; predictions are written to a PREDICTION column.
from snowflake.ml.modeling.xgboost import XGBClassifier

xgbmodel = XGBClassifier(
    random_state=123,
    input_cols=feature_cols,
    label_cols=target_col,
    output_cols='PREDICTION',
)
xgbmodel.fit(snowdf_train)

In [None]:
# Apply the fitted classifier to the held-out test table and report how many
# rows that table contains.
snowdf_test = session.table('CREDIT_DEFAULT_TEST')
scored_sdf = xgbmodel.predict(snowdf_test)
test_row_count = snowdf_test.count()
print(test_row_count)

In [None]:
# Compare the PREDICTION column against TARGET on the scored test data.
from snowflake.ml.modeling.metrics import f1_score, accuracy_score

# Both metrics take the same DataFrame and column names.
metric_args = {
    "df": scored_sdf,
    "y_true_col_names": 'TARGET',
    "y_pred_col_names": 'PREDICTION',
}
F1 = f1_score(**metric_args)
ACCURACY = accuracy_score(**metric_args)
print(f'F1 Score: {F1} \nAccuracy Score: {ACCURACY}')

### Now, let's use Snowpark ML's Distributed GridSearchCV() function to find optimal model parameters

In [None]:
from snowflake.ml.modeling.xgboost import XGBClassifier

# Search over tree count and depth for the classifier.
# Candidates are ranked by F1, a classification metric that the notebook also
# reports for the baseline model; a percentage-error regression metric is not
# meaningful for a 0/1 label.
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid={
        "n_estimators": [100, 200],
        "learning_rate": [0.3],
        "max_depth": [1, 3],
    },
    n_jobs=1,
    scoring="f1",
    input_cols=feature_cols,
    label_cols=target_col,
    output_cols='PREDICTION',
)

# Train
grid_search.fit(snowdf_train)

In [None]:
# Location of the model registry.
db, schema = 'DEMO_DB', 'CREDIT_APPROVAL'

# Open the registry at that location and list the models it already holds.
native_registry = registry.Registry(
    session=session,
    database_name=db,
    schema_name=schema,
)
native_registry.show_models()

In [None]:
# Define model name and a date-stamped version name.
# pd.Timestamp.now() is used because the pd.datetime alias no longer exists
# in current pandas releases.
model_name = "TEST_DELETE_LATER"
model_version = f"V1_{pd.Timestamp.now().strftime('%Y_%m_%d')}"

# Log the fitted xgbmodel to the registry under that name and version.
model_ver = native_registry.log_model(
    model_name=model_name,
    version_name=model_version,
    model=xgbmodel,
)

In [None]:
model_name = "TEST_DELETE_LATER"
# pd.Timestamp.now() replaces the pd.datetime alias, which no longer exists
# in current pandas releases.
model_version = f"V1_{pd.Timestamp.now().strftime('%Y_%m_%d')}"

# List every version registered under this model name.
native_registry.get_model(model_name).show_versions()

In [None]:
snowdf_test = session.table('CREDIT_DEFAULT_TEST')
model_name = "DEMO_CREDIT_XGB"
model_version = 'V0'

# Look up the named version of the registered model and list the functions
# it exposes.
registered_model = native_registry.get_model(model_name)
model_ver = registered_model.version(model_version)
model_ver.show_functions()

In [None]:
snowdf_test = session.table('CREDIT_DEFAULT_TEST')
model_name = "TEST_DELETE_LATER"
# Kept for reference only: the lookup below uses the model's default version.
model_version = 'V0'

model_ver = native_registry.get_model(model_name).default
# Score the held-out test table loaded above rather than the training data
# the model was fitted on.
result_sdf2 = model_ver.run(snowdf_test, function_name="predict")

In [None]:
# Persist the scored rows as the RESULTS table, replacing any existing table.
result_sdf2.write.mode("overwrite").save_as_table("RESULTS")

# Want to Deploy and Schedule Your Code?

In [None]:
from snowflake.core import Root
from snowflake.core.task import StoredProcedureCall, Task
from snowflake.core.task.dagv1 import DAGOperation, DAG, DAGTask
from datetime import date, timedelta

In [None]:
# Entry point of the snowflake.core API, built on the existing session.
api_root = Root(session)

In [None]:
def func_step1(session: Session):
    """Build the labelled applicant dataset used by the preprocessing pipeline.

    Reads APPLICATION_RECORD and CREDIT_RECORD from DEMO_DB.PUBLIC, derives a
    binary TARGET per ID from the credit history, joins it to the application
    data, keeps one row per ID and returns only the modeling columns.

    Args:
        session: Snowpark session used to read the tables. Its current
            database and schema are switched to DEMO_DB.PUBLIC.

    Returns:
        A Snowpark DataFrame with the numeric columns (including TARGET)
        followed by the categorical columns.
    """
    session.use_database('DEMO_DB')
    session.use_schema('PUBLIC')

    applications = session.table('APPLICATION_RECORD')
    credit_history = session.table('CREDIT_RECORD')

    # TARGET is 1 (high-risk) when any record for the ID has STATUS
    # '2', '3', '4' or '5', otherwise 0 (low-risk).
    late_flag = F.iff(F.col('STATUS').in_(['2', '3','4','5']), 1, 0)
    risk_labels = (
        credit_history.group_by('ID')
        .agg(F.sum(late_flag).as_("CNT_LATE"))
        .with_column('TARGET', F.when(F.col('CNT_LATE') > 0, 1).otherwise(0))
        .drop("CNT_LATE")
    )

    # Attach the label to the application data, then keep a single row per ID.
    labelled = applications.join(risk_labels, using_columns='ID', join_type='inner')
    deduplicated = labelled.drop_duplicates('ID')

    # Columns kept for modeling.
    cols_numerical = ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'CNT_FAM_MEMBERS', 'TARGET']
    cols_categorical = ['CODE_GENDER', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
    return deduplicated[cols_numerical + cols_categorical]

In [None]:
def func_data_preprocess_pipeline(session: Session) -> str:
    """Prepare the model-ready table PIPE_CREDIT_DEFAULT.

    Gets the labelled applicant data from ``func_step1``, one-hot encodes the
    categorical columns, normalises the resulting column names and overwrites
    the PIPE_CREDIT_DEFAULT table with the result.

    Args:
        session: Snowpark session used for all reads and writes.

    Returns:
        A fixed success message.
    """
    import re

    from snowflake.ml.modeling.preprocessing import OneHotEncoder
    from snowflake.snowpark.functions import col

    # Hand over the session *instance* this procedure received; passing the
    # Session class itself would fail as soon as func_step1 calls a session method.
    joined_sdf = func_step1(session)

    # Perform One-Hot-Encoding for categorical columns.
    # NOTE(review): cols_categorical is read from the enclosing notebook scope;
    # it is expected to match the categorical columns selected in func_step1.
    my_ohe_encoder = OneHotEncoder(
        input_cols=cols_categorical,
        output_cols=cols_categorical,
        drop_input_cols=True,
    )
    prepared_sdf = my_ohe_encoder.fit(joined_sdf).transform(joined_sdf)

    # Strip everything except letters, digits and underscores from each column
    # name and upper-case it, so columns are easy to reference later.
    for old_col in prepared_sdf.columns:
        new_col = re.sub(r'[^a-zA-Z0-9_]', '', old_col).upper()
        prepared_sdf = prepared_sdf.rename(col(old_col), new_col)

    # Save the data as table in Snowflake
    prepared_sdf.write.mode("overwrite").save_as_table("PIPE_CREDIT_DEFAULT")

    return "Preprocessing pipeline ran successfully"

In [None]:
def func_inference_pipeline(session: Session) -> str:
    """Score PIPE_CREDIT_DEFAULT with the registered model and store the output.

    Loads the default version of the DEMO_CREDIT_XGB model from the registry,
    runs its ``predict`` function over the PIPE_CREDIT_DEFAULT table and
    overwrites the SCORED_CREDIT_DEFAULT table with the result.

    Args:
        session: Snowpark session used for registry access, reads and writes.

    Returns:
        A fixed success message.
    """
    from snowflake.ml.registry import Registry

    # The registry location is spelled out here so the procedure does not
    # depend on notebook-level variables, which other cells can rebind to
    # unrelated objects before the procedure is registered.
    registry_database = 'DEMO_DB'
    registry_schema = 'CREDIT_APPROVAL'
    model_registry = Registry(
        session=session,
        database_name=registry_database,
        schema_name=registry_schema,
    )

    # Get data
    snowdf_test = session.table('PIPE_CREDIT_DEFAULT')
    model_name = "DEMO_CREDIT_XGB"
    model_ver = model_registry.get_model(model_name).default
    result_sdf = model_ver.run(snowdf_test, function_name="predict")
    result_sdf.write.mode("overwrite").save_as_table("SCORED_CREDIT_DEFAULT")
    return "Inference pipeline ran successfully"

#### Setup Tasks

In [None]:
# Handle to the DEMO_DB.PUBLIC schema and its task collection.
# Bound to its own name so the `schema` string that names the model-registry
# schema is not overwritten by a schema resource object.
task_schema = api_root.databases['DEMO_DB'].schemas['PUBLIC']
tasks = task_schema.tasks

In [None]:
# Task 1: run the preprocessing procedure once a day on SSK_RESEARCH.
preprocess_call = StoredProcedureCall(
    func_data_preprocess_pipeline,
    stage_location="@ML_MODELS",
    packages=["snowflake-snowpark-python","snowflake-ml-python", "regex"],
)
task1_entity = Task(
    "preprocess_pipeline",
    definition=preprocess_call,
    warehouse='SSK_RESEARCH',
    schedule=timedelta(days=1),
)

# Create the task, replacing any existing task of the same name.
task1 = tasks.create(task1_entity, mode="orReplace")

In [None]:
# Task 2: run the inference procedure after the preprocessing task.
# snowflake-ml-python is listed because func_inference_pipeline imports
# snowflake.ml.registry inside the stored procedure.
task2_entity = Task(
    "inference_pipeline",
    definition=StoredProcedureCall(
        func_inference_pipeline,
        stage_location="@ML_MODELS",
        packages=["snowflake-snowpark-python", "snowflake-ml-python"],
    ),
    warehouse='SSK_RESEARCH',
)

# No schedule of its own: the task is chained to the preprocessing task.
task2_entity.predecessors = ["DEMO_DB.PUBLIC.PREPROCESS_PIPELINE"]
task2 = tasks.create(task2_entity, mode="orReplace")

In [None]:
# Resume the dependent inference task first, then the scheduled preprocessing
# task it names as its predecessor.
# NOTE(review): presumably the child has to be active before the root task
# starts firing on its schedule — confirm against the Snowflake task docs.
task2.resume()
task1.resume()