### Load Snowpark libraries

In [1]:
# Import required libraries
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import avg, sum, col,lit
from snowflake.snowpark.functions import udf, sproc, col
from snowflake.snowpark.types import IntegerType, FloatType, LongType, DoubleType, DecimalType,StringType, BooleanType, Variant
from snowflake.snowpark.types import PandasSeries, PandasDataFrame
from snowflake.snowpark import functions as fn

import sys ,json
import io
import logging
import pandas as pd

import joblib
import pandas as pd
import numpy as np
import json

from snowflake.snowpark import version
print (f"snowflake snowpark version is: {version.VERSION}")

snowflake snowpark version is: (0, 10, 0)


### Connect to Snowflake and establish session

In [2]:
snowflake_connection_cfg = open('cred.json')
snowflake_connection_cfg = snowflake_connection_cfg.read()
snowflake_connection_cfg = json.loads(snowflake_connection_cfg)

'''
APP_WH XS
LAB_WH S
HMWH M optimized warehouse
DCR_MA_WH L
BANK1_WH XL
'''

# Creating Snowpark Session
hottopic_session = Session.builder.configs(snowflake_connection_cfg).create()
print('Current Database:', hottopic_session.get_current_database())
print('Current Schema:', hottopic_session.get_current_schema())
print('Current Warehouse:', hottopic_session.get_current_warehouse())
print("Warehouse set up:")
hottopic_session.sql("show warehouses like 'APP_WH'").collect()

Current Database: "BANK1_CRM_DB"
Current Schema: "PUBLIC"
Current Warehouse: "APP_WH"
Warehouse set up:


[Row(name='APP_WH', state='STARTED', type='STANDARD', size='X-Small', min_cluster_count=1, max_cluster_count=1, started_clusters=1, running=0, queued=0, is_default='N', is_current='Y', auto_suspend=600, auto_resume='true', available=' 100', provisioning='0', quiescing='0', other='0', created_on=datetime.datetime(2022, 2, 27, 4, 51, 57, 85000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), resumed_on=datetime.datetime(2022, 12, 8, 19, 27, 10, 202000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), updated_on=datetime.datetime(2022, 12, 8, 19, 27, 10, 202000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), owner='SYSADMIN', comment='', enable_query_acceleration='false', query_acceleration_max_scale_factor=8, resource_monitor='null', actives=1, pendings=0, failed=0, suspended=0, uuid='1463550724', scaling_policy='STANDARD')]

### Create stage location for models

In [3]:
hottopic_session.sql("CREATE OR REPLACE STAGE hottopic_stage_models").collect()

[Row(status='Stage area HOTTOPIC_STAGE_MODELS successfully created.')]

In [4]:
hottopic_session.sql("CREATE OR REPLACE STAGE hottopic_stage_data").collect()

[Row(status='Stage area HOTTOPIC_STAGE_DATA successfully created.')]

In [5]:
hottopic_session.clear_packages()
hottopic_session.add_packages("snowflake-snowpark-python")
hottopic_session.add_packages("scikit-learn","pandas","numpy","joblib","cachetools")
hottopic_session.clear_imports()
# hottopic_session.add_import(ge_import_path)

### Define function to save trained model

In [6]:
def save_file(session, model, path, dest_filename):
    # logger.debug('#save_file: -- START--')
    input_stream = io.BytesIO()
    joblib.dump(model, input_stream)
    session._conn.upload_stream(input_stream, path, dest_filename)
    return "successfully created file: " + path

### Define Features required to train model

In [7]:
features=["RECENCY_DAY",
"FREQUENCY",
"MONETORY",
"RMF_SCORE",
"DOTCOM",
"REWARDS_ACCOUNT",
"FREQ_1",
"FREQ_2",
"FREQ_3",
"FREQ_4",
"FREQ_5",
"FREQ_6",
"FREQ_7",
"FREQ_8",
"FREQ_9",
"FREQ_10",
"FREQ_11",
"FREQ_12",
"CNT_PER_PDT",
"CNT_PER_PDT_SFC",
"CNT_PER_PDT_VFC",
"NO_DISCOUNT",
"DISCOUNT_PROMOTION",
"SUM_DIS_PRO"]

### Define Model pipeline for Imputer, Standard Scaler and Random Classifier Model

In [8]:
def build_rf_model(p_df: pd.DataFrame,ne,nj,cw, md):
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    numeric_features = p_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = p_df.select_dtypes(include=['object']).columns.tolist()

    feature_names = numeric_features + categorical_features

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler(with_mean=True,with_std=True))])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier'
                    ,RandomForestClassifier(n_estimators=ne, n_jobs=-nj, class_weight=cw,max_depth=md)
                )
            ])

    return model


In [9]:
def build_dtree_model(p_df: pd.DataFrame,cw, md):
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.tree import DecisionTreeClassifier
    numeric_features = p_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = p_df.select_dtypes(include=['object']).columns.tolist()

    feature_names = numeric_features + categorical_features

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler(with_mean=True,with_std=True))])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier'
                    ,DecisionTreeClassifier(class_weight=cw,max_depth=md)
                )
            ])

    return model


### Define Classification report to register model output

In [10]:
def get_classification_report(y_test, y_pred):
    from sklearn import metrics
    report = metrics.classification_report(y_test, y_pred, output_dict=True,target_names=['0','1'])
    df_classification_report = pd.DataFrame(report).transpose()    
    return df_classification_report

### Define Model parameteres to register model output

In [11]:
def get_model_info(model_name, test_size, random_state,ne,nj,cw,max_depth):
    data = [[model_name,test_size,random_state,ne,nj,cw,max_depth]]  
    df_model_info = pd.DataFrame(data,columns=['model','test_size','random_state','ne','nj','cw','max_depth'])
    return df_model_info

### Define Train random forest classifier model

In [12]:
def hottopic_train_rf_model(session: Session, training_table: str, model_name: str,features:list, Y: str,test_size:float,random_state:int,ne:int,nj:int,cw:str, md:int) -> str:
    from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    training_data = session.table(training_table)

    Data_train, Data_test = training_data.random_split([1-test_size, test_size], seed=random_state)
    pd_Data_train=Data_train.to_pandas()
    pd_Data_test=Data_test.to_pandas()
    from sklearn.ensemble import RandomForestClassifier 
    # Model building
    rf = build_rf_model(pd_Data_train[features],ne,nj,cw, md)
    rf.fit(pd_Data_train[features], pd_Data_train[Y])

    model_dir = '@hottopic_stage_models'
    model_fl = model_name+'.joblib'
    save_file(session, rf, model_dir ,model_fl)

    score = rf.score(pd_Data_test[features], pd_Data_test[Y])
    
    y_pred = rf.predict(pd_Data_test)
    df_classification_report = get_classification_report(y_pred,pd_Data_test[Y]).reset_index().rename(columns={"index": "class"}).reset_index(drop=True)
    df_model_info = get_model_info(model_fl,test_size,random_state,ne,nj,cw,md)
    df_model_info=df_model_info.append([df_model_info]*5,ignore_index=True)
    session.create_dataframe(df_classification_report.join(df_model_info)).write.mode("append").save_as_table("hottopic_model_output")
    
    return df_classification_report.join(df_model_info)

### Define train decision Tree classifier model

In [13]:
def hottopic_train_dtree_model(session: Session, training_table: str, model_name: str,features:list, Y: str,test_size:float,random_state:int,cw:str, md:int) -> str:
    from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    training_data = session.table(training_table)

    Data_train, Data_test = training_data.random_split([1-test_size, test_size], seed=random_state)
    pd_Data_train=Data_train.to_pandas()
    pd_Data_test=Data_test.to_pandas()
    from sklearn.ensemble import RandomForestClassifier 
    # Model building
    dtree = build_dtree_model(pd_Data_train[features],cw, md)
    dtree.fit(pd_Data_train[features], pd_Data_train[Y])

    model_dir = '@hottopic_stage_models'
    model_fl = model_name+'.joblib'
    save_file(session, dtree, model_dir ,model_fl)

    score = dtree.score(pd_Data_test[features], pd_Data_test[Y])
    
    y_pred = dtree.predict(pd_Data_test)
    df_classification_report = get_classification_report(y_pred,pd_Data_test[Y]).reset_index().rename(columns={"index": "class"}).reset_index(drop=True)
    df_model_info = get_model_info(model_fl,test_size,random_state,None,None,cw,md)
    df_model_info=df_model_info.append([df_model_info]*5,ignore_index=True)
    session.create_dataframe(df_classification_report.join(df_model_info)).write.mode("append").save_as_table("hottopic_model_output")
    
    return df_classification_report.join(df_model_info)

### Create Model output tables to save model ouptut

In [14]:
hottopic_session.sql("create or replace table hottopic_model_output (class varchar, precision double, recall double, f1score double, support double, model varchar,test_size float, random_state int, ne int, nj int, cw varchar, max_depth int)").collect()

[Row(status='Table HOTTOPIC_MODEL_OUTPUT successfully created.')]

### Define stored proc to register random forest classifier model

In [15]:
# Registering the function as a Stored Procedure
hottopic_rf_sproc = hottopic_session.sproc.register(func=hottopic_train_rf_model, # training function defined above
                                            name='hottopic_train_rf_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@hottopic_stage_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib']) # import model libaries


### Define Stored Proc to register decision tree classifier model

In [16]:
# Registering the function as a Stored Procedure
hottopic_dtree_sproc = hottopic_session.sproc.register(func=hottopic_train_dtree_model, # training function defined above
                                            name='hottopic_train_dtree_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@hottopic_stage_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib']) # import model libaries


### Check how much data we have in the train table and we are thinking it is roughly 15 Million records

In [19]:
hottopic_session.sql("select to_varchar(count(*), '999,999,999,999') as train_set_count from hot_topic_train_table_15M").show()

---------------------
|"TRAIN_SET_COUNT"  |
---------------------
|      15,002,000   |
---------------------



### Train  Random Forest classifier and Decision Tree Classifier in Snowflake through registered Stored Procs and capture model output in a snowflake table

### All the above steps are just definition and Registration of component
### The below training runs completely on Snowflake and you can go check in history tab
### I have changed the warehouse HMWH to suit the training needs

In [17]:
hottopic_session.sql("use warehouse HMWH").collect()
print('Current Warehouse:', hottopic_session.get_current_warehouse())
print("Warehouse set up:")
hottopic_session.sql("show warehouses like 'HMWH'").collect()

Current Warehouse: "HMWH"
Warehouse set up:


[Row(name='HMWH', state='SUSPENDED', type='SNOWPARK-OPTIMIZED', size='Medium', min_cluster_count=1, max_cluster_count=1, started_clusters=0, running=0, queued=0, is_default='N', is_current='Y', auto_suspend=300, auto_resume='true', available='', provisioning='', quiescing='', other='', created_on=datetime.datetime(2022, 11, 1, 18, 18, 59, 3000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), resumed_on=datetime.datetime(2022, 12, 7, 21, 49, 31, 111000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), updated_on=datetime.datetime(2022, 12, 7, 21, 49, 31, 111000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), owner='SYSADMIN', comment='warehouse used by role dev_blogger', enable_query_acceleration='false', query_acceleration_max_scale_factor=8, resource_monitor='null', actives=0, pendings=0, failed=0, suspended=1, uuid='1463550932', scaling_policy='STANDARD')]

### Now the training begins with snowpark optimized warehouse - Medium

### Remember we are building 24 trees here before final vote which is a giant training workload. It takes less than 4 minutes

In [20]:
table_name = 'hot_topic_train_table_15M'
test_size = 0.25
max_depth = 25
model_name = 'rf_hottopic_model'
random_state = 43,
n_estimator = 24
n_jobs = 1,
class_weight = 'balanced_subsample'
print ("random forest classifier report")
print (hottopic_rf_sproc(table_name
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43
                ,n_estimator
                ,1
                ,class_weight
                , max_depth))

random forest classifier report
          class  precision    recall  ...  nj                  cw max_depth
0             0   0.692308  0.749174  ...   1  balanced_subsample        25
1             1   0.704741  0.642607  ...   1  balanced_subsample        25
2      accuracy   0.697776  0.697776  ...   1  balanced_subsample        25
3     macro avg   0.698524  0.695890  ...   1  balanced_subsample        25
4  weighted avg   0.698304  0.697776  ...   1  balanced_subsample        25

[5 rows x 12 columns]


### Now let's try to train decision tree model with 15M records again.

### This training is comparatively less massive compared to previous but still trains on 15M rows table.

In [21]:
table_name = 'hot_topic_train_table_15M'
test_size = 0.25
max_depth = 25
model_name = 'dtree_hottopic_model'
random_state = 43,
class_weight = 'balanced'
print ("decision tree classifier report")
print (hottopic_rf_sproc(table_name
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43,
                4,
                1
                ,class_weight
                , max_depth))

decision tree classifier report
          class  precision    recall  f1-score  ...  ne nj        cw  max_depth
0             0   0.657652  0.735144  0.694242  ...   4  1  balanced         25
1             1   0.698302  0.615666  0.654385  ...   4  1  balanced         25
2      accuracy   0.675533  0.675533  0.675533  ...   4  1  balanced         25
3     macro avg   0.677977  0.675405  0.674314  ...   4  1  balanced         25
4  weighted avg   0.677933  0.675533  0.674357  ...   4  1  balanced         25

[5 rows x 12 columns]


### Training results stored in snowflake table

In [23]:
hottopic_session.sql("""select class,precision,recall,f1score,model from hottopic_model_output """).show()

-------------------------------------------------------------------------------------------------------------
|"CLASS"       |"PRECISION"         |"RECALL"            |"F1SCORE"           |"MODEL"                      |
-------------------------------------------------------------------------------------------------------------
|0             |0.6923082415675701  |0.7491739960458956  |0.7196194559974349  |rf_hottopic_model.joblib     |
|1             |0.7047406121883438  |0.64260682919493    |0.6722410512589072  |rf_hottopic_model.joblib     |
|accuracy      |0.6977758114831959  |0.6977758114831959  |0.6977758114831959  |rf_hottopic_model.joblib     |
|macro avg     |0.6985244268779569  |0.6958904126204128  |0.6959302536281711  |rf_hottopic_model.joblib     |
|weighted avg  |0.6983044719131892  |0.6977758114831959  |0.6967684779429774  |rf_hottopic_model.joblib     |
|0             |0.6576519990959162  |0.7351442464705651  |0.6942423701301556  |dtree_hottopic_model.joblib  |
|1        

### Check the Trained models as Queries in Snowflake and the duration it took

In [42]:
qry = '''select query_id,
       substr(query_text,59,21) model,
       warehouse_name,
       execution_status,
       TIMESTAMPDIFF(second,start_time,end_time) total_duration
from table(information_schema.QUERY_HISTORY_BY_WAREHOUSE('HMWH'))
WHERE query_text like '%CALL%' and query_text like '%hot_topic%'
and TIMESTAMPDIFF(second,start_time,end_time) > 100
order by start_time desc'''
hottopic_session.sql(qry).show()


---------------------------------------------------------------------------------------------------------------------------
|"QUERY_ID"                            |"MODEL"                |"WAREHOUSE_NAME"  |"EXECUTION_STATUS"  |"TOTAL_DURATION"  |
---------------------------------------------------------------------------------------------------------------------------
|01a8d846-0602-46ac-0057-3c03015c96d2  |'dtree_hottopic_model  |HMWH              |SUCCESS             |177               |
|01a8d841-0602-46ac-0057-3c03015c95ee  |'rf_hottopic_model',   |HMWH              |SUCCESS             |236               |
---------------------------------------------------------------------------------------------------------------------------



### Check if the classifier models are saved in stage location.
### Remember if the same model name was used for all the iterations, then only the last trained model will be saved for the model name.

In [43]:
hottopic_session.sql("list @hottopic_stage_models").collect()

[Row(name='hottopic_stage_models/dtree_hottopic_model.joblib', size=1117024, md5='99711b1d3bea8d4bc944a0638794cec6', last_modified='Fri, 9 Dec 2022 03:53:21 GMT'),
 Row(name='hottopic_stage_models/rf_hottopic_model.joblib', size=4827680, md5='71441ac3b1f7541dea7fdc46e09ff61b', last_modified='Fri, 9 Dec 2022 03:48:28 GMT')]

### Define UDF for Inference

### for Randomforest classifier

In [44]:
import sys
import cachetools
import os
from snowflake.snowpark.functions import udf
hottopic_session.add_import("@hottopic_stage_models/rf_hottopic_model.joblib")  

@cachetools.cached(cache={})
def read_file(filename):
       import_dir = sys._xoptions.get("snowflake_import_directory")
       if import_dir:
              with open(os.path.join(import_dir, filename), 'rb') as file:
                     m = joblib.load(file)
                     return m

@udf(name="predict_rf_hottopic_model", is_permanent=True, stage_location="@hottopic_stage_models", replace=True)
def predict(RECENCY_DAY: float, FREQUENCY: float, MONETORY: float, RMF_SCORE: float, DOTCOM: float, REWARDS_ACCOUNT: float,
            FREQ_1: float, FREQ_2: float, FREQ_3: float, FREQ_4: float, FREQ_5: float, FREQ_6: float, FREQ_7: float, 
            FREQ_8: float, FREQ_9: float, FREQ_10: float, FREQ_11: float, FREQ_12: float, CNT_PER_PDT: float, 
            CNT_PER_PDT_SFC: float, CNT_PER_PDT_VFC: float, NO_DISCOUNT: float, DISCOUNT_PROMOTION: float,
            SUM_DIS_PRO: float) -> float:
       m = read_file('rf_hottopic_model.joblib')       
       row = pd.DataFrame([locals()], columns=features)
       return m.predict(row)[0]

### for Decision Tree classifier

In [45]:

import sys
import cachetools
import os
from snowflake.snowpark.functions import udf
hottopic_session.add_import("@hottopic_stage_models/dtree_hottopic_model.joblib")  

@cachetools.cached(cache={})
def read_file(filename):
       import_dir = sys._xoptions.get("snowflake_import_directory")
       if import_dir:
              with open(os.path.join(import_dir, filename), 'rb') as file:
                     m = joblib.load(file)
                     return m

@udf(name="predict_dtree_hottopic_model", is_permanent=True, stage_location="@hottopic_stage_models", replace=True)
def predict(RECENCY_DAY: float, FREQUENCY: float, MONETORY: float, RMF_SCORE: float, DOTCOM: float, REWARDS_ACCOUNT: float,
            FREQ_1: float, FREQ_2: float, FREQ_3: float, FREQ_4: float, FREQ_5: float, FREQ_6: float, FREQ_7: float, 
            FREQ_8: float, FREQ_9: float, FREQ_10: float, FREQ_11: float, FREQ_12: float, CNT_PER_PDT: float, 
            CNT_PER_PDT_SFC: float, CNT_PER_PDT_VFC: float, NO_DISCOUNT: float, DISCOUNT_PROMOTION: float, 
            SUM_DIS_PRO: float) -> float:
       m = read_file('dtree_hottopic_model.joblib')       
       row = pd.DataFrame([locals()], columns=features)
       return m.predict(row)[0]

In [None]:
features = ['RECENCY_DAY',
'FREQUENCY',
'MONETORY',
'RMF_SCORE',
'DOTCOM',
'REWARDS_ACCOUNT',
'FREQ_1',
'FREQ_2',
'FREQ_3',
'FREQ_4',
'FREQ_5',
'FREQ_6',
'FREQ_7',
'FREQ_8',
'FREQ_9',
'FREQ_10',
'FREQ_11',
'FREQ_12',
'CNT_PER_PDT',
'CNT_PER_PDT_SFC',
'CNT_PER_PDT_VFC',
'NO_DISCOUNT',
'DISCOUNT_PROMOTION',
'SUM_DIS_PRO',
'LABEL']

### Check sample Data and for that let's try to change to a smaller standard warehouse

In [48]:
hottopic_session.sql("use warehouse app_wh").collect()

[Row(status='Statement executed successfully.')]

In [49]:
snowpark_hottopic_df = hottopic_session.table('hot_topic_test_table_1k')
snowpark_hottopic_df.show(1)

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ACCT_NMB"  |"RECENCY_DAY"  |"FREQUENCY"  |"MONETORY"  |"RMF_SCORE"  |"DOTCOM"  |"REWARDS_ACCOUNT"  |"FREQ_1"  |"FREQ_2"  |"FREQ_3"  |"FREQ_4"  |"FREQ_5"  |"FREQ_6"  |"FREQ_7"  |"FREQ_8"  |"FREQ_9"  |"FREQ_10"  |"FREQ_11"  |"FREQ_12"  |"CNT_PER_PDT"  |"CNT_PER_PDT_SFC"  |"CNT_PER_PDT_VFC"  |"NO_DISCOUNT"  |"DISCOUNT_PROMOTION"  |"SUM_DIS_PRO"  |"LABEL"  |"features"                                          |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Check Data structure

In [50]:
snowpark_hottopic_df.schema.fields

[StructField('ACCT_NMB', StringType(), nullable=True),
 StructField('RECENCY_DAY', DoubleType(), nullable=True),
 StructField('FREQUENCY', DoubleType(), nullable=True),
 StructField('MONETORY', DoubleType(), nullable=True),
 StructField('RMF_SCORE', DoubleType(), nullable=True),
 StructField('DOTCOM', DoubleType(), nullable=True),
 StructField('REWARDS_ACCOUNT', DoubleType(), nullable=True),
 StructField('FREQ_1', DoubleType(), nullable=True),
 StructField('FREQ_2', DoubleType(), nullable=True),
 StructField('FREQ_3', DoubleType(), nullable=True),
 StructField('FREQ_4', DoubleType(), nullable=True),
 StructField('FREQ_5', DoubleType(), nullable=True),
 StructField('FREQ_6', DoubleType(), nullable=True),
 StructField('FREQ_7', DoubleType(), nullable=True),
 StructField('FREQ_8', DoubleType(), nullable=True),
 StructField('FREQ_9', DoubleType(), nullable=True),
 StructField('FREQ_10', DoubleType(), nullable=True),
 StructField('FREQ_11', DoubleType(), nullable=True),
 StructField('FREQ_1

### Now using both randomforest and decisiontree classifiers let's predict and infer using SQL.

### You can compare both model inference results side by side using SQL Query completed run on Snowflake

In [53]:
qry="""SELECT 
       substr(ACCT_NMB,5,5) ACCT_NMB,
       LABEL AS ORIGINAL_LABEL,
       predict_rf_hottopic_model(RECENCY_DAY,FREQUENCY, MONETORY,RMF_SCORE,DOTCOM,REWARDS_ACCOUNT,
       FREQ_1,FREQ_2, FREQ_3, FREQ_4, FREQ_5, FREQ_6, FREQ_7, FREQ_8, FREQ_9, FREQ_10, FREQ_11, FREQ_12,
       CNT_PER_PDT, CNT_PER_PDT_SFC, CNT_PER_PDT_VFC, NO_DISCOUNT, DISCOUNT_PROMOTION, SUM_DIS_PRO) as PRED_rf_LABEL,
       predict_dtree_hottopic_model(RECENCY_DAY,FREQUENCY, MONETORY,RMF_SCORE,DOTCOM,REWARDS_ACCOUNT,
       FREQ_1,FREQ_2, FREQ_3, FREQ_4, FREQ_5, FREQ_6, FREQ_7, FREQ_8, FREQ_9, FREQ_10, FREQ_11, FREQ_12,
       CNT_PER_PDT, CNT_PER_PDT_SFC, CNT_PER_PDT_VFC, NO_DISCOUNT, DISCOUNT_PROMOTION, SUM_DIS_PRO) as PRED_dtree_LABEL
        FROM (hot_topic_test_table_1k)"""
predict_snowpark_df = hottopic_session.sql(qry)
predict_snowpark_df.show(1000)

------------------------------------------------------------------------
|"ACCT_NMB"  |"ORIGINAL_LABEL"  |"PRED_RF_LABEL"  |"PRED_DTREE_LABEL"  |
------------------------------------------------------------------------
|00888       |0                 |0.0              |1.0                 |
|01071       |1                 |0.0              |0.0                 |
|01103       |0                 |0.0              |0.0                 |
|01890       |0                 |0.0              |0.0                 |
|01898       |0                 |0.0              |0.0                 |
|02421       |0                 |0.0              |0.0                 |
|02562       |0                 |0.0              |0.0                 |
|04405       |0                 |1.0              |1.0                 |
|05715       |0                 |0.0              |0.0                 |
|06163       |1                 |0.0              |0.0                 |
|06881       |0                 |0.0              |

In [None]:
hottopic_session.close()
print('Finished!!!')