### In this Notebook let's train and infer 3 different classifiers 
### 1. Random forest 
### 2. Decision tree 
### 3. Logistic Regression. 
### The volume of data we will be training is around 15 million records using snowpark optimized warehouse

### Load Snowpark libraries

In [1]:
# Import required libraries
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import avg, sum, col,lit
from snowflake.snowpark.functions import udf, sproc, col
from snowflake.snowpark.types import IntegerType, FloatType, LongType, DoubleType, DecimalType,StringType, BooleanType, Variant
from snowflake.snowpark.types import PandasSeries, PandasDataFrame
from snowflake.snowpark import functions as fn

import sys ,json
import io
import logging
import pandas as pd

import joblib
import pandas as pd
import numpy as np
import json

from snowflake.snowpark import version
print (f"snowflake snowpark version is: {version.VERSION}")

snowflake snowpark version is: (0, 10, 0)


### Connect to Snowflake and establish session

In [2]:
snowflake_connection_cfg = open('cred.json')
snowflake_connection_cfg = snowflake_connection_cfg.read()
snowflake_connection_cfg = json.loads(snowflake_connection_cfg)

'''
APP_WH XS
LAB_WH S
HMWH M optimized warehouse
DCR_MA_WH L
BANK1_WH XL
'''

# Creating Snowpark Session
staples_session = Session.builder.configs(snowflake_connection_cfg).create()
print('Current Database:', staples_session.get_current_database())
print('Current Schema:', staples_session.get_current_schema())
print('Current Warehouse:', staples_session.get_current_warehouse())
print("Warehouse set up:")
staples_session.sql("show warehouses like 'APP_WH'").collect()

Current Database: "BANK1_CRM_DB"
Current Schema: "PUBLIC"
Current Warehouse: "APP_WH"
Warehouse set up:


[Row(name='APP_WH', state='STARTED', type='STANDARD', size='X-Small', min_cluster_count=1, max_cluster_count=1, started_clusters=1, running=0, queued=0, is_default='N', is_current='Y', auto_suspend=600, auto_resume='true', available=' 100', provisioning='0', quiescing='0', other='0', created_on=datetime.datetime(2022, 2, 27, 4, 51, 57, 85000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), resumed_on=datetime.datetime(2022, 12, 15, 18, 19, 18, 443000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), updated_on=datetime.datetime(2022, 12, 15, 18, 19, 18, 443000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), owner='SYSADMIN', comment='', enable_query_acceleration='false', query_acceleration_max_scale_factor=8, resource_monitor='null', actives=1, pendings=0, failed=0, suspended=0, uuid='1463550724', scaling_policy='STANDARD')]

### Create stage location for models

In [3]:
staples_session.sql("CREATE OR REPLACE STAGE staples_stage_models").collect()

[Row(status='Stage area STAPLES_STAGE_MODELS successfully created.')]

In [4]:
staples_session.sql("CREATE OR REPLACE STAGE staples_stage_data").collect()

[Row(status='Stage area STAPLES_STAGE_DATA successfully created.')]

In [5]:
staples_session.clear_packages()
staples_session.add_packages("snowflake-snowpark-python")
staples_session.add_packages("scikit-learn","pandas","numpy","joblib","cachetools")
staples_session.clear_imports()
# staples_session.add_import(ge_import_path)

### Define function to save trained model

In [6]:
def save_file(session, model, path, dest_filename):
    # logger.debug('#save_file: -- START--')
    input_stream = io.BytesIO()
    joblib.dump(model, input_stream)
    session._conn.upload_stream(input_stream, path, dest_filename)
    return "successfully created file: " + path

### Define Features required to train model

In [7]:
features=["RECENCY_DAY",
"FREQUENCY",
"MONETORY",
"RMF_SCORE",
"DOTCOM",
"REWARDS_ACCOUNT",
"FREQ_1",
"FREQ_2",
"FREQ_3",
"FREQ_4",
"FREQ_5",
"FREQ_6",
"FREQ_7",
"FREQ_8",
"FREQ_9",
"FREQ_10",
"FREQ_11",
"FREQ_12",
"CNT_PER_PDT",
"CNT_PER_PDT_SFC",
"CNT_PER_PDT_VFC",
"NO_DISCOUNT",
"DISCOUNT_PROMOTION",
"SUM_DIS_PRO"]

### Define Model pipeline for Imputer, Standard Scaler and Random Classifier Model

In [8]:
def build_rf_model(p_df: pd.DataFrame,ne,nj,cw, md):
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    numeric_features = p_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = p_df.select_dtypes(include=['object']).columns.tolist()

    feature_names = numeric_features + categorical_features

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler(with_mean=True,with_std=True))])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier'
                    ,RandomForestClassifier(n_estimators=ne, n_jobs=-nj, class_weight=cw,max_depth=md)
                    # ,RandomForestClassifier(n_estimators=4, n_jobs=-1, class_weight='balanced_subsample',max_depth=20)
                    # ,RandomForestClassifier(maxBins=20,featureSubsetStrategy='onethird') need to find the equivalents
                    # of these maxBins and featureSubsetStrategy. For featureSubsetStrategy I do think it is the 
                    # classweight from sklearn based on the documentation. I also think maxBins could be the same as
                    # maxdepth.
                )
            ])

    return model


### Define Model pipeline for Imputer, Standard Scaler and Decision Tree Classifier Model

In [9]:
def build_dtree_model(p_df: pd.DataFrame,cw, md):
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.tree import DecisionTreeClassifier
    numeric_features = p_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = p_df.select_dtypes(include=['object']).columns.tolist()

    feature_names = numeric_features + categorical_features

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler(with_mean=True,with_std=True))])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier'
                    ,DecisionTreeClassifier(class_weight=cw,max_depth=md)
                    # ,RandomForestClassifier(n_estimators=4, n_jobs=-1, class_weight='balanced_subsample',max_depth=20)
                    # ,RandomForestClassifier(maxBins=20,featureSubsetStrategy='onethird') need to find the equivalents
                    # of these maxBins and featureSubsetStrategy. For featureSubsetStrategy I do think it is the 
                    # classweight from sklearn based on the documentation. I also think maxBins could be the same as
                    # maxdepth.
                )
            ])

    return model


### Define Model pipeline for Imputer, Standard Scaler and Logistic Regression Model

In [10]:
def build_logres_model(p_df: pd.DataFrame,randomstate):
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.linear_model import LogisticRegression
    numeric_features = p_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = p_df.select_dtypes(include=['object']).columns.tolist()

    feature_names = numeric_features + categorical_features

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler(with_mean=True,with_std=True))])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier'
                    ,LogisticRegression(random_state=randomstate)
                    # ,RandomForestClassifier(n_estimators=4, n_jobs=-1, class_weight='balanced_subsample',max_depth=20)
                    # ,RandomForestClassifier(maxBins=20,featureSubsetStrategy='onethird') need to find the equivalents
                    # of these maxBins and featureSubsetStrategy. For featureSubsetStrategy I do think it is the 
                    # classweight from sklearn based on the documentation. I also think maxBins could be the same as
                    # maxdepth.
                )
            ])

    return model


### Define Classification report to register model output

In [11]:
def get_classification_report(y_test, y_pred):
    from sklearn import metrics
    report = metrics.classification_report(y_test, y_pred, output_dict=True,target_names=['0','1'])
    df_classification_report = pd.DataFrame(report).transpose()    
    return df_classification_report

### Define Model parameteres to register model output

In [12]:
def get_model_info(model_name, test_size, random_state,ne,nj,cw,max_depth):
    data = [[model_name,test_size,random_state,ne,nj,cw,max_depth]]  
    df_model_info = pd.DataFrame(data,columns=['model','test_size','random_state','ne','nj','cw','max_depth'])
    return df_model_info

### Define Train random forest classifier model

In [17]:
# def staples_train_rf_model(session: Session, training_table: str, sample_size_n: int, model_name: str,features:list, Y: str,test_size:float,random_state:int,ne:int,nj:int,cw:str, md:int) -> str:
def staples_train_rf_model(session: Session, training_table: str, model_name: str,features:list, Y: str,test_size:float,random_state:int,ne:int,nj:int,cw:str, md:int) -> str:
    from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix, RocCurveDisplay
    from sklearn import metrics
    # import matplotlib.pyplot as plt
#    training_data = session.table(training_table).sample(n=sample_size_n)
    training_data = session.table(training_table)

    Data_train, Data_test = training_data.random_split([1-test_size, test_size], seed=random_state)
    pd_Data_train=Data_train.to_pandas()
    pd_Data_test=Data_test.to_pandas()
    from sklearn.ensemble import RandomForestClassifier 
    # Model building
    rf = build_rf_model(pd_Data_train[features],ne,nj,cw, md)
    rf.fit(pd_Data_train[features], pd_Data_train[Y])

    model_dir = '@staples_stage_models'
    model_fl = model_name+'.joblib'
    save_file(session, rf, model_dir ,model_fl)

    score = rf.score(pd_Data_test[features], pd_Data_test[Y])
    
    y_pred = rf.predict(pd_Data_test)
    df_classification_report = get_classification_report(y_pred,pd_Data_test[Y]).reset_index().rename(columns={"index": "class"}).reset_index(drop=True)
    df_model_info = get_model_info(model_fl,test_size,random_state,ne,nj,cw,md)
    df_model_info=df_model_info.append([df_model_info]*5,ignore_index=True)
    session.create_dataframe(df_classification_report.join(df_model_info)).write.mode("append").save_as_table("staples_model_output")
    
#     ax = plt.gca()
#     rfc_disp = RocCurveDisplay.from_estimator(rf, pd_Data_test[features], pd_Data_test[Y], ax=ax, alpha=0.8)
#     rfc_disp.plot(ax=ax, alpha=0.8)
    
#     return metrics.plot_roc_curve(rf, pd_Data_test[features], pd_Data_test[Y])
    return df_classification_report.join(df_model_info)

### Define train decision Tree classifier model

In [18]:
#def staples_train_dtree_model(session: Session, training_table: str, sample_size_n: int, model_name: str,features:list, Y: str,test_size:float,random_state:int,cw:str, md:int) -> str:
def staples_train_dtree_model(session: Session, training_table: str, model_name: str,features:list, Y: str,test_size:float,random_state:int,cw:str, md:int) -> str:
    from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix, RocCurveDisplay
    
    # training_data = session.table(training_table).sample(n=sample_size_n)
    training_data = session.table(training_table)

    Data_train, Data_test = training_data.random_split([1-test_size, test_size], seed=random_state)
    pd_Data_train=Data_train.to_pandas()
    pd_Data_test=Data_test.to_pandas()
    from sklearn.ensemble import RandomForestClassifier 
    # Model building
    dtree = build_dtree_model(pd_Data_train[features],cw, md)
    dtree.fit(pd_Data_train[features], pd_Data_train[Y])

    model_dir = '@staples_stage_models'
    model_fl = model_name+'.joblib'
    save_file(session, dtree, model_dir ,model_fl)

    score = dtree.score(pd_Data_test[features], pd_Data_test[Y])
    
    y_pred = dtree.predict(pd_Data_test)
    df_classification_report = get_classification_report(y_pred,pd_Data_test[Y]).reset_index().rename(columns={"index": "class"}).reset_index(drop=True)
    df_model_info = get_model_info(model_fl,test_size,random_state,None,None,cw,md)
    df_model_info=df_model_info.append([df_model_info]*5,ignore_index=True)
    session.create_dataframe(df_classification_report.join(df_model_info)).write.mode("append").save_as_table("staples_model_output")
    
    return df_classification_report.join(df_model_info)

### Define train logistic regression classifier model

In [19]:
def staples_train_logres_model(session: Session, training_table: str, model_name: str,features:list, Y: str,test_size:float,random_state:int) -> str:
    from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix, RocCurveDisplay
    
    training_data = session.table(training_table)

    Data_train, Data_test = training_data.random_split([1-test_size, test_size], seed=random_state)
    pd_Data_train=Data_train.to_pandas()
    pd_Data_test=Data_test.to_pandas()
    from sklearn.linear_model import LogisticRegression
    # Model building
    logres = build_logres_model(pd_Data_train[features],random_state)
    logres.fit(pd_Data_train[features], pd_Data_train[Y])

    # Save Model
    model_dir = '@staples_stage_models'
    model_fl = model_name+'.joblib'
    save_file(session, logres, model_dir ,model_fl)

    # Score Model
    score = logres.score(pd_Data_test[features], pd_Data_test[Y])
    
    y_pred = logres.predict(pd_Data_test)
    
    #Evaluate Metrics
    df_classification_report = get_classification_report(y_pred,pd_Data_test[Y]).reset_index().rename(columns={"index": "class"}).reset_index(drop=True)
    df_model_info = get_model_info(model_fl,test_size,random_state,None,None,None,None)
    df_model_info=df_model_info.append([df_model_info]*5,ignore_index=True)
    session.create_dataframe(df_classification_report.join(df_model_info)).write.mode("append").save_as_table("staples_model_output")
    
    return df_classification_report.join(df_model_info)

### Create Model output tables to save model ouptut

In [20]:
staples_session.sql("create or replace table staples_model_output (class varchar, precision double, recall double, f1score double, support double, model varchar,test_size float, random_state int, ne int, nj int, cw varchar, max_depth int)").collect()

[Row(status='Table STAPLES_MODEL_OUTPUT successfully created.')]

### Define stored proc to register random forest classifier model

In [21]:
# Registering the function as a Stored Procedure
staples_rf_sproc = staples_session.sproc.register(func=staples_train_rf_model, # training function defined above
                                            name='staples_train_rf_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@staples_stage_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib']) # import model libaries


### Define Stored Proc to register decision tree classifier model

In [22]:
# Registering the function as a Stored Procedure
staples_dtree_sproc = staples_session.sproc.register(func=staples_train_dtree_model, # training function defined above
                                            name='staples_train_dtree_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@staples_stage_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib']) # import model libaries


### Define Stored Proc to register Logistic regression classifier model

In [23]:
# Registering the function as a Stored Procedure
staples_logres_sproc = staples_session.sproc.register(func=staples_train_logres_model, # training function defined above
                                            name='staples_train_logres_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@staples_stage_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib']) # import model libaries


### Train with 15Million row table¶

In [25]:
staples_session.sql("select to_varchar(count(*), '999,999,999,999.00') as train_set_count from staples_data_train_15M").show()

-----------------------
|"TRAIN_SET_COUNT"    |
-----------------------
|      15,002,000.00  |
-----------------------



### Train  Random Forest classifier, Decision Tree Classifier  and Logistic Regression model in Snowflake through registered Stored Procs and capture model output in a snowflake table

### All the above steps are just definition and Registration of component
### The below training runs completely on Snowflake and you can go check in history tab
### I have changed the warehouse HMWH to suit the training needs

In [26]:
staples_session.sql("use warehouse HMWH").collect()
print('Current Warehouse:', staples_session.get_current_warehouse())
print("Warehouse set up:")
staples_session.sql("show warehouses like 'HMWH'").collect()

Current Warehouse: "HMWH"
Warehouse set up:


[Row(name='HMWH', state='SUSPENDED', type='SNOWPARK-OPTIMIZED', size='Medium', min_cluster_count=1, max_cluster_count=1, started_clusters=0, running=0, queued=0, is_default='N', is_current='Y', auto_suspend=300, auto_resume='true', available='', provisioning='', quiescing='', other='', created_on=datetime.datetime(2022, 11, 1, 18, 18, 59, 3000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), resumed_on=datetime.datetime(2022, 12, 14, 23, 48, 0, 173000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), updated_on=datetime.datetime(2022, 12, 14, 23, 48, 0, 173000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), owner='SYSADMIN', comment='warehouse used by role dev_blogger', enable_query_acceleration='false', query_acceleration_max_scale_factor=8, resource_monitor='null', actives=0, pendings=0, failed=0, suspended=1, uuid='1463550932', scaling_policy='STANDARD')]

In [28]:
table_name = 'staples_data_train_15M'
test_size = 0.25
max_depth = 25
model_name = 'rf_staples_model'
random_state = 43,
n_estimator = 4
n_jobs = 1,
class_weight = 'balanced_subsample'
print ("random forest classifier report")
print (staples_rf_sproc(table_name
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43
                ,n_estimator
                ,1
                ,class_weight
                , max_depth))

random forest classifier report
          class  precision    recall  ...  nj                  cw max_depth
0             0   0.665814  0.750339  ...   1  balanced_subsample        25
1             1   0.717696  0.627604  ...   1  balanced_subsample        25
2      accuracy   0.688626  0.688626  ...   1  balanced_subsample        25
3     macro avg   0.691755  0.688972  ...   1  balanced_subsample        25
4  weighted avg   0.691901  0.688626  ...   1  balanced_subsample        25

[5 rows x 12 columns]


In [29]:
table_name = 'staples_data_train_15M'
test_size = 0.25
max_depth = 25
model_name = 'dtree_staples_model'
random_state = 43,
class_weight = 'balanced'
print ("decision tree classifier report")
print (staples_rf_sproc(table_name
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43,
                4,
                1
                ,class_weight
                , max_depth))

decision tree classifier report
          class  precision    recall  f1-score  ...  ne nj        cw  max_depth
0             0   0.656793  0.747357  0.699155  ...   4  1  balanced         25
1             1   0.717216  0.621322  0.665834  ...   4  1  balanced         25
2      accuracy   0.683368  0.683368  0.683368  ...   4  1  balanced         25
3     macro avg   0.687004  0.684339  0.682494  ...   4  1  balanced         25
4  weighted avg   0.687470  0.683368  0.682238  ...   4  1  balanced         25

[5 rows x 12 columns]


In [30]:
table_name = 'staples_data_train_15M'
test_size = 0.25
model_name = 'logres_staples_model'
random_state = 43,
class_weight = 'balanced'
print ("Logistic Regression classifier report")
print (staples_logres_sproc(table_name
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43))

Logistic Regression classifier report
          class  precision    recall  f1-score  ...    ne    nj    cw  max_depth
0             0   0.800532  0.722841  0.759706  ...  None  None  None       None
1             1   0.609302  0.705869  0.654040  ...  None  None  None       None
2      accuracy   0.716395  0.716395  0.716395  ...  None  None  None       None
3     macro avg   0.704917  0.714355  0.706873  ...  None  None  None       None
4  weighted avg   0.727906  0.716395  0.719575  ...  None  None  None       None

[5 rows x 12 columns]


### Let's change the warehouse back to APP_WH which is size XS

In [32]:
staples_session.sql("use warehouse app_wh").collect()

[Row(status='Statement executed successfully.')]

### Training results stored in snowflake table

In [36]:
staples_session.sql("""select class,precision,recall,f1score,model from staples_model_output order by f1score desc""").show(100)

-------------------------------------------------------------------------------------------------------------
|"CLASS"       |"PRECISION"         |"RECALL"            |"F1SCORE"           |"MODEL"                      |
-------------------------------------------------------------------------------------------------------------
|0             |0.8005320731950629  |0.7228412196384562  |0.7597055598522422  |logres_staples_model.joblib  |
|weighted avg  |0.7279057500677809  |0.7163953910387709  |0.7195754499547652  |logres_staples_model.joblib  |
|accuracy      |0.7163953910387709  |0.7163953910387709  |0.7163953910387709  |logres_staples_model.joblib  |
|macro avg     |0.7049171331408622  |0.7143550745519367  |0.7068729519630736  |logres_staples_model.joblib  |
|0             |0.665814139981582   |0.7503392319466926  |0.705554186878469   |rf_staples_model.joblib      |
|0             |0.6567933532261986  |0.7473570906854266  |0.6991546693262326  |dtree_staples_model.joblib   |
|accuracy 

### Check the Trained models as Queries in Snowflake and the duration it took

In [50]:
qry = '''select query_id,
       substr(query_text,55,20) model,
       warehouse_name,
       execution_status,
       TIMESTAMPDIFF(second,start_time,end_time) total_duration_in_secs
from table(information_schema.QUERY_HISTORY_BY_WAREHOUSE('HMWH'))
WHERE query_text like '%CALL%' and execution_status = 'SUCCESS'
order by start_time desc'''
staples_session.sql(qry).show(3)

----------------------------------------------------------------------------------------------------------------------------------
|"QUERY_ID"                            |"MODEL"               |"WAREHOUSE_NAME"  |"EXECUTION_STATUS"  |"TOTAL_DURATION_IN_SECS"  |
----------------------------------------------------------------------------------------------------------------------------------
|01a8ff6e-0402-5016-0057-3c030165c1f6  |M', 'logres_staples_  |HMWH              |SUCCESS             |245                       |
|01a8ff6b-0402-4ffa-0057-3c030165b42a  |'dtree_staples_model  |HMWH              |SUCCESS             |167                       |
|01a8ff67-0402-5016-0057-3c030165c11e  |'rf_staples_model',   |HMWH              |SUCCESS             |230                       |
----------------------------------------------------------------------------------------------------------------------------------



### Check if the classifier models are saved in stage location.
### Remember if the same model name was used for all the iterations, then only the last trained model will be saved for the model name.

In [51]:
staples_session.sql("list @staples_stage_models").collect()

[Row(name='staples_stage_models/dtree_staples_model.joblib', size=926096, md5='efc562fb55a22e5fa4615837c5f4ffa7', last_modified='Fri, 16 Dec 2022 02:54:07 GMT'),
 Row(name='staples_stage_models/logres_staples_model.joblib', size=367632, md5='61448dad56719781e377281aae8ba9b0', last_modified='Fri, 16 Dec 2022 02:58:13 GMT'),
 Row(name='staples_stage_models/rf_staples_model.joblib', size=1014736, md5='f71ee4e3519dc67b451bf07abe28a0cb', last_modified='Fri, 16 Dec 2022 02:51:03 GMT')]

### Define UDF for Inference

### for Randomforest classifier

In [52]:
import sys
import cachetools
import os
from snowflake.snowpark.functions import udf
staples_session.add_import("@staples_stage_models/rf_staples_model.joblib")  

@cachetools.cached(cache={})
def read_file(filename):
       import_dir = sys._xoptions.get("snowflake_import_directory")
       if import_dir:
              with open(os.path.join(import_dir, filename), 'rb') as file:
                     m = joblib.load(file)
                     return m

@udf(name="predict_rf_staples_model", is_permanent=True, stage_location="@staples_stage_models", replace=True)
def predict(RECENCY_DAY: float, FREQUENCY: float, MONETORY: float, RMF_SCORE: float, DOTCOM: float, REWARDS_ACCOUNT: float,
            FREQ_1: float, FREQ_2: float, FREQ_3: float, FREQ_4: float, FREQ_5: float, FREQ_6: float, FREQ_7: float, 
            FREQ_8: float, FREQ_9: float, FREQ_10: float, FREQ_11: float, FREQ_12: float, CNT_PER_PDT: float, 
            CNT_PER_PDT_SFC: float, CNT_PER_PDT_VFC: float, NO_DISCOUNT: float, DISCOUNT_PROMOTION: float,
            SUM_DIS_PRO: float) -> float:
       m = read_file('rf_staples_model.joblib')       
       row = pd.DataFrame([locals()], columns=features)
       return m.predict(row)[0]

### for Decision Tree classifier

In [54]:

import sys
import cachetools
import os
from snowflake.snowpark.functions import udf
staples_session.add_import("@staples_stage_models/dtree_staples_model.joblib")  

@cachetools.cached(cache={})
def read_file(filename):
       import_dir = sys._xoptions.get("snowflake_import_directory")
       if import_dir:
              with open(os.path.join(import_dir, filename), 'rb') as file:
                     m = joblib.load(file)
                     return m

@udf(name="predict_dtree_staples_model", is_permanent=True, stage_location="@staples_stage_models", replace=True)
def predict(RECENCY_DAY: float, FREQUENCY: float, MONETORY: float, RMF_SCORE: float, DOTCOM: float, REWARDS_ACCOUNT: float,
            FREQ_1: float, FREQ_2: float, FREQ_3: float, FREQ_4: float, FREQ_5: float, FREQ_6: float, FREQ_7: float, 
            FREQ_8: float, FREQ_9: float, FREQ_10: float, FREQ_11: float, FREQ_12: float, CNT_PER_PDT: float, 
            CNT_PER_PDT_SFC: float, CNT_PER_PDT_VFC: float, NO_DISCOUNT: float, DISCOUNT_PROMOTION: float, 
            SUM_DIS_PRO: float) -> float:
       m = read_file('dtree_staples_model.joblib')       
       row = pd.DataFrame([locals()], columns=features)
       return m.predict(row)[0]

### for Logistic regression

In [55]:
import sys
import cachetools
import os
from snowflake.snowpark.functions import udf
staples_session.add_import("@staples_stage_models/logres_staples_model.joblib")  

@cachetools.cached(cache={})
def read_file(filename):
       import_dir = sys._xoptions.get("snowflake_import_directory")
       if import_dir:
              with open(os.path.join(import_dir, filename), 'rb') as file:
                     m = joblib.load(file)
                     return m

@udf(name="predict_logres_staples_model", is_permanent=True, stage_location="@staples_stage_models", replace=True)
def predict(RECENCY_DAY: float, FREQUENCY: float, MONETORY: float, RMF_SCORE: float, DOTCOM: float, REWARDS_ACCOUNT: float,
            FREQ_1: float, FREQ_2: float, FREQ_3: float, FREQ_4: float, FREQ_5: float, FREQ_6: float, FREQ_7: float, 
            FREQ_8: float, FREQ_9: float, FREQ_10: float, FREQ_11: float, FREQ_12: float, CNT_PER_PDT: float, 
            CNT_PER_PDT_SFC: float, CNT_PER_PDT_VFC: float, NO_DISCOUNT: float, DISCOUNT_PROMOTION: float,
            SUM_DIS_PRO: float) -> float:
       m = read_file('logres_staples_model.joblib')       
       row = pd.DataFrame([locals()], columns=features)
       return m.predict(row)[0]

In [56]:
features = ['RECENCY_DAY',
'FREQUENCY',
'MONETORY',
'RMF_SCORE',
'DOTCOM',
'REWARDS_ACCOUNT',
'FREQ_1',
'FREQ_2',
'FREQ_3',
'FREQ_4',
'FREQ_5',
'FREQ_6',
'FREQ_7',
'FREQ_8',
'FREQ_9',
'FREQ_10',
'FREQ_11',
'FREQ_12',
'CNT_PER_PDT',
'CNT_PER_PDT_SFC',
'CNT_PER_PDT_VFC',
'NO_DISCOUNT',
'DISCOUNT_PROMOTION',
'SUM_DIS_PRO',
'LABEL']

### Load Dataset to snowflake table for inference

In [None]:
data_train_df = pd.read_csv("Data_train.csv",header=None,delimiter=',')
cols = ['ACCT_NMB','RECENCY_DAY',
'FREQUENCY',
'MONETORY',
'RMF_SCORE',
'DOTCOM',
'REWARDS_ACCOUNT',
'FREQ_1',
'FREQ_2',
'FREQ_3',
'FREQ_4',
'FREQ_5',
'FREQ_6',
'FREQ_7',
'FREQ_8',
'FREQ_9',
'FREQ_10',
'FREQ_11',
'FREQ_12',
'CNT_PER_PDT',
'CNT_PER_PDT_SFC',
'CNT_PER_PDT_VFC',
'NO_DISCOUNT',
'DISCOUNT_PROMOTION',
'SUM_DIS_PRO',
'LABEL',
'features']
staple_table_df = pd.DataFrame(data_train_df[1:].values,columns=cols)
staple_table_df = staple_table_df.astype({"ACCT_NMB": str, 
                             "RECENCY_DAY": float, 
                             "FREQUENCY": float,
                             "MONETORY": float,
                             "RMF_SCORE": float,
                             "DOTCOM": float,
                             "REWARDS_ACCOUNT": float,
                             "FREQ_1": float,
                             "FREQ_2": float,
                             "FREQ_3": float,
                             "FREQ_4": float,
                             "FREQ_5": float,
                             "FREQ_6": float,
                             "FREQ_7": float,
                             "FREQ_8": float,
                             "FREQ_9": float,
                             "FREQ_10": float,
                             "FREQ_11": float,
                             "FREQ_12": float,
                             "CNT_PER_PDT": float,
                             "CNT_PER_PDT_SFC": float,
                             "CNT_PER_PDT_VFC": float,
                             "NO_DISCOUNT": float,
                             "DISCOUNT_PROMOTION": float,
                             "LABEL": int,
                             "features": str
                            }
                           )
staples_session.create_dataframe(staple_table_df).write.mode("overwrite").save_as_table("staples_data_train_15M")

### Check sample Data

In [57]:
snowpark_staples_df = staples_session.table('staples_data_train_15M')
snowpark_staples_df.show(1)

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ACCT_NMB"  |"RECENCY_DAY"  |"FREQUENCY"  |"MONETORY"  |"RMF_SCORE"  |"DOTCOM"  |"REWARDS_ACCOUNT"  |"FREQ_1"  |"FREQ_2"  |"FREQ_3"  |"FREQ_4"  |"FREQ_5"  |"FREQ_6"  |"FREQ_7"  |"FREQ_8"  |"FREQ_9"  |"FREQ_10"  |"FREQ_11"  |"FREQ_12"  |"CNT_PER_PDT"  |"CNT_PER_PDT_SFC"  |"CNT_PER_PDT_VFC"  |"NO_DISCOUNT"  |"DISCOUNT_PROMOTION"  |"SUM_DIS_PRO"  |"LABEL"  |"features"                                          |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Check Data structure

In [58]:
snowpark_staples_df.schema.fields

[StructField('ACCT_NMB', StringType(), nullable=True),
 StructField('RECENCY_DAY', DoubleType(), nullable=True),
 StructField('FREQUENCY', DoubleType(), nullable=True),
 StructField('MONETORY', DoubleType(), nullable=True),
 StructField('RMF_SCORE', DoubleType(), nullable=True),
 StructField('DOTCOM', DoubleType(), nullable=True),
 StructField('REWARDS_ACCOUNT', DoubleType(), nullable=True),
 StructField('FREQ_1', DoubleType(), nullable=True),
 StructField('FREQ_2', DoubleType(), nullable=True),
 StructField('FREQ_3', DoubleType(), nullable=True),
 StructField('FREQ_4', DoubleType(), nullable=True),
 StructField('FREQ_5', DoubleType(), nullable=True),
 StructField('FREQ_6', DoubleType(), nullable=True),
 StructField('FREQ_7', DoubleType(), nullable=True),
 StructField('FREQ_8', DoubleType(), nullable=True),
 StructField('FREQ_9', DoubleType(), nullable=True),
 StructField('FREQ_10', DoubleType(), nullable=True),
 StructField('FREQ_11', DoubleType(), nullable=True),
 StructField('FREQ_1

### Now using all the models we trained randomforest, decisiontree and logistic regression classifiers let's predict and infer using SQL.

### You can compare all model inference results side by side using SQL Query completed run on Snowflake

In [59]:
qry="""SELECT 
       ACCT_NMB,
       LABEL AS ORIGINAL_LABEL,
       predict_rf_staples_model(RECENCY_DAY,FREQUENCY, MONETORY,RMF_SCORE,DOTCOM,REWARDS_ACCOUNT,
       FREQ_1,FREQ_2, FREQ_3, FREQ_4, FREQ_5, FREQ_6, FREQ_7, FREQ_8, FREQ_9, FREQ_10, FREQ_11, FREQ_12,
       CNT_PER_PDT, CNT_PER_PDT_SFC, CNT_PER_PDT_VFC, NO_DISCOUNT, DISCOUNT_PROMOTION, SUM_DIS_PRO) as PRED_rf_LABEL,
       predict_dtree_staples_model(RECENCY_DAY,FREQUENCY, MONETORY,RMF_SCORE,DOTCOM,REWARDS_ACCOUNT,
       FREQ_1,FREQ_2, FREQ_3, FREQ_4, FREQ_5, FREQ_6, FREQ_7, FREQ_8, FREQ_9, FREQ_10, FREQ_11, FREQ_12,
       CNT_PER_PDT, CNT_PER_PDT_SFC, CNT_PER_PDT_VFC, NO_DISCOUNT, DISCOUNT_PROMOTION, SUM_DIS_PRO) as PRED_dtree_LABEL,
       predict_logres_staples_model(RECENCY_DAY,FREQUENCY, MONETORY,RMF_SCORE,DOTCOM,REWARDS_ACCOUNT,
       FREQ_1,FREQ_2, FREQ_3, FREQ_4, FREQ_5, FREQ_6, FREQ_7, FREQ_8, FREQ_9, FREQ_10, FREQ_11, FREQ_12,
       CNT_PER_PDT, CNT_PER_PDT_SFC, CNT_PER_PDT_VFC, NO_DISCOUNT, DISCOUNT_PROMOTION, SUM_DIS_PRO) as PRED_logres_LABEL
        FROM (staples_data_train)"""
predict_snowpark_df = staples_session.sql(qry)
predict_snowpark_df.show(1000)

----------------------------------------------------------------------------------------------
|"ACCT_NMB"  |"ORIGINAL_LABEL"  |"PRED_RF_LABEL"  |"PRED_DTREE_LABEL"  |"PRED_LOGRES_LABEL"  |
----------------------------------------------------------------------------------------------
|0000008888  |0                 |1.0              |1.0                 |1.0                  |
|0000010710  |1                 |0.0              |0.0                 |0.0                  |
|0000011031  |0                 |0.0              |0.0                 |0.0                  |
|0000018903  |0                 |0.0              |0.0                 |0.0                  |
|0000018986  |0                 |0.0              |0.0                 |0.0                  |
|0000024216  |0                 |0.0              |0.0                 |0.0                  |
|0000025627  |0                 |0.0              |0.0                 |0.0                  |
|0000044057  |0                 |1.0              

In [60]:
staples_session.close()
print('Finished!!!')

Finished!!!
