### Load Snowpark libraries

In [1]:
# Import required libraries
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import avg, sum, col,lit
from snowflake.snowpark.functions import udf, sproc, col
from snowflake.snowpark.types import IntegerType, FloatType, LongType, DoubleType, DecimalType,StringType, BooleanType, Variant
from snowflake.snowpark.types import PandasSeries, PandasDataFrame
from snowflake.snowpark import functions as fn

import sys ,json
import io
import logging
import pandas as pd

import joblib
import pandas as pd
import numpy as np
import json

from snowflake.snowpark import version
print (f"snowflake snowpark version is: {version.VERSION}")

snowflake snowpark version is: (0, 10, 0)


### Connect to Snowflake and establish session

In [2]:
snowflake_connection_cfg = open('cred.json')
snowflake_connection_cfg = snowflake_connection_cfg.read()
snowflake_connection_cfg = json.loads(snowflake_connection_cfg)

# Creating Snowpark Session
rf_session = Session.builder.configs(snowflake_connection_cfg).create()
print('Current Database:', rf_session.get_current_database())
print('Current Schema:', rf_session.get_current_schema())
print('Current Warehouse:', rf_session.get_current_warehouse())

Current Database: "BANK1_CRM_DB"
Current Schema: "PUBLIC"
Current Warehouse: "APP_WH"


### Create stage location for models

In [3]:
rf_session.sql("CREATE OR REPLACE STAGE stage_models").collect()

[Row(status='Stage area STAGE_MODELS successfully created.')]

In [4]:
rf_session.clear_packages()
rf_session.add_packages("snowflake-snowpark-python")
rf_session.add_packages("scikit-learn","pandas","numpy","joblib","cachetools")
rf_session.clear_imports()
# rf_session.add_import(ge_import_path)

### Define function to save trained model

In [5]:
def save_file(session, model, path, dest_filename):
    # logger.debug('#save_file: -- START--')
    input_stream = io.BytesIO()
    joblib.dump(model, input_stream)
    session._conn.upload_stream(input_stream, path, dest_filename)
    return "successfully created file: " + path

### Define Features required to train model

In [6]:
features=['SEPTAL_LENGTH','SEPTAL_WIDTH','PETAL_LENGTH','PETAL_WIDTH']

### Define Model pipeline for Imputer, Standard Scaler and Random Classifier Model

In [7]:
def build_rf_model(p_df: pd.DataFrame,ne,nj,cw, md):
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    numeric_features = p_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = p_df.select_dtypes(include=['object']).columns.tolist()

    feature_names = numeric_features + categorical_features

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler(with_mean=True,with_std=True))])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier'
                    ,RandomForestClassifier(n_estimators=ne, n_jobs=-nj, class_weight=cw,max_depth=md)
                    # ,RandomForestClassifier(n_estimators=4, n_jobs=-1, class_weight='balanced_subsample',max_depth=20)
                    # ,RandomForestClassifier(maxBins=20,featureSubsetStrategy='onethird') need to find the equivalents
                    # of these maxBins and featureSubsetStrategy. For featureSubsetStrategy I do think it is the 
                    # classweight from sklearn based on the documentation. I also think maxBins could be the same as
                    # maxdepth.
                )
            ])

    return model


In [8]:
def build_dtree_model(p_df: pd.DataFrame,cw, md):
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.tree import DecisionTreeClassifier
    numeric_features = p_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = p_df.select_dtypes(include=['object']).columns.tolist()

    feature_names = numeric_features + categorical_features

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler(with_mean=True,with_std=True))])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier'
                    ,DecisionTreeClassifier(class_weight=cw,max_depth=md)
                    # ,RandomForestClassifier(n_estimators=4, n_jobs=-1, class_weight='balanced_subsample',max_depth=20)
                    # ,RandomForestClassifier(maxBins=20,featureSubsetStrategy='onethird') need to find the equivalents
                    # of these maxBins and featureSubsetStrategy. For featureSubsetStrategy I do think it is the 
                    # classweight from sklearn based on the documentation. I also think maxBins could be the same as
                    # maxdepth.
                )
            ])

    return model


### Define Classification report to register model output

In [9]:
def get_classification_report(y_test, y_pred):
    from sklearn import metrics
    report = metrics.classification_report(y_test, y_pred, output_dict=True,target_names=['setosa', 'versicolor', 'virginica'])
    df_classification_report = pd.DataFrame(report).transpose()    
    return df_classification_report

### Define Model parameteres to register model output

In [10]:
def get_model_info(model_name, test_size, random_state,ne,nj,cw,max_depth):
    data = [[model_name,test_size,random_state,ne,nj,cw,max_depth]]  
    df_model_info = pd.DataFrame(data,columns=['model','test_size','random_state','ne','nj','cw','max_depth'])
    return df_model_info

### Define Train random forest classifier model

In [11]:
def train_rf_model(session: Session, training_table: str, sample_size_n: int, model_name: str,features:list, Y: str,test_size:float,random_state:int,ne:int,nj:int,cw:str, md:int) -> str:
    from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    training_data = session.table(training_table).sample(n=sample_size_n)
    Data_train, Data_test = training_data.random_split([1-test_size, test_size], seed=random_state)
    pd_Data_train=Data_train.to_pandas()
    pd_Data_test=Data_test.to_pandas()
    from sklearn.ensemble import RandomForestClassifier 
    # Model building
    rf = build_rf_model(pd_Data_train[features],ne,nj,cw, md)
    rf.fit(pd_Data_train[features], pd_Data_train[Y])

    model_dir = '@stage_models'
    model_fl = model_name+'.joblib'
    save_file(session, rf, model_dir ,model_fl)

    score = rf.score(pd_Data_test[features], pd_Data_test[Y])
    
    y_pred = rf.predict(pd_Data_test)
    df_classification_report = get_classification_report(y_pred,pd_Data_test[Y]).reset_index().rename(columns={"index": "class"}).reset_index(drop=True)
    df_model_info = get_model_info(model_fl,test_size,random_state,ne,nj,cw,md)
    df_model_info=df_model_info.append([df_model_info]*5,ignore_index=True)
    session.create_dataframe(df_classification_report.join(df_model_info)).write.mode("append").save_as_table("model_output")
    
    return df_classification_report.join(df_model_info)

### Define train decision Tree classifier model

In [12]:
def train_dtree_model(session: Session, training_table: str, sample_size_n: int, model_name: str,features:list, Y: str,test_size:float,random_state:int,cw:str, md:int) -> str:
    from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    training_data = session.table(training_table).sample(n=sample_size_n)
    Data_train, Data_test = training_data.random_split([1-test_size, test_size], seed=random_state)
    pd_Data_train=Data_train.to_pandas()
    pd_Data_test=Data_test.to_pandas()
    from sklearn.ensemble import RandomForestClassifier 
    # Model building
    dtree = build_dtree_model(pd_Data_train[features],cw, md)
    dtree.fit(pd_Data_train[features], pd_Data_train[Y])

    model_dir = '@stage_models'
    model_fl = model_name+'.joblib'
    save_file(session, dtree, model_dir ,model_fl)

    score = dtree.score(pd_Data_test[features], pd_Data_test[Y])
    
    y_pred = dtree.predict(pd_Data_test)
    df_classification_report = get_classification_report(y_pred,pd_Data_test[Y]).reset_index().rename(columns={"index": "class"}).reset_index(drop=True)
    df_model_info = get_model_info(model_fl,test_size,random_state,None,None,cw,md)
    df_model_info=df_model_info.append([df_model_info]*5,ignore_index=True)
    session.create_dataframe(df_classification_report.join(df_model_info)).write.mode("append").save_as_table("model_output")
    
    return df_classification_report.join(df_model_info)

### Create Model output tables to save model ouptut

In [13]:
rf_session.sql("create or replace table model_output (class varchar, precision double, recall double, f1score double, support double, model varchar,test_size float, random_state int, ne int, nj int, cw varchar, max_depth int)").collect()

[Row(status='Table MODEL_OUTPUT successfully created.')]

### Define stored proc to register random forest classifier model

In [14]:
# Registering the function as a Stored Procedure
rf_sproc = rf_session.sproc.register(func=train_rf_model, # training function defined above
                                            name='train_rf_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@stage_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib']) # import model libaries


### Define Stored Proc to register decision tree classifier model

In [16]:
# Registering the function as a Stored Procedure
dtree_sproc = rf_session.sproc.register(func=train_dtree_model, # training function defined above
                                            name='train_dtree_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@stage_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib']) # import model libaries


### Train  Random Forest classifier and Decision Tree Classifier in Snowflake through registered Stored Procs and capture model output in a snowflake table

### All the above steps are just definition and Registration of component
### The below training runs completely on Snowflake and you can go check in history tab

In [32]:
table_name = 'iris_dataset'
sample = 100,
test_size = 0.1
max_depth = 15
model_name = 'rf_iris_model'
random_state = 43,
n_estimator = 4
n_jobs = 1,
class_weight = 'balanced_subsample'
print ("random forest classifier report")
print (rf_sproc(table_name
                ,150
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43
                ,n_estimator
                ,1
                ,class_weight
                , max_depth))

random forest classifier report
          class  precision  recall  ...  nj                  cw max_depth
0        setosa        1.0     1.0  ...   1  balanced_subsample        15
1    versicolor        1.0     1.0  ...   1  balanced_subsample        15
2     virginica        1.0     1.0  ...   1  balanced_subsample        15
3      accuracy        1.0     1.0  ...   1  balanced_subsample        15
4     macro avg        1.0     1.0  ...   1  balanced_subsample        15
5  weighted avg        1.0     1.0  ...   1  balanced_subsample        15

[6 rows x 12 columns]


In [17]:
table_name = 'iris_dataset'
sample = 100,
test_size = 0.1
max_depth = 15
model_name = 'rf_iris_model_'+'ts'+str(test_size)+'_'+'md'+str(max_depth)
random_state = 43,
n_estimator = 4
n_jobs = 1,
class_weight = 'balanced_subsample'
print ("random forest classifier report")
print (rf_sproc(table_name
                ,150
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43
                ,n_estimator
                ,1
                ,class_weight
                , max_depth))

random forest classifier report
          class  precision  recall  ...  nj                  cw max_depth
0        setosa        1.0     1.0  ...   1  balanced_subsample        15
1    versicolor        1.0     1.0  ...   1  balanced_subsample        15
2     virginica        1.0     1.0  ...   1  balanced_subsample        15
3      accuracy        1.0     1.0  ...   1  balanced_subsample        15
4     macro avg        1.0     1.0  ...   1  balanced_subsample        15
5  weighted avg        1.0     1.0  ...   1  balanced_subsample        15

[6 rows x 12 columns]


In [18]:
table_name = 'iris_dataset'
sample = 150,
test_size = 0.25
max_depth = 20
model_name = 'rf_iris_model_'+'ts'+str(test_size)+'_'+'md'+str(max_depth)
random_state = 43,
n_estimator = 4
n_jobs = 1,
class_weight = 'balanced_subsample'
print ("random forest classifier report")
print (rf_sproc(table_name
                ,100
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43
                ,n_estimator
                ,1
                ,class_weight
                , max_depth))

random forest classifier report
          class  precision    recall  ...  nj                  cw max_depth
0        setosa   1.000000  1.000000  ...   1  balanced_subsample        20
1    versicolor   0.900000  0.900000  ...   1  balanced_subsample        20
2     virginica   0.875000  0.875000  ...   1  balanced_subsample        20
3      accuracy   0.925926  0.925926  ...   1  balanced_subsample        20
4     macro avg   0.925000  0.925000  ...   1  balanced_subsample        20
5  weighted avg   0.925926  0.925926  ...   1  balanced_subsample        20

[6 rows x 12 columns]


In [19]:
table_name = 'iris_dataset'
sample = 150,
test_size = 0.25
max_depth = 25
model_name = 'rf_iris_model_'+'ts'+str(test_size)+'_'+'md'+str(max_depth)
random_state = 43,
n_estimator = 4
n_jobs = 1,
class_weight = 'balanced_subsample'
print ("random forest classifier report")
print (rf_sproc(table_name
                ,100
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43
                ,n_estimator
                ,1
                ,class_weight
                , max_depth))

random forest classifier report
          class  precision    recall  ...  nj                  cw max_depth
0        setosa   1.000000  1.000000  ...   1  balanced_subsample        25
1    versicolor   0.833333  0.909091  ...   1  balanced_subsample        25
2     virginica   0.800000  0.666667  ...   1  balanced_subsample        25
3      accuracy   0.888889  0.888889  ...   1  balanced_subsample        25
4     macro avg   0.877778  0.858586  ...   1  balanced_subsample        25
5  weighted avg   0.887654  0.888889  ...   1  balanced_subsample        25

[6 rows x 12 columns]


In [37]:
table_name = 'iris_dataset'
sample = 150,
test_size = 0.25
max_depth = 20
model_name = 'dtree_iris_model'
random_state = 43,
class_weight = 'balanced'
print ("decision tree classifier report")
print (rf_sproc(table_name
                ,150
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43,
                4,
                1
                ,class_weight
                , max_depth))

decision tree classifier report
          class  precision    recall  f1-score  ...  ne nj        cw  max_depth
0        setosa   1.000000  1.000000  1.000000  ...   4  1  balanced         20
1    versicolor   0.857143  1.000000  0.923077  ...   4  1  balanced         20
2     virginica   1.000000  0.777778  0.875000  ...   4  1  balanced         20
3      accuracy   0.941176  0.941176  0.941176  ...   4  1  balanced         20
4     macro avg   0.952381  0.925926  0.932692  ...   4  1  balanced         20
5  weighted avg   0.949580  0.941176  0.939762  ...   4  1  balanced         20

[6 rows x 12 columns]


In [20]:
table_name = 'iris_dataset'
sample = 150,
test_size = 0.25
max_depth = 20
model_name = 'dtree_iris_model_'+'ts'+str(test_size)+'_'+'md'+str(max_depth)
random_state = 43,
class_weight = 'balanced'
print ("decision tree classifier report")
print (rf_sproc(table_name
                ,150
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43,
                4,
                1
                ,class_weight
                , max_depth))

decision tree classifier report
          class  precision    recall  f1-score  ...  ne nj        cw  max_depth
0        setosa   1.000000  1.000000  1.000000  ...   4  1  balanced         20
1    versicolor   0.928571  1.000000  0.962963  ...   4  1  balanced         20
2     virginica   1.000000  0.875000  0.933333  ...   4  1  balanced         20
3      accuracy   0.970588  0.970588  0.970588  ...   4  1  balanced         20
4     macro avg   0.976190  0.958333  0.965432  ...   4  1  balanced         20
5  weighted avg   0.972689  0.970588  0.970153  ...   4  1  balanced         20

[6 rows x 12 columns]


In [25]:
table_name = 'iris_dataset'
sample = 100,
test_size = 0.25
max_depth = 25
model_name = 'dtree_iris_model_'+'ts'+str(test_size)+'_'+'md'+str(max_depth)
random_state = 43,
class_weight = 'balanced'
print ("decision tree classifier report")
print (dtree_sproc(table_name
                ,100
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43
                ,class_weight
                , max_depth))

decision tree classifier report
          class  precision    recall  f1-score  ...    ne    nj        cw  max_depth
0        setosa   1.000000  1.000000  1.000000  ...  None  None  balanced         25
1    versicolor   0.750000  0.750000  0.750000  ...  None  None  balanced         25
2     virginica   0.857143  0.857143  0.857143  ...  None  None  balanced         25
3      accuracy   0.851852  0.851852  0.851852  ...  None  None  balanced         25
4     macro avg   0.869048  0.869048  0.869048  ...  None  None  balanced         25
5  weighted avg   0.851852  0.851852  0.851852  ...  None  None  balanced         25

[6 rows x 12 columns]


In [29]:
table_name = 'iris_dataset'
sample = 100,
test_size = 0.25
max_depth = 30
model_name = 'dtree_iris_model_'+'ts'+str(test_size)+'_'+'md'+str(max_depth)
random_state = 43,
class_weight = 'balanced'
print ("decision tree classifier report")
print (dtree_sproc(table_name
                ,100
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43
                ,class_weight
                , max_depth))

decision tree classifier report
          class  precision    recall  f1-score  ...    ne    nj        cw  max_depth
0        setosa   1.000000  1.000000  1.000000  ...  None  None  balanced         30
1    versicolor   0.900000  1.000000  0.947368  ...  None  None  balanced         30
2     virginica   1.000000  0.909091  0.952381  ...  None  None  balanced         30
3      accuracy   0.962963  0.962963  0.962963  ...  None  None  balanced         30
4     macro avg   0.966667  0.969697  0.966583  ...  None  None  balanced         30
5  weighted avg   0.966667  0.962963  0.963056  ...  None  None  balanced         30

[6 rows x 12 columns]


In [30]:
rf_session.sql("""select class,precision,recall,f1score,model,test_size,max_depth from model_output 
               order by f1score desc""").show()

-----------------------------------------------------------------------------------------------------------------------
|"CLASS"       |"PRECISION"  |"RECALL"  |"F1SCORE"  |"MODEL"                              |"TEST_SIZE"  |"MAX_DEPTH"  |
-----------------------------------------------------------------------------------------------------------------------
|weighted avg  |1.0          |1.0       |1.0        |rf_iris_model.joblib                 |0.1          |15           |
|setosa        |1.0          |1.0       |1.0        |dtree_iris_model_ts0.25_md20.joblib  |0.25         |20           |
|setosa        |1.0          |1.0       |1.0        |rf_iris_model_ts0.1_md15.joblib      |0.1          |15           |
|setosa        |1.0          |1.0       |1.0        |rf_iris_model.joblib                 |0.1          |15           |
|versicolor    |1.0          |1.0       |1.0        |rf_iris_model.joblib                 |0.1          |15           |
|virginica     |1.0          |1.0       

### Check if the classifier models are saved in stage location.
### Remember if the same model name was used for all the iterations, then only the last trained model will be saved
### for the model name.

In [38]:
rf_session.sql("list @stage_models").collect()

[Row(name='stage_models/dtree_iris_model.joblib', size=9760, md5='93176d7bb0c3ad50081521a5c8f0af68', last_modified='Mon, 28 Nov 2022 07:03:39 GMT'),
 Row(name='stage_models/dtree_iris_model_ts0.25_md20.joblib', size=9920, md5='6f6838af74d27b27041cc7174ee615dc', last_modified='Mon, 28 Nov 2022 06:59:14 GMT'),
 Row(name='stage_models/dtree_iris_model_ts0.25_md25.joblib', size=4304, md5='a3a907bf4bcf144fbf5e443f158397fb', last_modified='Mon, 28 Nov 2022 07:00:57 GMT'),
 Row(name='stage_models/dtree_iris_model_ts0.25_md30.joblib', size=4624, md5='52ba383dae0c431af809f6a38b3efd5a', last_modified='Mon, 28 Nov 2022 07:01:30 GMT'),
 Row(name='stage_models/rf_iris_model.joblib', size=10416, md5='0322dabbe9898ed39b8b2caebbd67b23', last_modified='Mon, 28 Nov 2022 07:02:05 GMT'),
 Row(name='stage_models/rf_iris_model_ts0.1_md15.joblib', size=10256, md5='44ece3e89482d80f09f45a6a877eb2d6', last_modified='Mon, 28 Nov 2022 07:01:57 GMT'),
 Row(name='stage_models/rf_iris_model_ts0.25_md20.joblib', size

### Define UDF for Inference

### for Randomforest classifier

In [34]:
import sys
import cachetools
import os
from snowflake.snowpark.functions import udf
rf_session.add_import("@stage_models/rf_iris_model.joblib")  

@cachetools.cached(cache={})
def read_file(filename):
       import_dir = sys._xoptions.get("snowflake_import_directory")
       if import_dir:
              with open(os.path.join(import_dir, filename), 'rb') as file:
                     m = joblib.load(file)
                     return m

@udf(name="predict_rf_iris_model", is_permanent=True, stage_location="@stage_models", replace=True)
def predict(SEPTAL_LENGTH: float, SEPTAL_WIDTH: float, PETAL_LENGTH: float, PETAL_WIDTH: float) -> float:
       m = read_file('rf_iris_model.joblib')       
       row = pd.DataFrame([locals()], columns=features)
       return m.predict(row)[0]

### for Decision Tree classifier

In [39]:
import sys
import cachetools
import os
from snowflake.snowpark.functions import udf
rf_session.add_import("@stage_models/dtree_iris_model.joblib")  

@cachetools.cached(cache={})
def read_file(filename):
       import_dir = sys._xoptions.get("snowflake_import_directory")
       if import_dir:
              with open(os.path.join(import_dir, filename), 'rb') as file:
                     m = joblib.load(file)
                     return m

@udf(name="predict_dtree_iris_model", is_permanent=True, stage_location="@stage_models", replace=True)
def predict(SEPTAL_LENGTH: float, SEPTAL_WIDTH: float, PETAL_LENGTH: float, PETAL_WIDTH: float) -> float:
       m = read_file('dtree_iris_model.joblib')       
       row = pd.DataFrame([locals()], columns=features)
       return m.predict(row)[0]

### Load Dataset to snowflake table for inference

In [None]:
from sklearn.datasets import load_iris
import pandas as pd
data = load_iris()
cols = ['SEPTAL_LENGTH','SEPTAL_WIDTH','PETAL_LENGTH','PETAL_WIDTH']
df = pd.DataFrame(data=data.data, columns=cols)
df['LABEL'] = data.target
rf_session.create_dataframe(df).write.mode("overwrite").save_as_table("iris_dataset")

### Check sample Data

In [40]:
snowpark_iris_df = rf_session.table('iris_dataset')
snowpark_iris_df.show()

-------------------------------------------------------------------------------
|"SEPTAL_LENGTH"  |"SEPTAL_WIDTH"  |"PETAL_LENGTH"  |"PETAL_WIDTH"  |"LABEL"  |
-------------------------------------------------------------------------------
|5.1              |3.5             |1.4             |0.2            |0        |
|4.9              |3.0             |1.4             |0.2            |0        |
|4.7              |3.2             |1.3             |0.2            |0        |
|4.6              |3.1             |1.5             |0.2            |0        |
|5.0              |3.6             |1.4             |0.2            |0        |
|5.4              |3.9             |1.7             |0.4            |0        |
|4.6              |3.4             |1.4             |0.3            |0        |
|5.0              |3.4             |1.5             |0.2            |0        |
|4.4              |2.9             |1.4             |0.2            |0        |
|4.9              |3.1             |1.5 

### Check Data structure

In [41]:
snowpark_iris_df.schema.fields

[StructField('SEPTAL_LENGTH', DoubleType(), nullable=True),
 StructField('SEPTAL_WIDTH', DoubleType(), nullable=True),
 StructField('PETAL_LENGTH', DoubleType(), nullable=True),
 StructField('PETAL_WIDTH', DoubleType(), nullable=True),
 StructField('LABEL', LongType(), nullable=True)]

### Now using both randomforest and decisiontree classifiers let's predict and infer using SQL.

### You can compare both model inference results side by side using SQL Query completed run on Snowflake

In [42]:
predict_snowpark_df = rf_session.sql("""SELECT LABEL AS ACTUAL_LABEL, 
               predict_rf_iris_model(SEPTAL_LENGTH, SEPTAL_WIDTH, PETAL_LENGTH, PETAL_WIDTH) as PRED_rf_LABEL,
               predict_dtree_iris_model(SEPTAL_LENGTH, SEPTAL_WIDTH, PETAL_LENGTH, PETAL_WIDTH) as PRED_dtree_LABEL,
               SEPTAL_LENGTH, SEPTAL_WIDTH, PETAL_LENGTH, PETAL_WIDTH
               FROM (iris_dataset) LIMIT 100""")
predict_snowpark_df.show(150)

-----------------------------------------------------------------------------------------------------------------------------
|"ACTUAL_LABEL"  |"PRED_RF_LABEL"  |"PRED_DTREE_LABEL"  |"SEPTAL_LENGTH"  |"SEPTAL_WIDTH"  |"PETAL_LENGTH"  |"PETAL_WIDTH"  |
-----------------------------------------------------------------------------------------------------------------------------
|0               |0.0              |0.0                 |5.1              |3.5             |1.4             |0.2            |
|0               |0.0              |0.0                 |4.9              |3.0             |1.4             |0.2            |
|0               |0.0              |0.0                 |4.7              |3.2             |1.3             |0.2            |
|0               |0.0              |0.0                 |4.6              |3.1             |1.5             |0.2            |
|0               |0.0              |0.0                 |5.0              |3.6             |1.4             |0.2      

In [None]:
rf_session.close()
print('Finished!!!')