### In this notebook Let's try to build a tensorflow model in sklearn pipeline to predict binary classes. Steps involved are
###### 1. Define the snowpark libraries and connect to snowflake
###### 2. Define a sklearn pipeline with defined tensorflow model as kerasclassifier part of keras wrapper
###### 3. Build the function to call the sklearn pipeline with kerasclassifier
###### 4. Function saves tensorflow h5 model as separate model file and sklearn pipeline as separate model file
###### 5. Register the function as Stored Proc to train the model
###### 6. Train the model using snowpark optimized warehouse
###### 7. Check if the model files h5 and pkl have been saved
###### 8. add Import both the model file h5 and pkl
###### 9. register UDF using pkl sklearn pipeline file
###### 10. Perform inference using the tensorflow model and compare with decision tree and regression classifier models. 

### Load Snowpark libraries

In [17]:
# Import required libraries
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import avg, sum, col,lit
from snowflake.snowpark.functions import udf, sproc, col
from snowflake.snowpark.types import IntegerType, FloatType, LongType, DoubleType, DecimalType,StringType, BooleanType, Variant
from snowflake.snowpark.types import PandasSeries, PandasDataFrame
from snowflake.snowpark import functions as fn

import warnings
warnings.filterwarnings('ignore')
import sys ,json
import io
import logging
import pandas as pd

import joblib
import pandas as pd
import numpy as np
import json

from snowflake.snowpark import version
print (f"snowflake snowpark version is: {version.VERSION}")

snowflake snowpark version is: (1, 0, 0)


### Connect to Snowflake and establish session

In [2]:
snowflake_connection_cfg = open('cred.json')
snowflake_connection_cfg = snowflake_connection_cfg.read()
snowflake_connection_cfg = json.loads(snowflake_connection_cfg)

'''
APP_WH XS
LAB_WH S
HMWH M optimized warehouse
DCR_MA_WH L
BANK1_WH XL
'''

# Creating Snowpark Session
sk_tf_session = Session.builder.configs(snowflake_connection_cfg).create()
print('Current Database:', sk_tf_session.get_current_database())
print('Current Schema:', sk_tf_session.get_current_schema())
print('Current Warehouse:', sk_tf_session.get_current_warehouse())
print("Warehouse set up:")
sk_tf_session.sql("show warehouses like 'APP_WH'").collect()

Current Database: "BANK1_CRM_DB"
Current Schema: "PUBLIC"
Current Warehouse: "APP_WH"
Warehouse set up:


[Row(name='APP_WH', state='SUSPENDED', type='STANDARD', size='X-Small', min_cluster_count=1, max_cluster_count=1, started_clusters=0, running=0, queued=0, is_default='N', is_current='Y', auto_suspend=600, auto_resume='true', available='', provisioning='', quiescing='', other='', created_on=datetime.datetime(2022, 2, 27, 4, 51, 57, 85000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), resumed_on=datetime.datetime(2023, 1, 2, 20, 53, 46, 463000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), updated_on=datetime.datetime(2023, 1, 2, 20, 53, 46, 463000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), owner='SYSADMIN', comment='', enable_query_acceleration='false', query_acceleration_max_scale_factor=8, resource_monitor='null', actives=0, pendings=0, failed=0, suspended=1, uuid='1463550724', scaling_policy='STANDARD')]

### Create stage location for models

In [36]:
# sk_tf_session.sql("CREATE OR REPLACE STAGE sp_tf_2_models").collect()

In [20]:
sk_tf_session.clear_packages()
sk_tf_session.add_packages("snowflake-snowpark-python")
sk_tf_session.add_packages("scikit-learn","pandas","numpy","joblib","cachetools","tensorflow","dill")
sk_tf_session.clear_imports()

The version of package numpy in the local environment is 1.23.5, which does not fit the criteria for the requirement numpy. Your UDF might not work when the package version is different between the server and your local environment
The version of package joblib in the local environment is 1.2.0, which does not fit the criteria for the requirement joblib. Your UDF might not work when the package version is different between the server and your local environment
The version of package cachetools in the local environment is 5.2.0, which does not fit the criteria for the requirement cachetools. Your UDF might not work when the package version is different between the server and your local environment
The version of package tensorflow in the local environment is 2.10.0, which does not fit the criteria for the requirement tensorflow. Your UDF might not work when the package version is different between the server and your local environment
package dill is not installed in the local environme

### Define function to save trained model

In [21]:
def save_pipeline_tf_file(session, model ,classifier,modeldir, modelh5file, pipelinefile):
    import joblib
    import dill
    import os

    # collect the preprocessing pipeline & model seperately
    model_file = os.path.join('/tmp', modelh5file)
    classifier.model.save(model_file)
    session.file.put(model_file, modeldir,overwrite=True)

    # Upload  pipeline to a stage
    pipeline_file = os.path.join('/tmp', pipelinefile)
    input_stream = io.BytesIO()
    dill.dump(model, open(pipeline_file, "wb"))
    session.file.put(pipeline_file, modeldir,overwrite=True)
    return f"successfully uploaded model file: {modelh5file} and pipeline file : {pipelinefile}"

### Define Features required to train model

In [22]:
features=["RECENCY_DAY",
"FREQUENCY",
"MONETORY",
"RMF_SCORE",
"DOTCOM",
"REWARDS_ACCOUNT",
"FREQ_1",
"FREQ_2",
"FREQ_3",
"FREQ_4",
"FREQ_5",
"FREQ_6",
"FREQ_7",
"FREQ_8",
"FREQ_9",
"FREQ_10",
"FREQ_11",
"FREQ_12",
"CNT_PER_PDT",
"CNT_PER_PDT_SFC",
"CNT_PER_PDT_VFC",
"NO_DISCOUNT",
"DISCOUNT_PROMOTION",
"SUM_DIS_PRO"]

In [23]:
def build_tf_model(p_df: pd.DataFrame):
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    import tensorflow as tf
    from tensorflow.keras import layers
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout , BatchNormalization
    from tensorflow.keras import datasets, layers, models
    from tensorflow.keras.utils import to_categorical
    from tensorflow.keras.callbacks import ReduceLROnPlateau
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
    numeric_features = p_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = p_df.select_dtypes(include=['object']).columns.tolist()
    
    feature_names = numeric_features + categorical_features

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler(with_mean=True,with_std=True))])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    def kerasclassifiermodel():
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout , BatchNormalization

        clf = Sequential()
        # clf.add(Dense(2, activation='relu', input_dim=24))
        # clf.add(Dense(2, activation='relu'))
        # clf.add(Dense(2, activation='sigmoid'))
        clf.add(Dense(units = 10 , activation = 'relu'))
        clf.add(Dropout(0.2))
        clf.add(Dense(units = 1 , activation = 'sigmoid')) # Tanh
        clf.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=["accuracy"])
        return clf

    model = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier'
                    ,KerasClassifier(kerasclassifiermodel)
                )
            ])

    return model

### Define Classification report to register model output

In [24]:
def get_classification_report(y_test, y_pred):
    from sklearn import metrics
    report = metrics.classification_report(y_test, y_pred, output_dict=True,target_names=['0','1'])
    df_classification_report = pd.DataFrame(report).transpose()    
    return df_classification_report

### Define Model parameteres to register model output

In [25]:
def get_model_info(model_name, test_size, random_state,epochs,batchsize):
    data = [[model_name,test_size,random_state,epochs,batchsize]]  
    df_model_info = pd.DataFrame(data,columns=['model','test_size','random_state','epochs','batchsize'])
    return df_model_info

### Define tensorflow stored proc function

In [26]:
def sp_train_tf_model(session: Session, training_table: str, model_name: str,features:list, Y: str,test_size:float,random_state:int,epochs:int,batchsize:int) -> str:
    from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix, RocCurveDisplay
    from tensorflow.keras.callbacks import EarlyStopping

    # Load training data
    training_data = session.table(training_table)

    # Train test split
    Data_train, Data_test = training_data.random_split([1-test_size, test_size], seed=random_state)
    pd_Data_train=Data_train.to_pandas()
    pd_Data_test=Data_test.to_pandas()

    # Model building
    tf = build_tf_model(pd_Data_train[features])

    # define early stopping
    early_stopping = EarlyStopping()
    custom_early_stopping = [
        EarlyStopping(
        monitor='loss', 
        patience=10, 
        mode='auto'
    )]
    
    # Model training
    tf.fit(pd_Data_train[features], pd_Data_train[Y], 
           classifier__epochs=epochs, 
           classifier__batch_size=batchsize, 
           classifier__verbose=1,
           classifier__callbacks=[custom_early_stopping])


    # Upload  model to a stage
    save_pipeline_tf_file(session,tf, 
                 tf.named_steps['classifier'], 
                 "@sp_tf_2_models", 
                 model_name+"keras_model_1M.h5", 
                 model_name+"sklearn_pipe_1M.pkl")
    
    # Score Model
    score = tf.score(pd_Data_test[features], pd_Data_test[Y])
    
    y_pred = tf.predict(pd_Data_test)
    
    #Evaluate Metrics
    df_classification_report = get_classification_report(y_pred,pd_Data_test[Y]).reset_index().rename(columns={"index": "class"}).reset_index(drop=True)
    df_model_info = get_model_info('keras_model_1M.h5',test_size,random_state,epochs,batchsize)
    df_model_info=df_model_info.append([df_model_info]*5,ignore_index=True)
    session.create_dataframe(df_classification_report.join(df_model_info)).write.mode("append").save_as_table("STAPLES_TF_MODEL_OUTPUT")
    
    print (f"model score on validation data: {score}")
    return df_classification_report.join(df_model_info)



### Create Model output tables to save model ouptut

In [27]:
sk_tf_session.sql("create or replace table staples_tf_model_output (class varchar, precision double, recall double, f1score double, support double, model varchar,test_size float, random_state int, epochs int, batchsize int)").collect()

[Row(status='Table STAPLES_TF_MODEL_OUTPUT successfully created.')]

### Define Stored Proc to register Logistic regression classifier model

In [28]:
# Registering the function as a Stored Procedure
sp_tf_2_sproc = sk_tf_session.sproc.register(func=sp_train_tf_model, # training function defined above
                                            name='sp_train_tf_2_model', # training model name to be registered in snowlake
                                            is_permanent=True, # permanent stored proc
                                            replace=True, # replace if existing already
                                            stage_location='@sp_tf_2_models', # save the model in stage location
                                            packages=['snowflake-snowpark-python','scikit-learn','joblib','dill','tensorflow']) # import model libaries


The version of package joblib in the local environment is 1.2.0, which does not fit the criteria for the requirement joblib. Your UDF might not work when the package version is different between the server and your local environment
package dill is not installed in the local environmentYour UDF might not work when the package is installed on the server but not on your local environment.
The version of package tensorflow in the local environment is 2.10.0, which does not fit the criteria for the requirement tensorflow. Your UDF might not work when the package version is different between the server and your local environment


### Train tensorflow model in Snowflake through registered Stored Procs and capture model output in a snowflake table

### All the above steps are just definition and Registration of component
### The below training runs completely on Snowflake and you can go check in history tab
### I have changed the warehouse HMWH to suit the training needs

In [29]:
sk_tf_session.sql("use warehouse HMWH").collect()
print('Current Warehouse:', sk_tf_session.get_current_warehouse())
print("Warehouse set up:")
sk_tf_session.sql("show warehouses like 'HMWH'").collect()

Current Warehouse: "HMWH"
Warehouse set up:


[Row(name='HMWH', state='SUSPENDED', type='SNOWPARK-OPTIMIZED', size='Medium', min_cluster_count=1, max_cluster_count=1, started_clusters=0, running=0, queued=0, is_default='N', is_current='Y', auto_suspend=300, auto_resume='true', available='', provisioning='', quiescing='', other='', created_on=datetime.datetime(2022, 11, 1, 18, 18, 59, 3000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>), resumed_on=datetime.datetime(2022, 12, 30, 23, 28, 18, 762000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), updated_on=datetime.datetime(2022, 12, 30, 23, 28, 18, 762000, tzinfo=<DstTzInfo 'America/Los_Angeles' PST-1 day, 16:00:00 STD>), owner='SYSADMIN', comment='warehouse used by role dev_blogger', enable_query_acceleration='false', query_acceleration_max_scale_factor=8, resource_monitor='null', actives=0, pendings=0, failed=0, suspended=1, uuid='1463550932', scaling_policy='STANDARD')]

In [30]:
# table_name = 'staples_data_train_15M'
# table_name = 'staples_data_train'
table_name = 'STAPLES_DATA_TRAIN_1M'
test_size = 0.25
model_name = 'staples_data_train_'
random_state = 43,
epochs=200,
batchsize=100
print ("Tensorflow classifier report")
print (sp_tf_2_sproc(table_name
                , model_name
                ,features
                ,'LABEL'
                ,test_size
                ,43
                ,200
                ,100))

Tensorflow classifier report
          class  precision    recall  ...  random_state  epochs batchsize
0             0   0.835987  0.719037  ...            43     200       100
1             1   0.583333  0.736031  ...            43     200       100
2      accuracy   0.724956  0.724956  ...            43     200       100
3     macro avg   0.709660  0.727534  ...            43     200       100
4  weighted avg   0.747991  0.724956  ...            43     200       100

[5 rows x 10 columns]


In [31]:
sk_tf_session.sql("list @sp_tf_2_models").collect()

[Row(name='sp_tf_2_models/keras_model.h5.gz', size=298848, md5='9f591c0c5ead4c3617ddf32b7963773f', last_modified='Sat, 24 Dec 2022 03:24:38 GMT'),
 Row(name='sp_tf_2_models/keras_model_small.h5.gz', size=26496, md5='f130111d34e0a91cb1801e30985bee88', last_modified='Thu, 29 Dec 2022 22:22:11 GMT'),
 Row(name='sp_tf_2_models/kmodel.h5.gz', size=301136, md5='994c9e030aa656d168101f88218f991c', last_modified='Sat, 24 Dec 2022 03:22:19 GMT'),
 Row(name='sp_tf_2_models/sklearn_pipe.pkl.gz', size=315344, md5='76656aa8b8c9af7348ff420eacca34fc', last_modified='Sat, 24 Dec 2022 03:24:39 GMT'),
 Row(name='sp_tf_2_models/sklearn_pipe_small.pkl.gz', size=43424, md5='2b2fc55411cf1f982fb380cb0a5d761d', last_modified='Thu, 29 Dec 2022 22:22:12 GMT'),
 Row(name='sp_tf_2_models/sp_tf_2_modelkeras_model.h5.gz', size=304864, md5='df6a053a983c0c8e9655323bb8f758a1', last_modified='Sat, 24 Dec 2022 03:35:51 GMT'),
 Row(name='sp_tf_2_models/sp_tf_2_modelsklearn_pipe.pkl.gz', size=321344, md5='25d7c0f6f3e475fa6

### Check Model Output

In [33]:
sk_tf_session.sql("use warehouse app_wh").collect()

[Row(status='Statement executed successfully.')]

In [32]:
sk_tf_session.sql("""select class,precision,recall,f1score,model from staples_tf_model_output """).show(1000)

---------------------------------------------------------------------------------------------------
|"CLASS"       |"PRECISION"         |"RECALL"            |"F1SCORE"           |"MODEL"            |
---------------------------------------------------------------------------------------------------
|0             |0.8359867828176629  |0.7190373956532705  |0.7731143572427797  |keras_model_1M.h5  |
|1             |0.5833325730967477  |0.7360314938876994  |0.6508455944098652  |keras_model_1M.h5  |
|accuracy      |0.724956200663104   |0.724956200663104   |0.724956200663104   |keras_model_1M.h5  |
|macro avg     |0.7096596779572053  |0.727534444770485   |0.7119799758263224  |keras_model_1M.h5  |
|weighted avg  |0.747990880850823   |0.724956200663104   |0.7305298696154544  |keras_model_1M.h5  |
---------------------------------------------------------------------------------------------------



### Validate model training

In [46]:
qry = '''select query_id,
       substr(query_text,9,21) model,
       warehouse_name,
       execution_status,
       TIMESTAMPDIFF(second,start_time,end_time) total_duration
from table(information_schema.QUERY_HISTORY_BY_WAREHOUSE('HMWH'))
WHERE query_text like '%CALL%' and execution_status ='SUCCESS' and
TIMESTAMPDIFF(second,start_time,end_time) > 300
order by start_time desc'''
sk_tf_session.sql(qry).show()


---------------------------------------------------------------------------------------------------------------------------
|"QUERY_ID"                            |"MODEL"                |"WAREHOUSE_NAME"  |"EXECUTION_STATUS"  |"TOTAL_DURATION"  |
---------------------------------------------------------------------------------------------------------------------------
|01a96a81-0402-4ffa-0057-3c03017e2c3e  |train_tf_2_model('STA  |HMWH              |SUCCESS             |2317              |
---------------------------------------------------------------------------------------------------------------------------



In [47]:
sk_tf_session.sql("use warehouse app_wh").collect()

[Row(status='Statement executed successfully.')]

### Import both model files h5 and pkl

In [48]:
import sys
import cachetools
import os
import joblib
from snowflake.snowpark.functions import udf
sk_tf_session.add_import("@sp_tf_2_models/staples_data_train_keras_model_1M.h5")  
sk_tf_session.add_import("@sp_tf_2_models/staples_data_train_sklearn_pipe_1M.pkl")  

In [49]:
# from tensorflow.keras.models import load_model
# import joblib
# pipeline = joblib.load('sklearn_pipe.pkl')

# # Then, load the Keras model:
# tf.named_steps['classifier'].model = load_model('kmodel.h5')

### Register UDF for Inference

In [50]:
@cachetools.cached(cache={})
def read_file(filename):
       import_dir = sys._xoptions.get("snowflake_import_directory")
       if import_dir:
              with open(os.path.join(import_dir, filename), 'rb') as file:
                     m = joblib.load(file)
                     return m

@udf(name="predict_tf_staples_model", is_permanent=True, stage_location="@sp_tf_2_models", replace=True)
def predict(RECENCY_DAY: float, FREQUENCY: float, MONETORY: float, RMF_SCORE: float, DOTCOM: float, REWARDS_ACCOUNT: float,
            FREQ_1: float, FREQ_2: float, FREQ_3: float, FREQ_4: float, FREQ_5: float, FREQ_6: float, FREQ_7: float, 
            FREQ_8: float, FREQ_9: float, FREQ_10: float, FREQ_11: float, FREQ_12: float, CNT_PER_PDT: float, 
            CNT_PER_PDT_SFC: float, CNT_PER_PDT_VFC: float, NO_DISCOUNT: float, DISCOUNT_PROMOTION: float,
            SUM_DIS_PRO: float) -> int:
       m = read_file('staples_data_train_sklearn_pipe_1M.pkl')       
       row = pd.DataFrame([locals()], columns=features)
       return m.predict(row)[0]

### Call the features

In [51]:
features = ['RECENCY_DAY',
'FREQUENCY',
'MONETORY',
'RMF_SCORE',
'DOTCOM',
'REWARDS_ACCOUNT',
'FREQ_1',
'FREQ_2',
'FREQ_3',
'FREQ_4',
'FREQ_5',
'FREQ_6',
'FREQ_7',
'FREQ_8',
'FREQ_9',
'FREQ_10',
'FREQ_11',
'FREQ_12',
'CNT_PER_PDT',
'CNT_PER_PDT_SFC',
'CNT_PER_PDT_VFC',
'NO_DISCOUNT',
'DISCOUNT_PROMOTION',
'SUM_DIS_PRO',
'LABEL']

### SQL to infer using all model UDFs

In [53]:
qry="""SELECT 
       substr(ACCT_NMB,5,5) ACCT_NMB,
       LABEL AS ORIGINAL_LABEL,
       predict_tf_staples_model(RECENCY_DAY,FREQUENCY, MONETORY,RMF_SCORE,DOTCOM,REWARDS_ACCOUNT,
       FREQ_1,FREQ_2, FREQ_3, FREQ_4, FREQ_5, FREQ_6, FREQ_7, FREQ_8, FREQ_9, FREQ_10, FREQ_11, FREQ_12,
       CNT_PER_PDT, CNT_PER_PDT_SFC, CNT_PER_PDT_VFC, NO_DISCOUNT, DISCOUNT_PROMOTION, SUM_DIS_PRO) as PRED_tf_LABEL,
       predict_rf_staples_model(RECENCY_DAY,FREQUENCY, MONETORY,RMF_SCORE,DOTCOM,REWARDS_ACCOUNT,
       FREQ_1,FREQ_2, FREQ_3, FREQ_4, FREQ_5, FREQ_6, FREQ_7, FREQ_8, FREQ_9, FREQ_10, FREQ_11, FREQ_12,
       CNT_PER_PDT, CNT_PER_PDT_SFC, CNT_PER_PDT_VFC, NO_DISCOUNT, DISCOUNT_PROMOTION, SUM_DIS_PRO) as PRED_rf_LABEL,
       predict_dtree_staples_model(RECENCY_DAY,FREQUENCY, MONETORY,RMF_SCORE,DOTCOM,REWARDS_ACCOUNT,
       FREQ_1,FREQ_2, FREQ_3, FREQ_4, FREQ_5, FREQ_6, FREQ_7, FREQ_8, FREQ_9, FREQ_10, FREQ_11, FREQ_12,
       CNT_PER_PDT, CNT_PER_PDT_SFC, CNT_PER_PDT_VFC, NO_DISCOUNT, DISCOUNT_PROMOTION, SUM_DIS_PRO) as PRED_dtree_LABEL
        FROM (STAPLES_DATA_TRAIN)"""
predict_snowpark_df = sk_tf_session.sql(qry)
predict_snowpark_df.show(1000)

------------------------------------------------------------------------------------------
|"ACCT_NMB"  |"ORIGINAL_LABEL"  |"PRED_TF_LABEL"  |"PRED_RF_LABEL"  |"PRED_DTREE_LABEL"  |
------------------------------------------------------------------------------------------
|00888       |0                 |0                |1.0              |1.0                 |
|01071       |1                 |0                |0.0              |0.0                 |
|01103       |0                 |0                |0.0              |0.0                 |
|01890       |0                 |0                |0.0              |0.0                 |
|01898       |0                 |0                |0.0              |0.0                 |
|02421       |0                 |0                |0.0              |0.0                 |
|02562       |0                 |0                |0.0              |0.0                 |
|04405       |0                 |0                |1.0              |1.0                 |

### Let's close the snowpark session

In [33]:
sk_tf_session.close()
print('Finished!!!')

Finished!!!
