In [2]:
import json
import numpy as np
import pandas as pd
import os
import sys

from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import sproc, col
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T

from snowflake.snowpark.types import PandasDataFrameType, IntegerType, StringType, FloatType
from snowflake.snowpark.exceptions import SnowparkSQLException

# Log In, Create Session

In [3]:
# Reading Snowflake Connection Details
snowflake_connection_cfg = json.loads(open("/Users/mitaylor/Documents/creds/creds.json").read())

# Creating Snowpark Session
session = Session.builder.configs(snowflake_connection_cfg).create()

# Create a fresh & new schema
session.sql("USE DATABASE MT_TEST").collect()
session.sql("CREATE OR REPLACE STAGE FUNCTIONS").collect()
session.sql("CREATE OR REPLACE WAREHOUSE ASYNC_WH WITH WAREHOUSE_SIZE='MEDIUM' WAREHOUSE_TYPE = 'SNOWPARK-OPTIMIZED'").collect()

[Row(status='Warehouse ASYNC_WH successfully created.')]

# Get the Data From the Share, Examine it, and Save it

In [27]:
sdf = session.sql("select * FROM DATA_LAKE_TRADE_DATA_MT.PUBLIC.TRADE")
sdf.write.save_as_table("TEST", mode="overwrite")
sdf.limit(5).to_pandas()

Unnamed: 0,DATE,SYMBOL,EXCHANGE,ACTION,CLOSE,NUM_SHARES,CASH,TRADER,PM
0,2013-06-18,VZ,NYSE,hold,51.55,0.0,0.0,Tiffany Bailey,Jenna Allen
1,2013-06-19,VZ,NYSE,hold,50.05,0.0,0.0,Tiffany Frey,Mrs Claire George
2,2013-06-19,VZ,NYSE,hold,50.05,0.0,0.0,Tiffany Bailey,Jenna Allen
3,2013-06-20,VZ,NYSE,hold,48.96,0.0,0.0,Tiffany Frey,Mrs Claire George
4,2013-06-20,VZ,NYSE,hold,48.96,0.0,0.0,Tiffany Bailey,Jenna Allen


# Prepare the Data for Machine Learning (using a UDTF)

In [5]:
sdf_filtered = sdf.filter((col("SYMBOL") == 'TGVC') | (col("SYMBOL") == 'GOOG') | (col("SYMBOL") == 'OTRK'))
sdf_filtered.limit(5).to_pandas()

Unnamed: 0,DATE,SYMBOL,EXCHANGE,ACTION,CLOSE,NUM_SHARES,CASH,TRADER,PM
0,2019-08-05,GOOG,NASDAQ,hold,57.62,0.0,0.0,charles,warren
1,2019-08-06,GOOG,NASDAQ,hold,58.5,0.0,0.0,charles,warren
2,2019-08-07,GOOG,NASDAQ,hold,58.7,0.0,0.0,charles,warren
3,2019-08-08,GOOG,NASDAQ,hold,60.24,0.0,0.0,charles,warren
4,2019-08-09,GOOG,NASDAQ,hold,59.4,0.0,0.0,charles,warren


In [6]:
from snowflake.snowpark.types import PandasDataFrameType, IntegerType, StringType, FloatType, DateType

class ML_Prep:
    """
    UDTF class to create offset time series data for binary classification

    Yields
    -------
    df_new : DataFrame
        DataFrame with the TM3,TM2,TM1,TM0 data plus the y variable 

    """
    def __init__(self):
        None
        
    def end_partition(self, df):
        df.columns = ['DATE', 'SYMBOL', 'CLOSE']
        dates = df['DATE']
        symbol = df['SYMBOL']
        df = df['CLOSE']

        def series_to_supervised(df, n_in=3, n_out=1, dropnan=True):
            cols = list()
            # input sequence (t-n, ... t-1)
            for i in range(n_in, 0, -1):
                cols.append(df.shift(i))
            for i in range(0, n_out):
                cols.append(df.shift(-i))
            agg = pd.concat(cols, axis=1)
            
            # drop rows with NaN values
            df = pd.DataFrame(agg.values)
            df = df.fillna(df.mean())
            df.columns = ['TM3', 'TM2', 'TM1', 'TM0']
            df['y'] = df['TM0'] - df['TM1']
            df['y'] = [1 if i>0 else 0 for i in list(df['y'])]
            return df
        df_new = series_to_supervised(df)
        df_new['DATE'] = dates
        df_new['SYMBOL'] = symbol
        yield df_new

ML_Prep.end_partition._sf_vectorized_input = pd.DataFrame

ml_prep_udtf = session.udtf.register(
    ML_Prep, # the class
    input_types=[PandasDataFrameType([DateType()] + # DATE
                                     [StringType()] + # SYMBOL
                                     [FloatType()] # CLOSE
                                    )], 
    output_schema=PandasDataFrameType([FloatType(),FloatType(),FloatType(),FloatType(),IntegerType(),DateType(),StringType()],
                                      ["TM3", "TM2", "TM1", "TM0", "Y", "DATE_", "SYMBOL_"]),
    packages=["snowflake-snowpark-python", 'pandas'])  



In [7]:
all_cols = ['DATE', 'SYMBOL', 'CLOSE']
sdf_prepped = sdf_filtered.select(ml_prep_udtf(*all_cols).over(partition_by=['SYMBOL']))
sdf_prepped.limit(5).to_pandas()

Unnamed: 0,TM3,TM2,TM1,TM0,Y,DATE_,SYMBOL_
0,0.521263,0.521232,0.521197,0.44,0,2022-10-11,OTRK
1,0.521263,0.521232,0.44,0.49,1,2022-10-04,OTRK
2,0.521263,0.44,0.49,0.48,0,2022-10-05,OTRK
3,0.44,0.49,0.48,0.48,0,2022-10-05,OTRK
4,0.49,0.48,0.48,0.48,0,2022-10-06,OTRK


# Create Train and Test Set

In [8]:
sdf_goog = sdf_prepped.filter((col("SYMBOL") == 'GOOG'))
weights = [0.5, 0.5]
sdf_goog_train, sdf_goog_test = sdf_goog.random_split(weights)
sdf_goog_train.write.save_as_table("GOOG_TRAIN", mode="overwrite")
sdf_goog_test.write.save_as_table("GOOG_TEST", mode="overwrite")

# Create and Train an ML Model to Predict Price Direction

In [24]:
# Snowpark ML
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.model_selection import GridSearchCV

# Define the XGBRegressor
regressor = XGBClassifier(
    input_cols=['TM3', 'TM2','TM1'],
    label_cols=['Y'],
    output_cols=['Y_PRED']
)

# Train
regressor.fit(sdf_goog_train)

# Predict
result = regressor.predict(sdf_goog_test)



  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.





In [26]:
result.limit(5).to_pandas()

Unnamed: 0,TM2,TM0,DATE_,TM1,Y,TM3,SYMBOL_,Y_PRED
0,171.953619,49.46,2017-10-11,171.953394,0,171.953849,GOOG,0
1,46.62,46.05,2017-09-25,46.43,0,49.46,GOOG,0
2,46.43,46.24,2017-09-26,46.05,1,46.62,GOOG,1
3,46.05,47.22,2017-09-27,46.24,1,46.43,GOOG,1
4,46.24,47.48,2017-09-28,47.22,1,46.05,GOOG,1


# Register ML Model (in the Registry)

In [12]:
from snowflake.ml.registry import model_registry

In [13]:
REGISTRY_DATABASE_NAME = "MODEL_REGISTRY"
REGISTRY_SCHEMA_NAME = "PUBLIC"

model_registry.create_model_registry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)


create_model_registry() is in private preview since 0.2.0. Do not use it in production. 


In [16]:
XGB_MODEL_NAME = "SIMPLE_XGB_MODEL"
XGB_MODEL_VERSION = "v2"
# A name and model tags can be added to the model at registration time.
xgb_model = registry.log_model(
    model_name=XGB_MODEL_NAME,
    model_version=XGB_MODEL_VERSION,
    model=regressor,
    tags={"stage": "testing", "classifier_type": "xgb"},
    sample_input_data=sdf_goog_train.limit(10).to_pandas()[['TM3', 'TM2','TM1']],
)

# Deploy ML Model

In [19]:
xgb_model.deploy(
    deployment_name="xgb_model_predict",
    target_method="predict",
    permanent=True,
    options={"relax_version": True}    
)

Generated UDF file is persisted at: /var/folders/97/8vc6xcbx4zd06p75xg9frdrw0000gn/T/tmpxfb5wsl2.py




MODEL_REGISTRY.PUBLIC.xgb_model_predict is deployed to warehouse.


# Run ML Model

In [23]:
remote_prediction = xgb_model.predict(deployment_name="xgb_model_predict", data=sdf_goog_test)

remote_prediction.limit(10).to_pandas()



Unnamed: 0,TM0,Y,DATE_,SYMBOL_,TM3,TM2,TM1,Y_PRED
0,49.46,0,2017-10-11,GOOG,171.953849,171.953619,171.953394,0
1,46.05,0,2017-09-25,GOOG,49.46,46.62,46.43,0
2,46.24,1,2017-09-26,GOOG,46.62,46.43,46.05,1
3,47.22,1,2017-09-27,GOOG,46.43,46.05,46.24,1
4,47.48,1,2017-09-28,GOOG,46.05,46.24,47.22,1
5,48.94,1,2017-10-06,GOOG,47.89,47.58,48.5,1
6,48.85,0,2017-10-09,GOOG,47.58,48.5,48.94,0
7,48.63,0,2017-10-10,GOOG,48.5,48.94,48.85,0
8,46.58,0,2017-09-20,GOOG,48.94,48.85,48.63,1
9,49.39,1,2017-10-12,GOOG,48.85,48.63,46.58,1


# Examine via Evidently (ideally with a task)

Basic, just look at HTML
advanced ish, write to a table, look at it in Snowsight.

Note this has slightly quirky Python version requirements so you will want to set this up in it's own venv as it requires Python 3.8 and SnowparkML requires 3.9

In [14]:
@sproc(session=session, name='evidently_monitor', stage_location='@FUNCTIONS',  
       packages=['snowflake-snowpark-python', 'pandas', 'evidently'], 
       is_permanent=True, 
       replace=True)
def monitor_model(session: Session, history: str, new_data: str) -> str:
    """
    Creates a report that monitors the model drift etc. using evidently package

    Parameters
    ----------
    history : string
        The initial training table

    new_data : string
        The new date (test in this case)

    Returns
    -------
    completion_confirmation : string
        Simple confirmation of completion of task (note completion is when the table is prepared)

    """
    
    from evidently.report import Report
    from evidently.metrics.base_metric import generate_column_metrics
    from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
    from evidently.test_suite import TestSuite
    from evidently.tests.base_test import generate_column_tests
    from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset
    from evidently.metrics import ColumnSummaryMetric, ColumnQuantileMetric, ColumnDriftMetric
    from joblib import dump

    report = Report(metrics=[
        ColumnSummaryMetric(column_name='TM1'),
        ColumnQuantileMetric(column_name='TM1', quantile=0.25),
        ColumnDriftMetric(column_name='TM1')
    ])

    reference = session.table(history).to_pandas()
    current = session.table(new_data).to_pandas()
    report.run(reference_data=reference, current_data=current)
    
    report.save_html("/tmp/report.html")
    session.file.put("/tmp/report.html", '@FUNCTIONS', auto_compress=False, overwrite=True)

    return('Data Prepped')

monitor_model("GOOG_TRAIN", "GOOG_TEST")



'Data Prepped'