# Steps in this Notebook

1. Imports
2. Snowflake Setup
3. TS Forecasting with DARTS
4. Deployment of TS Forecasting with Snowflake Tasks

# Imports

In [1]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType, FloatType, StringType,StructType, StructField

# import snowflake.ml.modeling.preprocessing as snowml
# from snowflake.ml.modeling.xgboost import XGBClassifier
# from snowflake.ml.modeling.preprocessing import KBinsDiscretizer, OrdinalEncoder, OneHotEncoder
# from snowflake.ml.modeling.impute import SimpleImputer

import json
import pandas as pd
from datetime import date, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the Snowflake connection parameters from a local JSON file.
# The location can be overridden with the SNOWFLAKE_CREDS_PATH environment
# variable; the original path stays the default so existing setups keep working.
import os

creds_path = os.environ.get('SNOWFLAKE_CREDS_PATH', '/Users/skhara/Documents/Code/creds.json')

# A context manager closes the file handle as soon as the JSON is parsed.
with open(creds_path) as creds_file:
    connection_parameters = json.load(creds_file)

# Open the Snowpark session that the rest of the notebook uses.
session = Session.builder.configs(connection_parameters).create()

In [3]:
import os
os.getcwd()

'/Users/skhara/Documents/GitHub/skhara-demos-public/Time Series Demo'

In [4]:
!pip list

Package                    Version
-------------------------- ----------------
aiofiles                   22.1.0
aiosqlite                  0.18.0
anyio                      3.5.0
appnope                    0.1.2
argon2-cffi                21.3.0
argon2-cffi-bindings       21.2.0
asn1crypto                 1.5.1
asttokens                  2.0.5
attrs                      23.1.0
Babel                      2.11.0
backcall                   0.2.0
beautifulsoup4             4.12.2
bleach                     4.1.0
Bottleneck                 1.3.5
Brotli                     1.0.9
cachetools                 4.2.2
certifi                    2023.7.22
cffi                       1.15.1
charset-normalizer         2.0.4
cloudpickle                2.0.0
cmdstanpy                  1.1.0
comm                       0.1.2
contourpy                  1.0.5
convertdate                2.3.2
cryptography               41.0.3
cycler                     0.11.0
debugpy                    1.6.7
decorator       

In [5]:
from prophet import Prophet

Importing plotly failed. Interactive plots will not work.


# Snowflake Setup: Create a Database and Schema

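We will be using the `SYNTHETIC_DATA` schema of the `TIME_SERIES` database. The cells below only select them, so both are assumed to exist already.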

In [3]:
# Select the database and schema that the unqualified table names below resolve against.
session.sql('USE DATABASE TIME_SERIES').collect()
session.sql('USE SCHEMA SYNTHETIC_DATA').collect()

[Row(status='Statement executed successfully.')]

In [4]:
# Reference the TIME_SERIES_1K table in the current database/schema; this is the
# input data for the local test and for the UDTF test further down.
sdf_raw = session.table('TIME_SERIES_1K')

In [5]:
# Count the distinct SERIES_ID values, i.e. how many individual time series there are to forecast.
session.sql('SELECT COUNT(DISTINCT SERIES_ID) FROM TIME_SERIES_1K').collect()

[Row(COUNT(DISTINCT SERIES_ID)=1000)]

In [6]:
# Summary statistics (count, mean, stddev, min, max) for the table's columns.
sdf_raw.describe().collect()

[Row(SUMMARY='count', SERIES_ID=2046000.0, VALUE=2046000.0),
 Row(SUMMARY='mean', SERIES_ID=500.5, VALUE=124.109512),
 Row(SUMMARY='stddev', SERIES_ID=288.67506080366553, VALUE=35.57537708865502),
 Row(SUMMARY='min', SERIES_ID=1.0, VALUE=44.0),
 Row(SUMMARY='max', SERIES_ID=1000.0, VALUE=246.0)]

# Local Testing

In [9]:
# Pull one series to the client so the UDTF logic can be prototyped locally.
df_data = sdf_raw.filter((F.col("SERIES_ID") == 62)).to_pandas()

# Here onwards copy paste in UDTF
import prophet

# Training window and forecast length, both in days. They are defined once and
# reused below so the window, the horizon and the reported metadata cannot drift apart.
train_length = 600
forecast_horizon = 30

# Aggregate to one observation per date in the ds/y layout Prophet expects.
# Selecting VALUE explicitly avoids passing a column name into sum()'s first
# positional parameter, which is numeric_only rather than a column selector.
df_data['ds'] = pd.to_datetime(df_data['DATE'])
df_data = (
    df_data.groupby('ds')['VALUE'].sum()
    .reset_index()
    .rename(columns={'VALUE': 'y'})
    .sort_values(by=['ds'])
    .reset_index(drop=True)
)

# Train on the last `train_length` days that end at the most recent observation.
train_end = df_data['ds'].max()
train_start = train_end - pd.Timedelta(days=train_length)

# Get training data
df_data = df_data.loc[(df_data['ds'] > train_start) & (df_data['ds'] <= train_end)]

# Model fit and predict
model = prophet.Prophet()
model.fit(df_data)
future = model.make_future_dataframe(periods=forecast_horizon)
forecast = model.predict(future)

# Post process forecast: keep the point forecast and attach run metadata.
# rename/assign build a new frame instead of mutating a slice of the prediction frame.
forecast = (
    forecast[['ds', 'yhat']]
    .rename(columns={'ds': 'TIMESTAMP', 'yhat': 'FORECAST'})
    .assign(
        TRAIN_START=train_start,
        TRAIN_END=train_end,
        FORECAST_HORIZON=forecast_horizon,
        LIBRARY_VERSION=str(prophet.__version__),
    )
)
forecast

17:17:32 - cmdstanpy - INFO - Chain [1] start processing
17:17:32 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,TIMESTAMP,FORECAST,TRAIN_START,TRAIN_END,FORECAST_HORIZON,LIBRARY_VERSION
0,2021-12-17,149.580651,2021-12-16,2023-08-08,30,1.1.5
1,2021-12-18,136.509438,2021-12-16,2023-08-08,30,1.1.5
2,2021-12-19,141.670957,2021-12-16,2023-08-08,30,1.1.5
3,2021-12-20,159.879076,2021-12-16,2023-08-08,30,1.1.5
4,2021-12-21,177.866228,2021-12-16,2023-08-08,30,1.1.5
...,...,...,...,...,...,...
625,2023-09-03,159.915620,2021-12-16,2023-08-08,30,1.1.5
626,2023-09-04,178.099985,2021-12-16,2023-08-08,30,1.1.5
627,2023-09-05,196.063384,2021-12-16,2023-08-08,30,1.1.5
628,2023-09-06,200.465051,2021-12-16,2023-08-08,30,1.1.5


In [21]:
# model.plot(forecast)
# model.plot_components(forecast)

# Creating UDTF for multi-node parallelized model training

### Upload library to Snowflake Stage
We are uploading to a stage as this library is not available through the Snowflake Anaconda Channel.

In [11]:
# Set the session context (database, schema, warehouse) for the stage and UDTF cells below.
session.sql('USE DATABASE TIME_SERIES').collect()
session.sql('USE SCHEMA SYNTHETIC_DATA').collect()
session.sql('USE WAREHOUSE ML_WORKLOADS').collect()

[Row(status='Statement executed successfully.')]

In [None]:
# Create the ML_MODELS stage if it does not exist yet; the library zip files are uploaded to it below.
session.sql('CREATE STAGE IF NOT EXISTS ML_MODELS').collect()

In [None]:
# Show the locally installed darts version and the directory it is installed in.
import darts
print(darts.__version__)
print(darts.__path__[0])

In [None]:
# Upload the zipped darts package to the ML_MODELS stage without compressing it,
# overwriting any copy already on the stage.
# NOTE(review): the path points into one specific local conda environment; adjust it before re-running.
zip_file_path = "/Users/skhara/anaconda3/envs/test_snowpandas_12_18_23/lib/python3.9/site-packages/darts.zip"
session.file.put(zip_file_path, "@ML_MODELS", auto_compress=False, overwrite=True)

In [7]:
# Record and show the locally installed prophet version and the directory it is installed in.
import prophet
prophet_version = prophet.__version__
prophet_path = prophet.__path__[0]
print(prophet_version)
print(prophet_path)

1.1.5
/Users/skhara/anaconda3/envs/pysnowpark_ml_09_2023/lib/python3.9/site-packages/prophet


In [None]:
# Upload the zipped prophet package to the ML_MODELS stage without compressing it,
# overwriting any copy already on the stage. The UDTF below imports this file.
# NOTE(review): the path points into one specific local conda environment; adjust it before re-running.
zip_file_path = "/Users/skhara/anaconda3/envs/pysnowpark_ml_09_2023/lib/python3.9/site-packages/prophet.zip"
session.file.put(zip_file_path, "@ML_MODELS", auto_compress=False, overwrite=True)

## - TEST Prophet==1.1.5

In [10]:
# Turn on Snowpark's custom package usage with the ML_MODELS stage as its cache path.
# Snowpark reports this parameter as experimental (since 1.6.0) and not meant for production.
# NOTE(review): presumably this lets Snowpark supply packages missing from the Snowflake
# Anaconda channel; confirm against the Snowpark documentation.
session.custom_package_usage_config = {"enabled": True, "cache_path": "@ML_MODELS"} 

Parameter custom_package_usage_config is experimental since 1.6.0. Do not use it in production. 


In [12]:
# Output columns of the forecasting UDTF as (name, Snowpark type) pairs,
# listed in the order in which the handler emits its tuple fields.
_forecast_output_fields = (
    ("TIMESTAMP", T.DateType()),
    ("FORECAST", T.FloatType()),
    ("TRAIN_START", T.DateType()),
    ("TRAIN_END", T.DateType()),
    ("FORECAST_HORIZON", T.IntegerType()),
    ("LIBRARY_VERSION", T.StringType()),
)

schema = T.StructType(
    [T.StructField(field_name, field_type) for field_name, field_type in _forecast_output_fields]
)

@F.udtf(output_schema = schema,
        input_types = [T.VariantType()],
        name = "TESTING_PROPHET_2",
        is_permanent=True,
        stage_location="@TIME_SERIES.SYNTHETIC_DATA.ML_MODELS",
        session=session,
        packages=['pandas==1.5.3','holidays', 'snowflake-snowpark-python',
                  'joblib==1.2.0','lightning-utilities==0.7.1','matplotlib==3.7.1',
                  'plotly==5.9.0','pmdarima==2.0.3','pytorch==2.0.1',
                  'pytorch-lightning==2.0.3','pyyaml==6.0','scikit-learn==1.2.2',
                  'scipy==1.10.1','statsmodels',
                  'tbats==1.1.3','torchmetrics==0.11.4','tqdm','xarray'],
        imports = ['@ML_MODELS/prophet.zip'],
        replace=True
       )

class forecast:
    """UDTF handler that fits one Prophet model per partition and emits its forecast.

    Rows arrive one at a time through ``process`` and are only buffered there.
    The model is trained and the output rows are produced in ``end_partition``,
    once all rows of the partition have been seen.

    NOTE(review): the saved output of the notebook cell that calls this UDTF
    shows ``import prophet`` failing with ``NotADirectoryError`` when the
    package is loaded from ``prophet.zip`` (its ``__init__`` opens
    ``__version__.py`` by file path). Confirm how the package is meant to be
    supplied to the UDTF before relying on this handler.
    """

    # Buffered rows are turned into a DataFrame chunk once this many have accumulated.
    _ROWS_PER_CHUNK = 16000
    # Chunks are concatenated into a single DataFrame once this many have accumulated.
    _MAX_CHUNKS = 100

    def __init__(self):
        # Rows received since the last chunk was built.
        self.rows=[]
        # DataFrame chunks built from earlier rows.
        self.dfs=[]

    def process(self, data):
        """Buffer one input row; no forecast rows are produced here.

        Args:
            data: one input row. It becomes a single record of a pandas
                DataFrame that is later read through its ``DATE`` and
                ``VALUE`` columns.

        Yields:
            A single ``None`` per call.
        """
        self.rows.append(data)

        # Fold the row buffer into a DataFrame chunk
        if len(self.rows) >= self._ROWS_PER_CHUNK:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Collapse the chunks into one frame (intended to limit memory footprint)
        if len(self.dfs) >= self._MAX_CHUNKS:
            self.dfs = [pd.concat(self.dfs)]

        yield None

    def end_partition(self):
        """Train Prophet on the buffered partition and emit the forecast rows.

        Yields:
            Tuples ``(TIMESTAMP, FORECAST, TRAIN_START, TRAIN_END,
            FORECAST_HORIZON, LIBRARY_VERSION)``, one per row of the frame
            returned by ``model.predict``. Nothing is yielded when the
            partition received no rows.
        """
        # Flush rows that never reached the chunk threshold.
        if self.rows:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Nothing was received: emit no rows instead of failing in pd.concat([]).
        if not self.dfs:
            return

        import prophet

        # Training window and forecast length, both in days.
        train_length = 600
        forecast_horizon = 30

        # Preprocess Data: one observation per date in the ds/y layout Prophet expects.
        # VALUE is selected explicitly because sum()'s first positional
        # parameter is numeric_only, not a column selector.
        df_data = pd.concat(self.dfs)
        df_data['ds'] = pd.to_datetime(df_data['DATE'])
        df_data = (
            df_data.groupby('ds')['VALUE'].sum()
            .reset_index()
            .rename(columns={'VALUE': 'y'})
            .sort_values(by=['ds'])
            .reset_index(drop=True)
        )

        # Train on the last `train_length` days that end at the most recent observation.
        train_end = df_data['ds'].max()
        train_start = train_end - pd.Timedelta(days=train_length)
        df_data = df_data.loc[(df_data['ds'] > train_start) & (df_data['ds'] <= train_end)]

        # Model fit and predict
        model = prophet.Prophet()
        model.fit(df_data)
        future = model.make_future_dataframe(periods=forecast_horizon)
        prediction = model.predict(future)

        # Keep the point forecast and attach run metadata. rename/assign build a
        # new frame instead of mutating a slice of the prediction frame, and the
        # local name no longer shadows this class.
        forecast_df = (
            prediction[['ds', 'yhat']]
            .rename(columns={'ds': 'TIMESTAMP', 'yhat': 'FORECAST'})
            .assign(
                TRAIN_START=train_start,
                TRAIN_END=train_end,
                FORECAST_HORIZON=forecast_horizon,
                LIBRARY_VERSION=str(prophet.__version__),
            )
        )

        yield from forecast_df.itertuples(index=False, name=None)

The version of package 'holidays' in the local environment is 0.33, which does not fit the criteria for the requirement 'holidays'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'joblib' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'joblib==1.2.0'. Your UDF might not work when the package version is different between the server and your local environment.
The version of package 'plotly' in the local environment is 5.17.0, which does not fit the criteria for the requirement 'plotly==5.9.0'. Your UDF might not work when the package version is different between the server and your local environment.
Package 'pytorch' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'pytorch-lightning' in the local environment is 2.0.9, which does not fit the cri

In [13]:
# Pack all columns of each row into a single ROW object column and keep only
# five series for this test run.
df = session.table('TIME_SERIES_1K') \
        .with_column('ROW', F.object_construct_keep_null('*')) \
        .select(F.col('SERIES_ID'), F.col('ROW')) \
        .filter(F.col('SERIES_ID').isin([1,2,3,4,5]))

# Reference the registered UDTF by name.
store_forecast_test = F.table_function("TESTING_PROPHET_2")

# The UDTF takes a single VARIANT argument.
# NOTE(review): ROW is built as an object above; confirm that wrapping the VARIANT cast in parse_json is needed.
variant_column = F.parse_json(df.col('ROW').cast(T.VariantType()))

# Call the UDTF partitioned by SERIES_ID, so each series is handled as its own partition.
forecast_sdf = df.select(F.col('SERIES_ID'),
                         store_forecast_test(variant_column).over(partition_by=['SERIES_ID'])
                        )

# Persist the forecasts, replacing the table's previous contents.
# The saved output of this cell shows the run failing while importing prophet from prophet.zip.
forecast_sdf.write.save_as_table("TEST_PROPHET_COMPASS", mode="overwrite")

SnowparkSQLException: (1304): 01b16e5b-0001-f5d3-0021-d9870039b4aa: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/var/folders/2d/stkxpskx5934bff5mzjj6l340000gn/T/ipykernel_1186/1799059119.py", line 49, in end_partition
  File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
  File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 664, in _load_unlocked
  File "<frozen importlib._bootstrap>", line 627, in _load_backward_compatible
  File "<frozen zipimport>", line 259, in load_module
  File "/home/udf/145383427489/prophet.zip/prophet/__init__.py", line 12, in <module>
    with open(here / "__version__.py", "r") as f:
NotADirectoryError: [Errno 20] Not a directory: '/home/udf/145383427489/prophet.zip/prophet/__version__.py'
 in function TESTING_PROPHET_2 with handler compute

In [None]:
# Preview the first 100 rows of the saved forecast table as a pandas DataFrame.
temp_sdf = session.table('TEST_PROPHET_COMPASS')
temp_sdf.limit(100).to_pandas()

Unnamed: 0,SERIES_ID,TIMESTAMP,FORECAST,TRAIN_START,TRAIN_END,FORECAST_HORIZON,LIBRARY_VERSION
0,3,2021-12-17,129.575846,2021-12-16,2023-08-08,30,1.0
1,3,2021-12-18,118.280887,2021-12-16,2023-08-08,30,1.0
2,3,2021-12-19,122.753493,2021-12-16,2023-08-08,30,1.0
3,3,2021-12-20,138.516870,2021-12-16,2023-08-08,30,1.0
4,3,2021-12-21,154.082572,2021-12-16,2023-08-08,30,1.0
...,...,...,...,...,...,...,...
95,3,2022-03-22,157.777474,2021-12-16,2023-08-08,30,1.0
96,3,2022-03-23,161.631877,2021-12-16,2023-08-08,30,1.0
97,3,2022-03-24,150.820981,2021-12-16,2023-08-08,30,1.0
98,3,2022-03-25,133.532939,2021-12-16,2023-08-08,30,1.0


In [None]:
!pip list