# Steps in this Notebook

1. Imports
2. Snowflake Setup
3. Local Prophet model Test
4. UDTF Prophet for Parallel Compute
5. Prophet==1.1.5 Test (not working at the moment; the Snowflake team is working on it)

# Imports

In [30]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType, FloatType, StringType,StructType, StructField

import json
import pandas as pd
from datetime import date, timedelta

import warnings
warnings.filterwarnings("ignore")

In [31]:
import os

# Location of the JSON file holding the Snowflake connection parameters.
# Set SNOWFLAKE_CREDS_PATH to point elsewhere; the default keeps the original path.
creds_path = os.environ.get('SNOWFLAKE_CREDS_PATH', '/Users/skhara/Documents/Code/creds.json')

# Read the credentials inside a context manager so the file handle is always closed.
with open(creds_path) as creds_file:
    connection_parameters = json.load(creds_file)

# Open the Snowpark session used by every later cell.
session = Session.builder.configs(connection_parameters).create()

In [32]:
import os
# Display the notebook's current working directory.
os.getcwd()

'/Users/skhara/Documents/Code/Customer Code/2024-01 - Prophet==1.1.5 Test'

# Snowflake Setup: Create a Database and Schema

We will be using the `SYNTHETIC_DATA` schema of the `TIME_SERIES` database.

In [33]:
# Point the session at the database and schema that hold the time-series table.
session.sql('USE DATABASE TIME_SERIES').collect()
session.sql('USE SCHEMA SYNTHETIC_DATA').collect()

[Row(status='Statement executed successfully.')]

In [6]:
# Reference the TIME_SERIES_1K table (resolved against the session's current
# database/schema) as a Snowpark DataFrame.
sdf_raw = session.table('TIME_SERIES_1K')

In [7]:
# Count the distinct SERIES_ID values, i.e. the number of time series to forecast.
session.sql('SELECT COUNT(DISTINCT SERIES_ID) FROM TIME_SERIES_1K').collect()

[Row(COUNT(DISTINCT SERIES_ID)=1000)]

In [8]:
# Summary statistics (count, mean, stddev, min, max) of the raw table's columns.
sdf_raw.describe().collect()

[Row(SUMMARY='count', SERIES_ID=2046000.0, VALUE=2046000.0),
 Row(SUMMARY='mean', SERIES_ID=500.5, VALUE=124.109512),
 Row(SUMMARY='stddev', SERIES_ID=288.67506080366553, VALUE=35.57537708865502),
 Row(SUMMARY='min', SERIES_ID=1.0, VALUE=44.0),
 Row(SUMMARY='max', SERIES_ID=1000.0, VALUE=246.0)]

# Local Testing

In [9]:
# Pull a single series into pandas to prototype the steps the UDTF runs per partition.
df_data = sdf_raw.filter((F.col("SERIES_ID") == 62)).to_pandas()

# Here onwards copy paste in UDTF
import prophet

# Aggregate to one observation per date, in the ds/y column layout Prophet is fitted on.
df_data['ds'] = pd.to_datetime(df_data['DATE'])
# Select VALUE explicitly: `.sum('VALUE')` would pass the string as `numeric_only`,
# not as a column selector.
df_data = df_data.groupby('ds', as_index=False)['VALUE'].sum()
df_data = df_data.rename(columns={'VALUE':'y'})
df_data = df_data[['ds','y']]
df_data = df_data.sort_values(by=['ds']).reset_index(drop=True)

# Training window length and forecast horizon, both in days
train_length = 600
forecast_horizon = 30
train_end = df_data['ds'].max()
train_start = train_end - pd.Timedelta(days=train_length)

# Get training data: the window is (train_start, train_end]
df_data = df_data.loc[(df_data['ds'] > train_start) & (df_data['ds'] <= train_end)]

# Model fit and predict
model = prophet.Prophet()
model.fit(df_data)
future = model.make_future_dataframe(periods=forecast_horizon)
forecast = model.predict(future)

# Post process forecast; copy the slice so the column assignments below
# operate on an independent frame rather than a view of the prediction output.
forecast = forecast[['ds','yhat']].copy()
forecast.columns = ['TIMESTAMP','FORECAST']
forecast['TRAIN_START'] = train_start
forecast['TRAIN_END'] = train_end
forecast['FORECAST_HORIZON'] = forecast_horizon
forecast['LIBRARY_VERSION'] = str(prophet.__version__)
forecast

Importing plotly failed. Interactive plots will not work.
12:35:49 - cmdstanpy - INFO - Chain [1] start processing
12:35:49 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,TIMESTAMP,FORECAST,TRAIN_START,TRAIN_END,FORECAST_HORIZON,LIBRARY_VERSION
0,2021-12-17,149.580651,2021-12-16,2023-08-08,30,1.1.5
1,2021-12-18,136.509438,2021-12-16,2023-08-08,30,1.1.5
2,2021-12-19,141.670957,2021-12-16,2023-08-08,30,1.1.5
3,2021-12-20,159.879076,2021-12-16,2023-08-08,30,1.1.5
4,2021-12-21,177.866228,2021-12-16,2023-08-08,30,1.1.5
...,...,...,...,...,...,...
625,2023-09-03,159.915620,2021-12-16,2023-08-08,30,1.1.5
626,2023-09-04,178.099985,2021-12-16,2023-08-08,30,1.1.5
627,2023-09-05,196.063384,2021-12-16,2023-08-08,30,1.1.5
628,2023-09-06,200.465051,2021-12-16,2023-08-08,30,1.1.5


In [21]:
# model.plot(forecast)
# model.plot_components(forecast)

# Creating UDTF for multi-node parallelized model training

In [34]:
# Select the database, schema and warehouse used for the UDTF runs.
session.sql('USE DATABASE TIME_SERIES').collect()
session.sql('USE SCHEMA SYNTHETIC_DATA').collect()
session.sql('USE WAREHOUSE ML_WORKLOADS').collect()
# Stage referenced below as the UDTF's stage_location; created only if missing.
session.sql('CREATE STAGE IF NOT EXISTS TEST_PROPHET_115').collect()

[Row(status='TEST_PROPHET_115 already exists, statement succeeded.')]

## Snow Conda Prophet==1.0.0

In [50]:
# Output columns of the forecasting UDTF, listed in the order the handler emits them.
schema = T.StructType([
    T.StructField(column_name, column_type)
    for column_name, column_type in (
        ("TIMESTAMP", T.DateType()),
        ("FORECAST", T.FloatType()),
        ("TRAIN_START", T.DateType()),
        ("TRAIN_END", T.DateType()),
        ("FORECAST_HORIZON", T.IntegerType()),
        ("LIBRARY_VERSION", T.StringType()),
    )
])

@F.udtf(session=session,
        output_schema = schema,
        input_types = [T.VariantType()],
        name = "TESTING_PROPHET_101",
        is_permanent=True,
        stage_location="@TIME_SERIES.SYNTHETIC_DATA.TEST_PROPHET_115",
        packages=['pandas==1.5.3','prophet', 'holidays==0.18', 'snowflake-snowpark-python','tqdm'],
        replace=True
       )

class forecast:
    """UDTF handler that fits one Prophet model per partition.

    `process` only buffers the incoming rows; `end_partition` builds the
    training frame from the buffered rows, fits Prophet and yields one tuple
    per predicted date in the column order
    (TIMESTAMP, FORECAST, TRAIN_START, TRAIN_END, FORECAST_HORIZON, LIBRARY_VERSION).
    """

    def __init__(self):
        # Rows received since the last chunk was built.
        self.rows=[]
        # DataFrame chunks built from the buffered rows.
        self.dfs=[]
    
    def process(self, data):
        """Buffer one input row; rows are expected to carry DATE and VALUE keys."""
        self.rows.append(data)
        # Merge rows into a dataframe
        if len(self.rows) >= 16000:
            df = pd.DataFrame(self.rows)
            self.dfs.append(df)
            self.rows = []
        # Merge dataframes into a single dataframe. Minimizes memory footprint
        if len(self.dfs) >= 100:
            merged_df = pd.concat(self.dfs)
            self.dfs = [merged_df]
        # NOTE(review): a None is yielded per input row; the forecast rows are
        # produced in end_partition. Confirm how Snowflake treats these None rows.
        yield None
    
    def end_partition(self):
        """Fit Prophet on the buffered partition and yield the forecast rows."""
        from prophet import Prophet
        import prophet

        # Flush the rows that have not been turned into a chunk yet.
        if len(self.rows) > 0:
            df = pd.DataFrame(self.rows)
            self.dfs.append(df)
            self.rows = []

        # Nothing was buffered: emit no rows instead of letting pd.concat raise.
        if not self.dfs:
            return

        # Preprocess Data: one observation per date in Prophet's ds/y layout
        df_data = pd.concat(self.dfs)
        df_data['ds'] = pd.to_datetime(df_data['DATE'])
        # Select VALUE explicitly: `.sum('VALUE')` would pass the string as
        # `numeric_only`, not as a column selector.
        df_data = df_data.groupby('ds', as_index=False)['VALUE'].sum()
        df_data = df_data.rename(columns={'VALUE':'y'})
        df_data = df_data[['ds','y']]
        df_data = df_data.sort_values(by=['ds']).reset_index(drop=True)

        # Training window length and forecast horizon, both in days
        train_length = 600
        forecast_horizon = 30
        train_end = df_data['ds'].max()
        train_start = train_end - pd.Timedelta(days=train_length)

        # Get training data: the window is (train_start, train_end]
        df_data = df_data.loc[(df_data['ds'] > train_start) & (df_data['ds'] <= train_end)]

        # Model fit and predict
        model = Prophet()
        model.fit(df_data)
        future = model.make_future_dataframe(periods=forecast_horizon)
        predictions = model.predict(future)

        # Post process forecast; copy the slice so the assignments below act on
        # an independent frame, and avoid shadowing the class name `forecast`.
        forecast_df = predictions[['ds','yhat']].copy()
        forecast_df.columns = ['TIMESTAMP','FORECAST']
        forecast_df['TRAIN_START'] = train_start
        forecast_df['TRAIN_END'] = train_end
        forecast_df['FORECAST_HORIZON'] = forecast_horizon
        forecast_df['LIBRARY_VERSION'] = str(prophet.__version__)

        yield from forecast_df.itertuples(index=False, name=None)

The version of package 'prophet' in the local environment is 1.1.5, which does not fit the criteria for the requirement 'prophet'. Your UDF might not work when the package version is different between the server and your local environment.


In [51]:
# Pack every column of each row into a single ROW object so it can be handed to
# the UDTF as one argument; only a handful of series are kept for this test run.
df = (
    session.table('TIME_SERIES_1K')
    .with_column('ROW', F.object_construct_keep_null('*'))
    .select(F.col('SERIES_ID'), F.col('ROW'))
    .filter(F.col('SERIES_ID').isin([1,2,3,4,5]))
)

# Handle to the registered UDTF.
store_forecast_test = F.table_function("TESTING_PROPHET_101")

variant_column = F.parse_json(df.col('ROW').cast(T.VariantType()))

# Call the UDTF once per SERIES_ID partition.
forecast_sdf = df.select(
    F.col('SERIES_ID'),
    store_forecast_test(variant_column).over(partition_by=['SERIES_ID']),
)

# Persist the forecasts, replacing any previous result table.
forecast_sdf.write.save_as_table("TEST_PROPHET_COMPASS", mode="overwrite")

## TEST Prophet==1.1.5 (Error)
Prophet==1.1.5 is not yet supported in the Snowflake Anaconda channel because it makes some system calls that are blocked by Snowflake. A solution is underway at Snowflake Engineering.

In [46]:
# Output columns of the forecasting UDTF, listed in the order the handler emits them.
schema = T.StructType([
    T.StructField(column_name, column_type)
    for column_name, column_type in (
        ("TIMESTAMP", T.DateType()),
        ("FORECAST", T.FloatType()),
        ("TRAIN_START", T.DateType()),
        ("TRAIN_END", T.DateType()),
        ("FORECAST_HORIZON", T.IntegerType()),
        ("LIBRARY_VERSION", T.StringType()),
    )
])

@F.udtf(session=session,
        output_schema = schema,
        input_types = [T.VariantType()],
        name = "TESTING_PROPHET_115_DEEN",
        is_permanent=True,
        stage_location="@TIME_SERIES.SYNTHETIC_DATA.TEST_PROPHET_115",
        packages=['pandas', 'holidays', 'snowflake-snowpark-python',
                  'importlib_resources', 'tqdm'],
        imports = ['@TIME_SERIES.SYNTHETIC_DATA.TEST_PROPHET_115/wheel_loader.py',
                   '@TIME_SERIES.SYNTHETIC_DATA.TEST_PROPHET_115/prophet-1.1.5.whl',
                   '@TIME_SERIES.SYNTHETIC_DATA.TEST_PROPHET_115/cmdstanpy-1.2.0-py3-none-any.whl',
                   '@TIME_SERIES.SYNTHETIC_DATA.TEST_PROPHET_115/stanio-0.4.0-py3-none-any.whl'],
        replace=True
       )

class forecast:
    """UDTF handler that fits one Prophet model per partition using staged wheels.

    `process` only buffers the incoming rows; `end_partition` loads the
    prophet/stanio/cmdstanpy wheels through `wheel_loader`, builds the training
    frame from the buffered rows, fits Prophet and yields one tuple per
    predicted date in the column order
    (TIMESTAMP, FORECAST, TRAIN_START, TRAIN_END, FORECAST_HORIZON, LIBRARY_VERSION).
    """

    def __init__(self):
        # Rows received since the last chunk was built.
        self.rows=[]
        # DataFrame chunks built from the buffered rows.
        self.dfs=[]
    
    def process(self, data):
        """Buffer one input row; rows are expected to carry DATE and VALUE keys."""
        self.rows.append(data)
        # Merge rows into a dataframe
        if len(self.rows) >= 16000:
            df = pd.DataFrame(self.rows)
            self.dfs.append(df)
            self.rows = []
        # Merge dataframes into a single dataframe. Minimizes memory footprint
        if len(self.dfs) >= 100:
            merged_df = pd.concat(self.dfs)
            self.dfs = [merged_df]
        # NOTE(review): a None is yielded per input row; the forecast rows are
        # produced in end_partition. Confirm how Snowflake treats these None rows.
        yield None
    
    def end_partition(self):
        """Load the staged wheels, fit Prophet on the partition and yield the forecast rows."""
        # The wheels must be loaded before prophet can be imported.
        import wheel_loader
        wheel_loader.load('prophet-1.1.5.whl')
        wheel_loader.load('stanio-0.4.0-py3-none-any.whl')
        wheel_loader.load('cmdstanpy-1.2.0-py3-none-any.whl')

        from prophet import Prophet
        import prophet

        # Flush the rows that have not been turned into a chunk yet.
        if len(self.rows) > 0:
            df = pd.DataFrame(self.rows)
            self.dfs.append(df)
            self.rows = []

        # Nothing was buffered: emit no rows instead of letting pd.concat raise.
        if not self.dfs:
            return

        # Preprocess Data: one observation per date in Prophet's ds/y layout
        df_data = pd.concat(self.dfs)
        df_data['ds'] = pd.to_datetime(df_data['DATE'])
        # Select VALUE explicitly: `.sum('VALUE')` would pass the string as
        # `numeric_only`, not as a column selector.
        df_data = df_data.groupby('ds', as_index=False)['VALUE'].sum()
        df_data = df_data.rename(columns={'VALUE':'y'})
        df_data = df_data[['ds','y']]
        df_data = df_data.sort_values(by=['ds']).reset_index(drop=True)

        # Training window length and forecast horizon, both in days
        train_length = 600
        forecast_horizon = 30
        train_end = df_data['ds'].max()
        train_start = train_end - pd.Timedelta(days=train_length)

        # Get training data: the window is (train_start, train_end]
        df_data = df_data.loc[(df_data['ds'] > train_start) & (df_data['ds'] <= train_end)]

        # Model fit and predict
        model = Prophet()
        model.fit(df_data)
        future = model.make_future_dataframe(periods=forecast_horizon)
        predictions = model.predict(future)

        # Post process forecast; copy the slice so the assignments below act on
        # an independent frame, and avoid shadowing the class name `forecast`.
        forecast_df = predictions[['ds','yhat']].copy()
        forecast_df.columns = ['TIMESTAMP','FORECAST']
        forecast_df['TRAIN_START'] = train_start
        forecast_df['TRAIN_END'] = train_end
        forecast_df['FORECAST_HORIZON'] = forecast_horizon
        forecast_df['LIBRARY_VERSION'] = str(prophet.__version__)

        yield from forecast_df.itertuples(index=False, name=None)

Package 'lightning-utilities' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
Package 'plotly' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
Package 'pmdarima' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
Package 'pytorch' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
Package 'pytorch-lightning' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
Package 'statsmodels' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local env