# Partitioned Custom Time Series Model - Statsmodels Exponential Smoothing

This notebook shows how to partition a time series model. It uses synthetic data and statsmodels exponetial smoothing. The notebooks shows how to test it locally and then run the model in a distributed fashion in Snowflake. I have also made it so you can push the datasets into a Snowflake table for running the inference from the Snowflake model registry.

In [1]:
import json
from snowflake.snowpark import Session
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend

with open('../../creds.json') as f:
    data = json.load(f)
    USERNAME = data['user']
    SF_ACCOUNT = data['account']
    SF_WH = data['warehouse']
    passphrase = data['passphrase']

# Read the private key from the .p8 file
with open('../../rsa_key.p8', 'rb') as key_file:
    private_key = key_file.read()

# If the private key is encrypted, load it with a passphrase
# Replace 'your_key_passphrase' with your actual passphrase if needed
private_key_obj = serialization.load_pem_private_key(
    private_key,
    password=passphrase.encode() if passphrase else None,
    backend=default_backend()
)

# Define connection parameters including the private key
CONNECTION_PARAMETERS = {
    'user': USERNAME,
    'account': SF_ACCOUNT,
    'private_key': private_key_obj,
    'warehouse': SF_WH,
}

# Create a session with the specified connection parameters
session = Session.builder.configs(CONNECTION_PARAMETERS).create()

from snowflake.core.warehouse import Warehouse
from snowflake.core import Root
root = Root(session)
from snowflake.snowpark.functions import col 
from time import time


In [2]:
from datetime import timedelta

import pandas as pd

from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature
from snowflake.ml.registry import registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F

from joblib import Parallel,delayed

In [3]:
from snowflake.snowpark.version import VERSION
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

from snowflake.ml import version
mlversion = version.VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))
print('Snowflake ML version        : {}.{}.{}'.format(mlversion[0],mlversion[2],mlversion[4]))

User                        : RSHAH
Role                        : "RAJIV"
Database                    : "RAJIV"
Schema                      : "PUBLIC"
Warehouse                   : "RAJIV"
Snowflake version           : 8.34.0
Snowpark for Python version : 1.20.0
Snowflake ML version        : 1.6.1


In [5]:
REGISTRY_DATABASE_NAME = "TPCDS_XGBOOST"
REGISTRY_SCHEMA_NAME = "DEMO"
reg = registry.Registry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)

## Generate Data

In [4]:
#Generate Series - Takes 2 minutes to run
#Only need to run this the first time
from statsforecast.utils import generate_series
#for length in [10_000, 100_000, 500_000, 1_000_000, 2_000_000]:
#for length in [1_000]:
for length in [10]:
		print(f'length: {length}')
		series = generate_series(n_series=length, seed=1)

series

  from tqdm.autonotebook import tqdm


length: 10


Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-01-01,0.046169
0,2000-01-02,1.093130
0,2000-01-03,2.172780
0,2000-01-04,3.198384
0,2000-01-05,4.269408
...,...,...
9,2001-05-08,6.136047
9,2001-05-09,0.476046
9,2001-05-10,1.334887
9,2001-05-11,2.333404


In [6]:
# Prep data for training
df = pd.DataFrame(series)
train_df = df.reset_index()
train_df.columns = ['ID', 'DS', 'Y']
train_df

Unnamed: 0,ID,DS,Y
0,0,2000-01-01,0.046169
1,0,2000-01-02,1.093130
2,0,2000-01-03,2.172780
3,0,2000-01-04,3.198384
4,0,2000-01-05,4.269408
...,...,...,...
3002,9,2001-05-08,6.136047
3003,9,2001-05-09,0.476046
3004,9,2001-05-10,1.334887
3005,9,2001-05-11,2.333404


In [55]:
## Save series to Snowflake table
#Only need to run this the first time
test_df = session.create_dataframe(train_df)
test_df.write.mode('overwrite').save_as_table('TPCDS_XGBOOST.DEMO.Series2M')
train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES2M')

NameError: name 'series' is not defined

In [50]:
#Retrieve from Snowflake -- 
train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES2M')
train_df.show()

--------------------------------------------------------
|"ID"    |"DS"                 |"Y"                    |
--------------------------------------------------------
|181695  |2000-02-22 00:00:00  |6.219272538160337      |
|181695  |2000-02-23 00:00:00  |0.3076429294607981     |
|181695  |2000-02-24 00:00:00  |1.197810254827208      |
|181695  |2000-02-25 00:00:00  |2.173458515198763      |
|181695  |2000-02-26 00:00:00  |3.102199405394565      |
|181695  |2000-02-27 00:00:00  |4.376139372280642      |
|181695  |2000-02-28 00:00:00  |5.375742028359614      |
|181695  |2000-02-29 00:00:00  |6.147630148293396      |
|181695  |2000-03-01 00:00:00  |0.0025383417716690615  |
|181695  |2000-03-02 00:00:00  |1.0790424184236609     |
--------------------------------------------------------



## Train Model Locally

In [7]:
# Local Test for Exponential Smoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
df = train_df[train_df['ID'] == 1]
df.set_index('DS', inplace=True)
model = ExponentialSmoothing(df['Y'], seasonal=None, trend='add', damped_trend=False)
fit = model.fit()
forecast = fit.forecast(steps=6)
forecast_df = pd.DataFrame({
                'DATE': forecast.index,
                'FORECAST': forecast.values
            })
forecast_df.head(6)

  self._init_dates(dates, freq)


Unnamed: 0,DATE,FORECAST
0,2000-10-12,3.273099
1,2000-10-13,3.273123
2,2000-10-14,3.273147
3,2000-10-15,3.273171
4,2000-10-16,3.273195
5,2000-10-17,3.273219


In [9]:
## ES Model
class ForecastingModel(custom_model.CustomModel):
    # Use the same decorator as for methods with FUNCTION inference.
    @custom_model.inference_api
    def predict(self, df:pd.DataFrame) -> pd.DataFrame:    #Please keep input and output here as pandas   
        ################## Replace below with your python code ######################################## 
        import pandas as pd
        from statsmodels.tsa.holtwinters import ExponentialSmoothing
        from datetime import datetime, timedelta
        print (df.head())
        print (df.columns)
        df = df.drop_duplicates(subset=['DS'])
        df.set_index('DS', inplace=True)
        df = df.asfreq('D') 
        model = ExponentialSmoothing(df['Y'], seasonal=None, trend='add', damped_trend=False,freq='D')
        fit = model.fit()
        forecast = fit.forecast(steps=7)
        forecast_df = pd.DataFrame({
                        'DATE': forecast.index,
                        'FORECAST': forecast.values
                    })
        return forecast_df

In [16]:
es_model = ForecastingModel()
local_predictions = es_model.predict(train_df[train_df['ID'] == 1])
#local_predictions = es_model.predict(train_df)
local_predictions

   ID         DS         Y
87  1 2000-01-01  3.224956
88  1 2000-01-02  4.289195
89  1 2000-01-03  5.204068
90  1 2000-01-04  6.118513
91  1 2000-01-05  0.451690
Index(['ID', 'DS', 'Y'], dtype='object')


Unnamed: 0,DATE,FORECAST
0,2000-10-12,3.273099
1,2000-10-13,3.273123
2,2000-10-14,3.273147
3,2000-10-15,3.273171
4,2000-10-16,3.273195
5,2000-10-17,3.273219
6,2000-10-18,3.273243


In [14]:
train_df.shape

(3007, 3)

In [15]:
local_predictions.shape

(7, 2)

In [27]:
# Function to parallelize
def parallel_predict(all_data, model):
    return model.predict(all_data)

# Assuming df1 is your complete dataset
all_groups = [group for _, group in train_df.groupby('ID')]

# Initialize the ForecastingModel
model = ForecastingModel()

# Parallel execution using Joblib
num_cores = -1  # Use all available cores
results = Parallel(n_jobs=num_cores)(delayed(parallel_predict)(all_data, model) for all_data in all_groups)

# Combine or process the results as needed

    ID         DS         Y
372  2 2000-01-01  1.346339
373  2 2000-01-02  2.345471
374  2 2000-01-03  3.094318
375  2 2000-01-04  4.220952
376  2 2000-01-05  5.290789
Index(['ID', 'DS', 'Y'], dtype='object')
    ID         DS         Y
940  4 2000-01-01  2.383889
941  4 2000-01-02  3.145149
942  4 2000-01-03  4.108446
943  4 2000-01-04  5.008358
944  4 2000-01-05  6.199330
Index(['ID', 'DS', 'Y'], dtype='object')
    ID         DS         Y
818  3 2000-01-01  6.113649
819  3 2000-01-02  0.212325
820  3 2000-01-03  1.185961
821  3 2000-01-04  2.177654
822  3 2000-01-05  3.028827
Index(['ID', 'DS', 'Y'], dtype='object')
  ID         DS         Y
0  0 2000-01-01  0.046169
1  0 2000-01-02  1.093130
2  0 2000-01-03  2.172780
3  0 2000-01-04  3.198384
4  0 2000-01-05  4.269408
Index(['ID', 'DS', 'Y'], dtype='object')
   ID         DS         Y
87  1 2000-01-01  3.224956
88  1 2000-01-02  4.289195
89  1 2000-01-03  5.204068
90  1 2000-01-04  6.118513
91  1 2000-01-05  0.451690
Index(['ID', '

## Train Model in Snowflake

In [28]:
options = {
    "function_type": "TABLE_FUNCTION",
}
mv = reg.log_model(
    es_model,
    model_name="es_forecast",
    version_name="v8",
    conda_dependencies=['pandas', 'statsmodels', 'snowflake-snowpark-python'],
    options=options,
    #sample_input_data=df_reset[df_reset['ID'] == 1],
    signatures={
        "predict": model_signature.ModelSignature(
            inputs=[
                model_signature.FeatureSpec(name="ID", dtype=model_signature.DataType.INT64),
                model_signature.FeatureSpec(name="DS", dtype=model_signature.DataType.TIMESTAMP_NTZ),
                model_signature.FeatureSpec(name="Y", dtype=model_signature.DataType.FLOAT),
            ],
            outputs=[
                model_signature.FeatureSpec(name="DSOUT", dtype=model_signature.DataType.TIMESTAMP_NTZ),
                model_signature.FeatureSpec(name="FORECAST", dtype=model_signature.DataType.FLOAT),
            ],
         )
     },
)

  return next(self.gen)


In [29]:
reg_model = reg.get_model("es_forecast").version("v8")  #v8 is njobs=-1 and v9 is njobs=1

In [31]:
#result = reg_model.run(df_reset, partition_column="ID",function_name="PREDICT").collect()
result = reg_model.run(train_df, partition_column="ID")

SnowparkSQLException: (1300) (1304): 01b708a6-0002-ef0b-0023-fc8702cabde2: 100357 (P0000): Expected 183 rows in the output given 183 rows in the input, but received 7 in function V8.PREDICT with handler predict.infer

In [32]:
snowpark_opt_wh = Warehouse(
  name="snowpark_opt_wh",
  warehouse_size="MEDIUM",
  warehouse_type = "SNOWPARK-OPTIMIZED",
  auto_suspend=600,
)
warehouses = root.warehouses["snowpark_opt_wh"]
warehouses.create_or_alter(snowpark_opt_wh)
session.use_warehouse("snowpark_opt_wh")

session.sql('alter session set USE_CACHED_RESULT = FALSE').collect()
session.sql('alter session set query_tag = "TS-LARGE-Chase" ').collect()
#session.sql('alter warehouse snowpark_opt_wh set max_concurrency_level = 1').collect()

print(session.get_current_warehouse())

"SNOWPARK_OPT_WH"


In [57]:
lengths = [10_000, 50_000, 100_000, 500_000, 1_000_000,2_000_000]
lengths = [10_000]

for length in lengths:
  unique_ids_df = train_df.select("ID").distinct().limit(length)
  filtered_df = train_df.join(unique_ids_df, on="ID", how="inner").cache_result() #added cache result
  print(unique_ids_df.count())
  init = time()
  # Run the regression model
  result = reg_model.run(filtered_df, partition_column="ID").collect()
  total_time = (time() - init) / 60
  print(f'n_series: {length} total time: {total_time} total rows: {filtered_df.count()}')

10000


SnowparkSQLException: (1304): 01b7064c-0002-ee9f-0023-fc8702c9a136: 100357 (P0000): Expected 222 rows in the output given 222 rows in the input, but received 6 in function V3.PREDICT with handler predict.infer