# Partitioned Custom Time Series Model - Nixta AutoArima

This notebook shows how to partition a time series model. It uses synthetic data and Nixta AutoArima. The notebooks shows how to test it locally and then run the model in a distributed fashion in Snowflake. I have also made it so you can push the datasets into a Snowflake table for running the inference from the Snowflake model registry.

In [94]:
import json
from snowflake.snowpark import Session
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend

with open('../../creds.json') as f:
    data = json.load(f)
    USERNAME = data['user']
    SF_ACCOUNT = data['account']
    SF_WH = data['warehouse']
    passphrase = data['passphrase']

# Read the private key from the .p8 file
with open('../../rsa_key.p8', 'rb') as key_file:
    private_key = key_file.read()

# If the private key is encrypted, load it with a passphrase
# Replace 'your_key_passphrase' with your actual passphrase if needed
private_key_obj = serialization.load_pem_private_key(
    private_key,
    password=passphrase.encode() if passphrase else None,
    backend=default_backend()
)

# Define connection parameters including the private key
CONNECTION_PARAMETERS = {
    'user': USERNAME,
    'account': SF_ACCOUNT,
    'private_key': private_key_obj,
    'warehouse': SF_WH,
}

# Create a session with the specified connection parameters
session = Session.builder.configs(CONNECTION_PARAMETERS).create()

from snowflake.core.warehouse import Warehouse
from snowflake.core import Root
root = Root(session)
from snowflake.snowpark.functions import col 
from time import time


In [95]:
from datetime import timedelta

import pandas as pd

from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature
from snowflake.ml.registry import registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F

from joblib import Parallel,delayed

In [96]:
from snowflake.snowpark.version import VERSION
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

from snowflake.ml import version
mlversion = version.VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))
print('Snowflake ML version        : {}.{}.{}'.format(mlversion[0],mlversion[2],mlversion[4]))

User                        : RSHAH
Role                        : "RAJIV"
Database                    : "RAJIV"
Schema                      : "PUBLIC"
Warehouse                   : "RAJIV"
Snowflake version           : 8.35.1
Snowpark for Python version : 1.20.0
Snowflake ML version        : 1.6.1


In [97]:
REGISTRY_DATABASE_NAME = "TPCDS_XGBOOST"
REGISTRY_SCHEMA_NAME = "DEMO"
reg = registry.Registry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)

## Generate Data

In [68]:
#Generate Series - Takes 10 minutes to run for 1M
#Only need to run this the first time
from statsforecast.utils import generate_series
#for length in [10_000, 100_000, 500_000, 1_000_000, 2_000_000]:
for length in [10_000]:
		print(f'length: {length}')
		series = generate_series(n_series=length, seed=1)

series

length: 10000


Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-01-01,0.227933
0,2000-01-02,1.094629
0,2000-01-03,2.129485
0,2000-01-04,3.321956
0,2000-01-05,4.310279
...,...,...
9999,2000-11-08,0.077594
9999,2000-11-09,1.126169
9999,2000-11-10,2.478491
9999,2000-11-11,3.140820


In [69]:
# Prep data for training with subset
df = pd.DataFrame(series)
train_df = df.reset_index()
train_df.columns = ['ID', 'DS', 'Y']
train_df['ID'] = train_df['ID'].astype(int)

In [23]:
## Save series to Snowflake table - takes 45 minutes to run for 1M from slow hotel wifi
#Only need to run this the first time
snow_df = session.create_dataframe(train_df)
snow_df.write.mode('overwrite').save_as_table('TPCDS_XGBOOST.DEMO.Series1M')
#train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES2M')

In [24]:
session.sql('ALTER TABLE TPCDS_XGBOOST.DEMO.Series1M CLUSTER BY (ID)').collect()

[Row(status='Statement executed successfully.')]

## Train Model Locally

You want to use pandas for initial local testing

In [25]:
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, Naive

train_df = train_df[train_df['ID'] < 10_000]  ## only do this with 10k series for interactive notebook


df = train_df[train_df['ID'] == 2]
df.columns = ['unique_id', 'ds', 'y']
sf = StatsForecast(df=df,
                   models=[AutoARIMA(), Naive()],
                   freq='D', 
                   n_jobs=-1)
forecasts_df = sf.forecast(h=7)
forecasts_df.head()

  multiarray.copyto(a, fill_value, casting='unsafe')


Unnamed: 0_level_0,ds,AutoARIMA,Naive
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2001-03-22,5.88159,5.319668
2,2001-03-23,3.04602,5.319668
2,2001-03-24,3.888017,5.319668
2,2001-03-25,2.797169,5.319668
2,2001-03-26,3.821219,5.319668


In [26]:
class ForecastingModel(custom_model.CustomModel):

    # Use the same decorator as for methods with FUNCTION inference.
    @custom_model.partitioned_inference_api
    def predict(self, df: pd.DataFrame) -> pd.DataFrame:        
        from statsforecast import StatsForecast
        from statsforecast.models import AutoARIMA, Naive
        df.columns = ['unique_id', 'ds', 'y']
        df.head()
        model = StatsForecast(models=[AutoARIMA()],
                      freq='D',
                      n_jobs=-1)  ##chaning to 1 from -1

        forecasts_df = model.forecast(df=df, h=7)
        forecasts_df.columns = ['DSOUT', 'AUTOARIMA']
        return forecasts_df

In [27]:
ts_model = ForecastingModel()
local_predictions = ts_model.predict(train_df[train_df['ID'] == 2])
local_predictions

  multiarray.copyto(a, fill_value, casting='unsafe')


Unnamed: 0_level_0,DSOUT,AUTOARIMA
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,2001-03-22,5.88159
2,2001-03-23,3.04602
2,2001-03-24,3.888017
2,2001-03-25,2.797169
2,2001-03-26,3.821219
2,2001-03-27,3.503671
2,2001-03-28,2.811226


In [28]:
# Function to parallelize to verify the model
def parallel_predict(all_data, model):
    return model.predict(all_data)

# Assuming df1 is your complete dataset
all_groups = [group for _, group in train_df.groupby('ID')]

# Initialize the ForecastingModel
model = ForecastingModel()

# Parallel execution using Joblib
num_cores = -1  # Use all available cores
results = Parallel(n_jobs=num_cores)(delayed(parallel_predict)(all_data, model) for all_data in all_groups)
results
# Combine or process the results as needed

  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='un

[               DSOUT  AUTOARIMA
 unique_id                      
 0         2000-03-28   1.897706
 0         2000-03-29   1.482552
 0         2000-03-30   1.158219
 0         2000-03-31   0.904840
 0         2000-04-01   0.706891
 0         2000-04-02   0.552247
 0         2000-04-03   0.431434,
                DSOUT  AUTOARIMA
 unique_id                      
 1         2000-10-12   0.807545
 1         2000-10-13   1.725912
 1         2000-10-14   2.384233
 1         2000-10-15   0.127623
 1         2000-10-16   0.810122
 1         2000-10-17   1.577103
 1         2000-10-18   0.723131,
                DSOUT  AUTOARIMA
 unique_id                      
 2         2001-03-22   5.881590
 2         2001-03-23   3.046020
 2         2001-03-24   3.888017
 2         2001-03-25   2.797169
 2         2001-03-26   3.821219
 2         2001-03-27   3.503671
 2         2001-03-28   2.811226,
                DSOUT  AUTOARIMA
 unique_id                      
 3         2000-05-02   1.002825
 3     

## Train Model in Snowflake

In [29]:
options = {
    "function_type": "TABLE_FUNCTION",
}

mv = reg.log_model(
    ts_model,
    model_name="statsforecast",
    version_name="v12",
    conda_dependencies=["pandas", "statsforecast"],
    options=options,
    signatures={
        "predict": model_signature.ModelSignature(
            inputs=[
                model_signature.FeatureSpec(name="ID", dtype=model_signature.DataType.INT64),
                model_signature.FeatureSpec(name="DS", dtype=model_signature.DataType.TIMESTAMP_NTZ),
                model_signature.FeatureSpec(name="Y", dtype=model_signature.DataType.DOUBLE),
            ],
            outputs=[
               # model_signature.FeatureSpec(name="ID", dtype=model_signature.DataType.INT64),
                model_signature.FeatureSpec(name="DSOUT", dtype=model_signature.DataType.TIMESTAMP_NTZ),
                model_signature.FeatureSpec(name="AUTOARIMA", dtype=model_signature.DataType.FLOAT),
            ],
        )
    },
)

  return next(self.gen)


In [98]:
reg_model = reg.get_model("statsforecast").version("v12") 

In [71]:
train_df

Unnamed: 0,ID,DS,Y
0,0,2000-01-01,0.227933
1,0,2000-01-02,1.094629
2,0,2000-01-03,2.129485
3,0,2000-01-04,3.321956
4,0,2000-01-05,4.310279
...,...,...,...
2750454,9999,2000-11-08,0.077594
2750455,9999,2000-11-09,1.126169
2750456,9999,2000-11-10,2.478491
2750457,9999,2000-11-11,3.140820


In [72]:
## running the pandas dataframe in a distributed way for training the models
result = reg_model.run(train_df, partition_column="ID")



In [60]:
result.head(20)

Unnamed: 0,DSOUT,AUTOARIMA,ID
0,2000-07-19,2.736614,7
1,2000-07-20,2.157914,7
2,2000-07-21,1.701589,7
3,2000-07-22,1.341761,7
4,2000-07-23,1.058025,7
5,2000-07-24,0.834289,7
6,2000-07-25,0.657866,7
7,2000-07-19,1.765691,19
8,2000-07-20,1.528045,19
9,2000-07-21,1.322384,19


In [77]:
## Let's get a snowpark dataframe 
test_df = session.create_dataframe(train_df)
test_df.write.mode('overwrite').save_as_table('TPCDS_XGBOOST.DEMO.TEMPTS')
df2 = session.table('TPCDS_XGBOOST.DEMO.TEMPTS')
df2.show()

TypeError: create_dataframe() function only accepts data as a list, tuple or a pandas DataFrame.

In [78]:
df2.show()

------------------------------------------------------
|"ID"    |"DS"                 |"Y"                  |
------------------------------------------------------
|643433  |2000-01-01 00:00:00  |6.492034163476288    |
|643433  |2000-01-02 00:00:00  |0.03515953942042088  |
|643433  |2000-01-03 00:00:00  |1.4259376110223878   |
|643433  |2000-01-04 00:00:00  |2.3263864999001327   |
|643433  |2000-01-05 00:00:00  |3.423718062569158    |
|643433  |2000-01-06 00:00:00  |4.395926604157056    |
|643433  |2000-01-07 00:00:00  |5.4948256659885715   |
|643433  |2000-01-08 00:00:00  |6.489458942201171    |
|643433  |2000-01-09 00:00:00  |0.4950811481325529   |
|643433  |2000-01-10 00:00:00  |1.2284352874344027   |
------------------------------------------------------



In [80]:
##Run the model across the Snowflake DF. It should be faster than the pandas dataframe - especially at scale.
df2 = session.table('TPCDS_XGBOOST.DEMO.TEMPTS')
result = reg_model.run(df2, partition_column="ID",function_name="PREDICT").collect()

In [63]:
result

[Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 19, 0, 0), AUTOARIMA=-1.070127248764038, ID=59),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 20, 0, 0), AUTOARIMA=0.2209031730890274, ID=59),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 21, 0, 0), AUTOARIMA=-1.2555382251739502, ID=59),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 22, 0, 0), AUTOARIMA=0.7259554266929626, ID=59),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 23, 0, 0), AUTOARIMA=0.0, ID=59),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 24, 0, 0), AUTOARIMA=0.0, ID=59),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 25, 0, 0), AUTOARIMA=0.0, ID=59),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 19, 0, 0), AUTOARIMA=2.7366137504577637, ID=7),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 20, 0, 0), AUTOARIMA=2.15791392326355, ID=7),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 7, 21, 0, 0), AUTOARIMA=1.7015891075134277, ID=7),
 Row(DS=N

In [91]:
#Use this to test different warehouse sizes
snowpark_opt_wh = Warehouse(
  name="snowpark_opt_wh",
  warehouse_size="MEDIUM",
  warehouse_type = "SNOWPARK-OPTIMIZED",
  auto_suspend=600,
)
warehouses = root.warehouses["snowpark_opt_wh"]
warehouses.create_or_alter(snowpark_opt_wh)
session.use_warehouse("snowpark_opt_wh")

session.sql('alter session set USE_CACHED_RESULT = FALSE').collect()
session.sql('alter session set query_tag = "TS-LARGE-Chase" ').collect()
#session.sql('alter warehouse snowpark_opt_wh set max_concurrency_level = 1').collect()

print(session.get_current_warehouse())

"SNOWPARK_OPT_WH"


In [100]:
# An evaluation loop to see how the model does at the different series lengths
lengths = [10_000, 50_000, 100_000, 500_000, 1_000_000, 2_000_000]
#lengths = [10_000]

train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES2M')
print(session.get_current_warehouse())

for length in lengths:
  session.use_warehouse("RAJIV")
  print ("prepping data for ", length)
  df2 = train_df.filter((col("ID") >= 0) & (col("ID") <= (length-1)))
  print ("starting training")
  init = time()
  # Run the regression model
  result = reg_model.run(df2, partition_column="ID").collect()
  total_time = (time() - init) / 60
  print(f'n_series: {length} total time: {total_time} total rows: {df2.count()}')

"RAJIV"
prepping data for  10000
starting training
n_series: 10000 total time: 1.532546631495158 total rows: 2750459
prepping data for  50000
starting training
n_series: 50000 total time: 5.431473215421041 total rows: 13776842
prepping data for  100000
starting training
n_series: 100000 total time: 9.905567797025045 total rows: 27514605
prepping data for  500000
starting training
n_series: 500000 total time: 46.26142946481705 total rows: 137488809
prepping data for  1000000
starting training
n_series: 1000000 total time: 90.53850088516872 total rows: 274926721
prepping data for  2000000
starting training
n_series: 2000000 total time: 180.65560415188472 total rows: 549884998
