# Partitioned Custom Time Series Model - Nixta AutoArima

This notebook shows how to partition a time series model. It uses synthetic data and Nixta AutoArima. The notebooks shows how to test it locally and then run the model in a distributed fashion in Snowflake. I have also made it so you can push the datasets into a Snowflake table for running the inference from the Snowflake model registry.

In [1]:
import json
from snowflake.snowpark import Session
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend

with open('../../creds.json') as f:
    data = json.load(f)
    USERNAME = data['user']
    SF_ACCOUNT = data['account']
    SF_WH = data['warehouse']
    passphrase = data['passphrase']

# Read the private key from the .p8 file
with open('../../rsa_key.p8', 'rb') as key_file:
    private_key = key_file.read()

# If the private key is encrypted, load it with a passphrase
# Replace 'your_key_passphrase' with your actual passphrase if needed
private_key_obj = serialization.load_pem_private_key(
    private_key,
    password=passphrase.encode() if passphrase else None,
    backend=default_backend()
)

# Define connection parameters including the private key
CONNECTION_PARAMETERS = {
    'user': USERNAME,
    'account': SF_ACCOUNT,
    'private_key': private_key_obj,
    'warehouse': SF_WH,
}

# Create a session with the specified connection parameters
session = Session.builder.configs(CONNECTION_PARAMETERS).create()

from snowflake.core.warehouse import Warehouse
from snowflake.core import Root
root = Root(session)
from snowflake.snowpark.functions import col 
from time import time


In [2]:
from datetime import timedelta

import pandas as pd

from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature
from snowflake.ml.registry import registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F

from joblib import Parallel,delayed

In [3]:
from snowflake.snowpark.version import VERSION
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

from snowflake.ml import version
mlversion = version.VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))
print('Snowflake ML version        : {}.{}.{}'.format(mlversion[0],mlversion[2],mlversion[4]))

User                        : RSHAH
Role                        : "RAJIV"
Database                    : "RAJIV"
Schema                      : "PUBLIC"
Warehouse                   : "RAJIV"
Snowflake version           : 8.34.0
Snowpark for Python version : 1.20.0
Snowflake ML version        : 1.6.1


In [4]:
REGISTRY_DATABASE_NAME = "TPCDS_XGBOOST"
REGISTRY_SCHEMA_NAME = "DEMO"
reg = registry.Registry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)

## Generate Data

In [21]:
#Generate Series - Takes 10 minutes to run for 1M
#Only need to run this the first time
from statsforecast.utils import generate_series
#for length in [10_000, 100_000, 500_000, 1_000_000, 2_000_000]:
for length in [1_000_000]:
		print(f'length: {length}')
		series = generate_series(n_series=length, seed=1)

series

length: 1000000


Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-01-01,0.353326
0,2000-01-02,1.079316
0,2000-01-03,2.207718
0,2000-01-04,3.058734
0,2000-01-05,4.489969
...,...,...
999999,2001-03-11,1.167035
999999,2001-03-12,2.178976
999999,2001-03-13,3.124651
999999,2001-03-14,4.311360


In [22]:
# Prep data for training with subset
df = pd.DataFrame(series)
train_df = df.reset_index()
train_df.columns = ['ID', 'DS', 'Y']
train_df['ID'] = train_df['ID'].astype(int)

In [23]:
## Save series to Snowflake table - takes 45 minutes to run for 1M from slow hotel wifi
#Only need to run this the first time
snow_df = session.create_dataframe(train_df)
snow_df.write.mode('overwrite').save_as_table('TPCDS_XGBOOST.DEMO.Series1M')
#train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES2M')

In [24]:
session.sql('ALTER TABLE TPCDS_XGBOOST.DEMO.Series1M CLUSTER BY (ID)').collect()

[Row(status='Statement executed successfully.')]

## Train Model Locally

You want to use pandas for initial local testing

In [25]:
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, Naive

train_df = train_df[train_df['ID'] < 10_000]  ## only do this with 10k series for interactive notebook


df = train_df[train_df['ID'] == 2]
df.columns = ['unique_id', 'ds', 'y']
sf = StatsForecast(df=df,
                   models=[AutoARIMA(), Naive()],
                   freq='D', 
                   n_jobs=-1)
forecasts_df = sf.forecast(h=7)
forecasts_df.head()

  multiarray.copyto(a, fill_value, casting='unsafe')


Unnamed: 0_level_0,ds,AutoARIMA,Naive
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2001-03-22,5.88159,5.319668
2,2001-03-23,3.04602,5.319668
2,2001-03-24,3.888017,5.319668
2,2001-03-25,2.797169,5.319668
2,2001-03-26,3.821219,5.319668


In [26]:
class ForecastingModel(custom_model.CustomModel):

    # Use the same decorator as for methods with FUNCTION inference.
    @custom_model.partitioned_inference_api
    def predict(self, df: pd.DataFrame) -> pd.DataFrame:        
        from statsforecast import StatsForecast
        from statsforecast.models import AutoARIMA, Naive
        df.columns = ['unique_id', 'ds', 'y']
        df.head()
        model = StatsForecast(models=[AutoARIMA()],
                      freq='D',
                      n_jobs=-1)  ##chaning to 1 from -1

        forecasts_df = model.forecast(df=df, h=7)
        forecasts_df.columns = ['DSOUT', 'AUTOARIMA']
        return forecasts_df

In [27]:
ts_model = ForecastingModel()
local_predictions = ts_model.predict(train_df[train_df['ID'] == 2])
local_predictions

  multiarray.copyto(a, fill_value, casting='unsafe')


Unnamed: 0_level_0,DSOUT,AUTOARIMA
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,2001-03-22,5.88159
2,2001-03-23,3.04602
2,2001-03-24,3.888017
2,2001-03-25,2.797169
2,2001-03-26,3.821219
2,2001-03-27,3.503671
2,2001-03-28,2.811226


In [28]:
# Function to parallelize to verify the model
def parallel_predict(all_data, model):
    return model.predict(all_data)

# Assuming df1 is your complete dataset
all_groups = [group for _, group in train_df.groupby('ID')]

# Initialize the ForecastingModel
model = ForecastingModel()

# Parallel execution using Joblib
num_cores = -1  # Use all available cores
results = Parallel(n_jobs=num_cores)(delayed(parallel_predict)(all_data, model) for all_data in all_groups)
results
# Combine or process the results as needed

  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='un

[               DSOUT  AUTOARIMA
 unique_id                      
 0         2000-03-28   1.897706
 0         2000-03-29   1.482552
 0         2000-03-30   1.158219
 0         2000-03-31   0.904840
 0         2000-04-01   0.706891
 0         2000-04-02   0.552247
 0         2000-04-03   0.431434,
                DSOUT  AUTOARIMA
 unique_id                      
 1         2000-10-12   0.807545
 1         2000-10-13   1.725912
 1         2000-10-14   2.384233
 1         2000-10-15   0.127623
 1         2000-10-16   0.810122
 1         2000-10-17   1.577103
 1         2000-10-18   0.723131,
                DSOUT  AUTOARIMA
 unique_id                      
 2         2001-03-22   5.881590
 2         2001-03-23   3.046020
 2         2001-03-24   3.888017
 2         2001-03-25   2.797169
 2         2001-03-26   3.821219
 2         2001-03-27   3.503671
 2         2001-03-28   2.811226,
                DSOUT  AUTOARIMA
 unique_id                      
 3         2000-05-02   1.002825
 3     

## Train Model in Snowflake

In [29]:
options = {
    "function_type": "TABLE_FUNCTION",
}

mv = reg.log_model(
    ts_model,
    model_name="statsforecast",
    version_name="v12",
    conda_dependencies=["pandas", "statsforecast"],
    options=options,
    signatures={
        "predict": model_signature.ModelSignature(
            inputs=[
                model_signature.FeatureSpec(name="ID", dtype=model_signature.DataType.INT64),
                model_signature.FeatureSpec(name="DS", dtype=model_signature.DataType.TIMESTAMP_NTZ),
                model_signature.FeatureSpec(name="Y", dtype=model_signature.DataType.DOUBLE),
            ],
            outputs=[
               # model_signature.FeatureSpec(name="ID", dtype=model_signature.DataType.INT64),
                model_signature.FeatureSpec(name="DSOUT", dtype=model_signature.DataType.TIMESTAMP_NTZ),
                model_signature.FeatureSpec(name="AUTOARIMA", dtype=model_signature.DataType.FLOAT),
            ],
        )
    },
)

  return next(self.gen)


In [30]:
reg_model = reg.get_model("statsforecast").version("v12") 

In [31]:
## running the pandas dataframe in a distributed way for training the models
result = reg_model.run(train_df, partition_column="ID")



In [32]:
result.head(20)

Unnamed: 0,DSOUT,AUTOARIMA,ID
0,2000-06-30,1.225607,3604
1,2000-07-01,1.060109,3604
2,2000-07-02,0.916958,3604
3,2000-07-03,0.793138,3604
4,2000-07-04,0.686038,3604
5,2000-07-05,0.593399,3604
6,2000-07-06,0.513271,3604
7,2001-04-04,3.805211,350
8,2001-04-05,3.869343,350
9,2001-04-06,4.533864,350


In [33]:
## Let's get a snowpark dataframe 
test_df = session.create_dataframe(train_df)
test_df.write.mode('overwrite').save_as_table('TPCDS_XGBOOST.DEMO.TEMPTS')
df2 = session.table('TPCDS_XGBOOST.DEMO.TEMPTS')
df2.show()

  success, _, _, ci_output = write_pandas(


---------------------------------------------------
|"ID"  |"DS"                 |"Y"                 |
---------------------------------------------------
|7284  |2000-06-26 00:00:00  |5.227254591561852   |
|7284  |2000-06-27 00:00:00  |6.0076554617818685  |
|7284  |2000-06-28 00:00:00  |0.4133436228988594  |
|7284  |2000-06-29 00:00:00  |1.4386762511824545  |
|7284  |2000-06-30 00:00:00  |2.403646226814616   |
|7284  |2000-07-01 00:00:00  |3.2825826649424137  |
|7284  |2000-07-02 00:00:00  |4.220694819644106   |
|7284  |2000-07-03 00:00:00  |5.26042403532275    |
|7284  |2000-07-04 00:00:00  |6.489685737362617   |
|7284  |2000-07-05 00:00:00  |0.3840139007826227  |
---------------------------------------------------



In [34]:
##Run the model across the Snowflake DF. It should be faster than the pandas dataframe - especially at scale.
result = reg_model.run(df2, partition_column="ID",function_name="PREDICT").collect()

In [35]:
result

[Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 5, 0, 0), AUTOARIMA=0.41019728779792786, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 6, 0, 0), AUTOARIMA=1.6095925569534302, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 7, 0, 0), AUTOARIMA=2.3790667057037354, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 8, 0, 0), AUTOARIMA=-0.1835666000843048, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 9, 0, 0), AUTOARIMA=0.47603434324264526, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 10, 0, 0), AUTOARIMA=1.7927964925765991, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 11, 0, 0), AUTOARIMA=0.6795849204063416, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 3, 3, 0, 0), AUTOARIMA=3.490996837615967, ID=5970),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 3, 4, 0, 0), AUTOARIMA=2.76536226272583, ID=5970),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2000, 3

In [36]:
#Use this to test different warehouse sizes
snowpark_opt_wh = Warehouse(
  name="snowpark_opt_wh",
  warehouse_size="MEDIUM",
  warehouse_type = "SNOWPARK-OPTIMIZED",
  auto_suspend=600,
)
warehouses = root.warehouses["snowpark_opt_wh"]
warehouses.create_or_alter(snowpark_opt_wh)
session.use_warehouse("snowpark_opt_wh")

session.sql('alter session set USE_CACHED_RESULT = FALSE').collect()
session.sql('alter session set query_tag = "TS-LARGE-Chase" ').collect()
#session.sql('alter warehouse snowpark_opt_wh set max_concurrency_level = 1').collect()

print(session.get_current_warehouse())

"SNOWPARK_OPT_WH"


In [37]:
# An evaluation loop to see how the model does at the different series lengths
lengths = [10_000, 50_000, 100_000, 500_000, 1_000_000, 2_000_000]
#lengths = [10_000]

train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES1M')
print(session.get_current_warehouse())

for length in lengths:
  print ("prepping data")
  unique_ids_df = train_df.select("ID").distinct().limit(length)
  filtered_df = train_df.join(unique_ids_df, on="ID", how="inner").cache_result() #added cache result
  print(unique_ids_df.count())
  filtered_df.write.mode('overwrite').save_as_table('TPCDS_XGBOOST.DEMO.TEMP_TS')
  session.sql('ALTER TABLE TPCDS_XGBOOST.DEMO.TEMP_TS CLUSTER BY (ID)').collect()
  df2 = session.table('TPCDS_XGBOOST.DEMO.TEMP_TS')
  print ("starting training")
  init = time()
  # Run the regression model
  result = reg_model.run(df2, partition_column="ID").collect()
  total_time = (time() - init) / 60
  print(f'n_series: {length} total time: {total_time} total rows: {filtered_df.count()}')

"SNOWPARK_OPT_WH"
prepping data
10000
starting training
n_series: 10000 total time: 34.58541253805161 total rows: 2733652
prepping data
50000
starting training
