# Partitioned Custom Time Series Model - Nixtla AutoARIMA

This notebook shows how to partition a time series model. It uses synthetic data and Nixtla's AutoARIMA. The notebook shows how to test the model locally and then run it in a distributed fashion in Snowflake. It also lets you push the datasets into a Snowflake table so that inference can be run from the Snowflake model registry.

In [1]:
import json
from snowflake.snowpark import Session
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend

# Load the Snowflake connection settings from the local credentials file.
with open('../../creds.json') as creds_file:
    data = json.load(creds_file)
    USERNAME = data['user']
    SF_ACCOUNT = data['account']
    SF_WH = data['warehouse']
    # The passphrase is optional (an unencrypted key needs none), so a missing
    # entry is treated like an empty one instead of raising KeyError.
    passphrase = data.get('passphrase')

# Read the private key from the .p8 file
with open('../../rsa_key.p8', 'rb') as key_file:
    private_key = key_file.read()

# Deserialize the PEM key. The passphrase from creds.json is passed only when
# one is configured; otherwise the key is loaded as unencrypted.
private_key_obj = serialization.load_pem_private_key(
    private_key,
    password=passphrase.encode() if passphrase else None,
    backend=default_backend()
)

# Define connection parameters including the private key
CONNECTION_PARAMETERS = {
    'user': USERNAME,
    'account': SF_ACCOUNT,
    'private_key': private_key_obj,
    'warehouse': SF_WH,
}

# Create a session with the specified connection parameters
session = Session.builder.configs(CONNECTION_PARAMETERS).create()

from snowflake.core.warehouse import Warehouse
from snowflake.core import Root
# snowflake.core Root object built on the Snowpark session; used further down
# to reach the warehouse collection (root.warehouses).
root = Root(session)
from snowflake.snowpark.functions import col 
from time import time


In [2]:
from datetime import timedelta

import pandas as pd

from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature
from snowflake.ml.registry import registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F

from joblib import Parallel,delayed

In [3]:
from snowflake.snowpark.version import VERSION
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

from snowflake.ml import version
mlversion = version.VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))
# Print the ML version value as-is. Picking positions 0, 2 and 4 out of it only
# yields the right text while every component is a single digit (e.g. "1.6.1")
# and garbles something like "1.10.0".
print('Snowflake ML version        : {}'.format(mlversion))

User                        : RSHAH
Role                        : "RAJIV"
Database                    : "RAJIV"
Schema                      : "PUBLIC"
Warehouse                   : "RAJIV"
Snowflake version           : 8.34.0
Snowpark for Python version : 1.20.0
Snowflake ML version        : 1.6.1


In [4]:
# Database and schema that hold the model registry used throughout the notebook.
REGISTRY_DATABASE_NAME = "TPCDS_XGBOOST"
REGISTRY_SCHEMA_NAME = "DEMO"

# Registry handle bound to the current session.
reg = registry.Registry(
    session=session,
    database_name=REGISTRY_DATABASE_NAME,
    schema_name=REGISTRY_SCHEMA_NAME,
)

## Generate Data

In [84]:
# Generate the synthetic series - takes 2 minutes to run.
# Only needs to run the first time.
from statsforecast.utils import generate_series

# Larger sizes to try: 10_000, 100_000, 500_000, 1_000_000, 2_000_000
for length in [5_000]:
    print(f'length: {length}')
    series = generate_series(n_series=length, seed=1)

series

length: 5000


Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-01-01,0.268059
0,2000-01-02,1.003042
0,2000-01-03,2.103296
0,2000-01-04,3.370117
0,2000-01-05,4.469878
...,...,...
4999,2000-09-12,3.154511
4999,2000-09-13,4.177726
4999,2000-09-14,5.285619
4999,2000-09-15,6.358213


In [85]:
# Build the training frame: move the index into a column, relabel the three
# columns as ID / DS / Y, and cast the series identifier to int.
df = pd.DataFrame(series)
train_df = (
    df.reset_index()
    .set_axis(['ID', 'DS', 'Y'], axis=1)
    .astype({'ID': int})
)

In [55]:
## Save series to Snowflake table
# Only needs to run the first time.
test_df = session.create_dataframe(train_df)
series_writer = test_df.write.mode('overwrite')
series_writer.save_as_table('TPCDS_XGBOOST.DEMO.Series2M')
# To reload the saved series instead of regenerating them:
#train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES2M')

NameError: name 'series' is not defined

## Train Model Locally

Assuming you want to do this in pandas

In [69]:
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, Naive

# Take the single series with ID == 2 and relabel its columns to the
# unique_id / ds / y names passed to StatsForecast below.
df = train_df.loc[train_df['ID'] == 2].set_axis(['unique_id', 'ds', 'y'], axis=1)

# Fit AutoARIMA and a Naive baseline on daily data, then forecast 7 steps ahead.
forecast_models = [AutoARIMA(), Naive()]
sf = StatsForecast(
    df=df,
    models=forecast_models,
    freq='D',
    n_jobs=-1,
)
forecasts_df = sf.forecast(h=7)
forecasts_df.head()

  multiarray.copyto(a, fill_value, casting='unsafe')


Unnamed: 0_level_0,ds,AutoARIMA,Naive
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2001-03-22,5.302723,5.3417
2,2001-03-23,3.128467,5.3417
2,2001-03-24,3.467109,5.3417
2,2001-03-25,2.818156,5.3417
2,2001-03-26,3.630536,5.3417


In [57]:
class ForecastingModel(custom_model.CustomModel):
    """Custom model that fits AutoARIMA on the rows it receives and forecasts 7 steps."""

    # Use the same decorator as for methods with FUNCTION inference.
    @custom_model.partitioned_inference_api
    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        """Forecast the next 7 daily values for the series contained in ``df``.

        Args:
            df: Frame with exactly three columns which are relabelled, in
                order, to ``unique_id``, ``ds`` and ``y``.

        Returns:
            The StatsForecast output with its columns renamed to ``DSOUT``
            and ``AUTOARIMA``.
        """
        from statsforecast import StatsForecast
        from statsforecast.models import AutoARIMA

        # Relabel a copy so the caller's DataFrame keeps its own column names.
        series_df = df.copy()
        series_df.columns = ['unique_id', 'ds', 'y']

        # NOTE(review): n_jobs is -1 here although an earlier note mentioned
        # switching to 1 - confirm which setting this model version should use.
        model = StatsForecast(models=[AutoARIMA()],
                              freq='D',
                              n_jobs=-1)

        forecasts_df = model.forecast(df=series_df, h=7)
        # Assumes the forecast frame has exactly two columns (date, AutoARIMA);
        # the series identifier is presumably carried in the index - TODO confirm.
        forecasts_df.columns = ['DSOUT', 'AUTOARIMA']
        return forecasts_df

In [59]:
# Smoke-test the custom model locally on the single series with ID == 2.
ts_model = ForecastingModel()
single_series_df = train_df.loc[train_df['ID'] == 2]
local_predictions = ts_model.predict(single_series_df)
local_predictions

  multiarray.copyto(a, fill_value, casting='unsafe')


Unnamed: 0_level_0,DSOUT,AUTOARIMA
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,2001-03-22,5.301099
2,2001-03-23,2.879796
2,2001-03-24,3.087316
2,2001-03-25,3.792668
2,2001-03-26,2.623497
2,2001-03-27,1.802789
2,2001-03-28,2.402022


In [73]:
# Worker function handed to joblib for each group.
def parallel_predict(all_data, model):
    """Return ``model.predict(all_data)`` for a single group of rows."""
    predictions = model.predict(all_data)
    return predictions

# Split the full training frame into one DataFrame per series ID.
all_groups = [group for _, group in train_df.groupby('ID')]

# Initialize the ForecastingModel
model = ForecastingModel()

# Parallel execution using Joblib
num_cores = -1  # Use all available cores
results = Parallel(n_jobs=num_cores)(delayed(parallel_predict)(all_data, model) for all_data in all_groups)

# Preview only the first few per-series forecasts: `results` holds one frame
# per series ID, so echoing the whole list floods the notebook output.
# Combine or process the full list as needed.
results[:3]

  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='unsafe')
  multiarray.copyto(a, fill_value, casting='un

[               DSOUT  AUTOARIMA
 unique_id                      
 0         2000-03-28   1.679354
 0         2000-03-29   1.319702
 0         2000-03-30   1.037073
 0         2000-03-31   0.814973
 0         2000-04-01   0.640437
 0         2000-04-02   0.503280
 0         2000-04-03   0.395497,
                DSOUT  AUTOARIMA
 unique_id                      
 1         2000-10-12   0.275201
 1         2000-10-13   1.555724
 1         2000-10-14   0.059557
 1         2000-10-15   0.918007
 1         2000-10-16   0.290150
 1         2000-10-17   0.492024
 1         2000-10-18   0.317628,
                DSOUT  AUTOARIMA
 unique_id                      
 2         2001-03-22   5.567934
 2         2001-03-23   1.563272
 2         2001-03-24   1.397986
 2         2001-03-25   0.389886
 2         2001-03-26   0.059139
 2         2001-03-27   0.008970
 2         2001-03-28   0.001361,
                DSOUT  AUTOARIMA
 unique_id                      
 3         2000-05-02   1.049066
 3     

## Train Model in Snowflake

In [63]:
# Register predict as a table function.
options = {"function_type": "TABLE_FUNCTION"}

# Input columns handed to predict: ID, DS, Y.
predict_inputs = [
    model_signature.FeatureSpec(name="ID", dtype=model_signature.DataType.INT64),
    model_signature.FeatureSpec(name="DS", dtype=model_signature.DataType.TIMESTAMP_NTZ),
    model_signature.FeatureSpec(name="Y", dtype=model_signature.DataType.DOUBLE),
]

# Output columns returned by predict (an ID output column is intentionally not declared).
predict_outputs = [
    model_signature.FeatureSpec(name="DSOUT", dtype=model_signature.DataType.TIMESTAMP_NTZ),
    model_signature.FeatureSpec(name="AUTOARIMA", dtype=model_signature.DataType.FLOAT),
]

predict_signature = model_signature.ModelSignature(
    inputs=predict_inputs,
    outputs=predict_outputs,
)

# Log the local model object into the registry as version v10.
mv = reg.log_model(
    ts_model,
    model_name="statsforecast",
    version_name="v10",
    conda_dependencies=["pandas", "statsforecast"],
    options=options,
    signatures={"predict": predict_signature},
)

  return next(self.gen)


In [64]:
# Fetch the registered model, then select the version to run.
# v8 is njobs=-1 and v9 is njobs=1
registered_model = reg.get_model("statsforecast")
reg_model = registered_model.version("v10")

In [79]:
## running the pandas dataframe in a distributed way for training the models
# The local pandas train_df is passed straight to the registered model version;
# partition_column="ID" names the column the rows are partitioned by.
result = reg_model.run(train_df, partition_column="ID")



In [78]:
# Show the first 20 rows of the returned forecasts.
result.head(20)

Unnamed: 0,DSOUT,AUTOARIMA,ID
0,2000-06-30,0.361505,3604
1,2000-07-01,0.284573,3604
2,2000-07-02,0.224013,3604
3,2000-07-03,0.17634,3604
4,2000-07-04,0.138813,3604
5,2000-07-05,0.109272,3604
6,2000-07-06,0.086018,3604
7,2000-08-03,1.285548,2759
8,2000-08-04,2.437689,2759
9,2000-08-05,0.0,2759


In [86]:
# Upload the pandas training frame, persist it as a table (overwriting any
# previous copy), then read it back as a Snowpark DataFrame and preview it.
test_df = session.create_dataframe(train_df)
temp_writer = test_df.write.mode('overwrite')
temp_writer.save_as_table('TPCDS_XGBOOST.DEMO.TEMPTS')
df2 = session.table('TPCDS_XGBOOST.DEMO.TEMPTS')
df2.show()

---------------------------------------------------
|"ID"  |"DS"                 |"Y"                 |
---------------------------------------------------
|0     |2000-01-01 00:00:00  |0.2680588583230185  |
|0     |2000-01-02 00:00:00  |1.0030415808874993  |
|0     |2000-01-03 00:00:00  |2.103295861925282   |
|0     |2000-01-04 00:00:00  |3.3701165125183645  |
|0     |2000-01-05 00:00:00  |4.46987820017415    |
|0     |2000-01-06 00:00:00  |5.032223851324559   |
|0     |2000-01-07 00:00:00  |6.193685581903783   |
|0     |2000-01-08 00:00:00  |0.3866412931212935  |
|0     |2000-01-09 00:00:00  |1.4981094660883165  |
|0     |2000-01-10 00:00:00  |2.2200492791605595  |
---------------------------------------------------



In [87]:
## This runs fast using a Snowflake DataFrame on the partitioned model
partitioned_forecasts = reg_model.run(
    df2,
    partition_column="ID",
    function_name="PREDICT",
)
result = partitioned_forecasts.collect()

In [88]:
# Preview the first rows only: `result` is the full collected list, and echoing
# all of it floods the notebook output.
result[:10]

[Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 5, 0, 0), AUTOARIMA=0.4318845570087433, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 6, 0, 0), AUTOARIMA=-2.115309476852417, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 7, 0, 0), AUTOARIMA=-1.1379859447479248, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 8, 0, 0), AUTOARIMA=-2.3933017253875732, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 9, 0, 0), AUTOARIMA=0.0, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 10, 0, 0), AUTOARIMA=0.0, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 4, 11, 0, 0), AUTOARIMA=0.0, ID=4090),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 5, 1, 0, 0), AUTOARIMA=4.392172336578369, ID=2797),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 5, 2, 0, 0), AUTOARIMA=3.351226806640625, ID=2797),
 Row(DS=None, Y=None, DSOUT=datetime.datetime(2001, 5, 3, 0, 0), AUTOARIMA=3.2425403594970703, ID=

In [32]:
# Definition of the Snowpark-optimized warehouse used for the larger runs.
snowpark_opt_wh = Warehouse(
    name="snowpark_opt_wh",
    warehouse_size="MEDIUM",
    warehouse_type="SNOWPARK-OPTIMIZED",
    auto_suspend=600,
)

# Create the warehouse (or alter an existing one) and switch the session to it.
warehouses = root.warehouses["snowpark_opt_wh"]
warehouses.create_or_alter(snowpark_opt_wh)
session.use_warehouse("snowpark_opt_wh")

# Session settings for the timing runs: turn off the result cache and tag the queries.
for session_statement in (
    'alter session set USE_CACHED_RESULT = FALSE',
    'alter session set query_tag = "TS-LARGE-Chase" ',
):
    session.sql(session_statement).collect()
#session.sql('alter warehouse snowpark_opt_wh set max_concurrency_level = 1').collect()

print(session.get_current_warehouse())

"SNOWPARK_OPT_WH"


In [89]:
# Number of series per timing run.
# Larger sizes to try: 10_000, 50_000, 100_000, 500_000, 1_000_000, 2_000_000
lengths = [5_000]

# NOTE: from here on train_df is the Snowpark table, not the pandas frame built earlier.
train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES2M')

for length in lengths:
    print("prepping data")
    # Pick `length` distinct series IDs and keep only their rows.
    distinct_ids = train_df.select("ID").distinct()
    unique_ids_df = distinct_ids.limit(length)
    joined_df = train_df.join(unique_ids_df, on="ID", how="inner")
    filtered_df = joined_df.cache_result()  # added cache result
    print(unique_ids_df.count())

    # Persist the subset and read it back as the input table for the run.
    filtered_df.write.mode('overwrite').save_as_table('TPCDS_XGBOOST.DEMO.TEMP_TS')
    df2 = session.table('TPCDS_XGBOOST.DEMO.TEMP_TS')

    print("starting training")
    init = time()
    # Run the registered forecasting model, partitioned by ID, and pull the results back.
    result = reg_model.run(df2, partition_column="ID").collect()
    elapsed_seconds = time() - init
    total_time = elapsed_seconds / 60
    print(f'n_series: {length} total time: {total_time} total rows: {filtered_df.count()}')

prepping data
5000
starting training
n_series: 5000 total time: 35.06032951672872 total rows: 1402829
