# Partitioned Custom Time Series Model - Nixtla AutoARIMA

This notebook shows how to partition a time series model. It uses synthetic data and Nixtla's AutoARIMA (from the `statsforecast` package). The notebook shows how to test the model locally and then run it in a distributed fashion in Snowflake. It also lets you push the datasets into a Snowflake table so that inference can be run from the Snowflake model registry.

In [1]:
import json
from snowflake.snowpark import Session
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend

# Load the Snowflake connection settings from a JSON file kept outside the notebook,
# so that no credentials are hard-coded here.
with open('../../creds.json') as f:
    data = json.load(f)
    USERNAME = data['user']
    SF_ACCOUNT = data['account']
    SF_WH = data['warehouse']
    # The passphrase is optional: an unencrypted key needs none, so a missing
    # entry is treated like an empty one instead of raising KeyError.
    passphrase = data.get('passphrase')

# Read the PEM-encoded private key bytes from the .p8 file
with open('../../rsa_key.p8', 'rb') as key_file:
    private_key = key_file.read()

# Deserialize the key. The passphrase (when one is configured) is passed as
# bytes; None tells the loader that the key is not encrypted.
private_key_obj = serialization.load_pem_private_key(
    private_key,
    password=passphrase.encode() if passphrase else None,
    backend=default_backend()
)

# Define connection parameters including the private key (key-pair
# authentication, so no password entry is needed)
CONNECTION_PARAMETERS = {
    'user': USERNAME,
    'account': SF_ACCOUNT,
    'private_key': private_key_obj,
    'warehouse': SF_WH,
}

# Create a session with the specified connection parameters
session = Session.builder.configs(CONNECTION_PARAMETERS).create()

# Late imports used by the cells further down the notebook.
from snowflake.core.warehouse import Warehouse
from snowflake.core import Root
# Root object built on top of the Snowpark session created above
# (presumably the entry point for the snowflake.core resource API - confirm).
root = Root(session)
# `col` builds the column expressions for the ID filter in the benchmark loop.
from snowflake.snowpark.functions import col 
# `time` supplies the wall-clock timestamps used to time each inference run.
from time import time


In [2]:
from datetime import timedelta

import pandas as pd

from snowflake.ml.model import custom_model
from snowflake.ml.model import model_signature
from snowflake.ml.registry import registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F

from joblib import Parallel,delayed

In [5]:
import os
import uuid

import pandas as pd
from absl.testing import absltest, parameterized
from sklearn.ensemble import RandomForestClassifier

from snowflake import snowpark
from snowflake.ml import registry
from snowflake.ml.model import custom_model
from snowflake.ml.model.model_signature import infer_signature
from snowflake.ml.utils import connection_params
from snowflake.snowpark import functions as F
#from tests.integ.snowflake.ml.test_utils import db_manager

In [3]:
# Report the environment this notebook runs against: the session context plus
# the versions of Snowflake, Snowpark for Python and Snowflake ML.
from snowflake.snowpark.version import VERSION
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

from snowflake.ml import version
mlversion = version.VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
# The Snowpark version is indexed by component (major, minor, patch).
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))
# The Snowflake ML version is a dotted string, so it is printed whole. Picking
# characters 0, 2 and 4 only works while every component is a single digit
# (e.g. "1.10.0" would come out as "1.1..").
print('Snowflake ML version        : {}'.format(mlversion))

User                        : RSHAH
Role                        : "RAJIV"
Database                    : "RAJIV"
Schema                      : "PUBLIC"
Warehouse                   : "RAJIV"
Snowflake version           : 8.35.1
Snowpark for Python version : 1.20.0
Snowflake ML version        : 1.6.1


In [29]:
# Location of the model registry and of the benchmark log table.
DATABASE_NAME = "TPCDS_XGBOOST"
SCHEMA_NAME = "DEMO"
reg = registry.Registry(session=session, database_name=DATABASE_NAME, schema_name=SCHEMA_NAME)

# Handle to the registered model version that the benchmark loop runs.
reg_model = reg.get_model("statsforecast").version("v12")

# Make the registry schema the session's current schema so that the unqualified
# RESULTS table name below resolves there. The statement is built from the
# constants above so the two can never drift apart.
session.sql(f'USE {DATABASE_NAME}.{SCHEMA_NAME}').collect()

# One row per benchmark run; created only once (IF NOT EXISTS), so re-running
# this cell keeps previously logged results.
query = """
CREATE TABLE IF NOT EXISTS RESULTS (
    test_id NUMBER,
    model VARCHAR,
    query_id VARCHAR,
    nrows VARCHAR,
    time VARCHAR,
    dataset_name VARCHAR,
    warehouse_size VARCHAR,
    warehouse_type VARCHAR
)
"""
session.sql(query).collect()

[Row(status='Table RESULTS successfully created.')]

## Train Model Locally

You want to use pandas for initial local testing

In [7]:
# Benchmark configuration.

# Identifier for this batch of benchmark runs.
TEST_ID = 5

# Warehouse sizes to benchmark.
# Currently left out: "XSMALL", "XLARGE", "XXLARGE".
WAREHOUSE_SIZES = ["SMALL", "MEDIUM", "LARGE"]

# Warehouse types to benchmark.
# Currently left out: "SNOWPARK-OPTIMIZED".
WAREHOUSE_TYPES = ["STANDARD"]

# Dataset shapes as (rows, features).
# Currently left out: 10M, 100M and 1B rows with 10 or 100 features each,
# and 10B rows with 10 features.
DATASETS = [(1_000_000, 10)]

# Number of repetitions per configuration.
REPEAT_TIMES = 1


In [32]:
# An evaluation loop to see how the model does at the different series lengths
lengths = [10_000, 50_000, 100_000, 500_000, 1_000_000, 2_000_000]
#lengths = [1_000]
TEST_ID = 1
train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES2M')
print(session.get_current_warehouse())
model_name = "statsforecast"
dataset_name = "SERIES2M"

WAREHOUSE_SIZES = [
    # "XSMALL",
   # "SMALL",
   # "MEDIUM",
    "LARGE",
    #"XLARGE",
    "XXLARGE",
    "XXXLARGE"

]
for whouse in WAREHOUSE_SIZES:  
  print ("build warehouse ", whouse)
  wh_size = whouse
  wh_type = "SNOWPARK-OPTIMIZED"
  wh_type = "STANDARD"

  session.sql(
  f"""CREATE OR REPLACE WAREHOUSE {"snowpark_opt_wh"}
          WITH
              WAREHOUSE_SIZE= '{wh_size}'
              WAREHOUSE_TYPE = '{wh_type}'
              AUTO_SUSPEND = 60
              AUTO_RESUME = TRUE
              INITIALLY_SUSPENDED = FALSE
              MAX_CONCURRENCY_LEVEL = 1
              MIN_CLUSTER_COUNT = 1
              MAX_CLUSTER_COUNT = 1
  """
  ).collect()

  for length in lengths:
    print ("prepping data for ", length)
    print(session.get_current_warehouse())
    df2 = train_df.filter((col("ID") >= 0) & (col("ID") <= (length-1)))
    print ("starting training")
    init = time()
    # Run the regression model
    result = reg_model.run(df2, partition_column="ID").collect()
    total_time = (time() - init) / 60
    print(f'n_series: {length} total time: {total_time} total rows: {df2.count()}')
    query_id = session.sql("SELECT LAST_QUERY_ID()").collect()[0].as_dict()["LAST_QUERY_ID()"]
    query = f"""
    INSERT INTO RESULTS VALUES (
      '{TEST_ID}',
      '{model_name}',
      '{query_id}',
      '{length}',
      '{total_time}',
      '{dataset_name}',
      '{wh_size}',
      '{wh_type}'
      )
    """
    session.sql(query).collect()

"SNOWPARK_OPT_WH"
build warehouse  LARGE
prepping data for  10000
"SNOWPARK_OPT_WH"
starting training
n_series: 10000 total time: 2.7434086481730144 total rows: 2750459
prepping data for  50000
"SNOWPARK_OPT_WH"
starting training
n_series: 50000 total time: 9.841282288233439 total rows: 13776842
prepping data for  100000
"SNOWPARK_OPT_WH"
starting training
n_series: 100000 total time: 18.440611879030865 total rows: 27514605
prepping data for  500000
"SNOWPARK_OPT_WH"
starting training
n_series: 500000 total time: 88.59080203771592 total rows: 137488809
prepping data for  1000000
"SNOWPARK_OPT_WH"
starting training
n_series: 1000000 total time: 175.46309643189113 total rows: 274926721
prepping data for  2000000
"SNOWPARK_OPT_WH"
starting training
n_series: 2000000 total time: 349.8397043506304 total rows: 549884998
build warehouse  XXLARGE
prepping data for  10000
"SNOWPARK_OPT_WH"
starting training
n_series: 10000 total time: 1.3042607665061952 total rows: 2750459
prepping data for  50

In [25]:
# Read back the benchmark log. The table name is unqualified, so it is resolved
# against the session's current database and schema.
results = session.table('RESULTS')
# Print the logged runs.
results.show()

---------------------------------------------------------------------------------------------------------------------------------------
|"TEST_ID"  |"MODEL"        |"QUERY_ID"                            |"NROWS"  |"DATASET_NAME"  |"WAREHOUSE_SIZE"  |"WAREHOUSE_TYPE"    |
---------------------------------------------------------------------------------------------------------------------------------------
|0          |statsforecast  |01b73400-0003-03dc-0023-fc8702e20cea  |1000     |SERIES2M        |MEDIUM            |SNOWPARK-OPTIMIZED  |
---------------------------------------------------------------------------------------------------------------------------------------

