# Steps in this Notebook

1. Imports
2. Snowflake Setup
3. Local testing with MLForecast
4. Snowflake testing with MLForecast using UDTF

# Imports

In [None]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType, FloatType, StringType,StructType, StructField

import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.preprocessing import KBinsDiscretizer, OrdinalEncoder, OneHotEncoder
from snowflake.ml.modeling.impute import SimpleImputer

import json
import os
import pandas as pd
from datetime import date, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the Snowflake connection parameters from a JSON credentials file and open a session.
# The file location can be overridden with the SNOWFLAKE_CREDS_PATH environment variable;
# the previous hard-coded path is kept as the fallback so existing setups keep working.
creds_path = os.environ.get('SNOWFLAKE_CREDS_PATH', '/Users/skhara/Documents/Code/creds.json')
# Use a context manager so the file handle is closed (json.load(open(...)) left it open).
with open(creds_path) as creds_file:
    connection_parameters = json.load(creds_file)
session = Session.builder.configs(connection_parameters).create()

In [None]:
# Show the kernel's current working directory.
import os
os.getcwd()

# Snowflake Setup: Create a Database and Schema

We will be using the PUBLIC schema.

In [None]:
# Reference the TIME_SERIES_1K table through the Snowpark session; the exploration and
# local-testing cells below read from this DataFrame.
sdf_raw = session.table('TIME_SERIES_1K')

In [None]:
# Count the distinct SERIES_ID values in TIME_SERIES_1K, i.e. how many individual
# time series there are to forecast.
session.sql('SELECT COUNT(DISTINCT SERIES_ID) FROM TIME_SERIES_1K').collect()

In [None]:
# For every SERIES_ID, list the earliest (MIN) and latest (MAX) DATE present in the table.
session.sql('SELECT SERIES_ID, MIN(DATE), MAX(DATE) FROM TIME_SERIES_1K GROUP BY SERIES_ID').collect()

In [None]:
# Preview five rows of the raw table as a pandas DataFrame.
sdf_raw.limit(5).to_pandas()

In [None]:
# Summary statistics of the raw table, computed through the Snowpark DataFrame's describe().
sdf_raw.describe().collect()

# Local Testing

In [None]:
# Pull a single series (SERIES_ID 62) into pandas for local experimentation.
df_data = sdf_raw.filter(F.col("SERIES_ID") == 62).to_pandas()

# Here onwards copy paste in UDTF
df_data['DATE'] = pd.to_datetime(df_data['DATE'])
# Collapse duplicate (DATE, SERIES_ID) rows by summing VALUE. The previous statement computed
# an aggregate but threw the result away, and sum('VALUE') passed 'VALUE' as the
# numeric_only flag rather than selecting the column.
df_data = df_data.groupby(['DATE', 'SERIES_ID'], as_index=False)['VALUE'].sum()
# Time-series models below expect chronological order.
df_data = df_data.sort_values(by=['DATE']).reset_index(drop=True)

In [None]:
# Inspect the first rows of the prepared single-series frame.
df_data.head()

In [None]:
# Line plot of VALUE against DATE for the selected series.
df_data.set_index('DATE')['VALUE'].plot()

### Testing with Nixtla MLForecast

In [None]:
# Build the frame used by MLForecast in the cells below: ds (timestamp), unique_id
# (series key) and y (target). Columns are selected and renamed by name instead of by
# position, so the mapping cannot silently mislabel columns if the source table's column
# order differs or extra columns are present.
df_mlf = (
    df_data[['DATE', 'SERIES_ID', 'VALUE']]
    .rename(columns={'DATE': 'ds', 'SERIES_ID': 'unique_id', 'VALUE': 'y'})
)
df_mlf.tail(5)

In [None]:
from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Display a default-configured XGBRegressor (the cell output is the estimator's repr).
XGBRegressor()

In [None]:
# Forecast horizon: the number of trailing rows held out of the fit below and the number
# of steps predicted afterwards.
fh = 20

In [None]:
# Configure MLForecast with a linear regression and an XGBoost regressor, lag features at
# 1, 7, 28 and 60 steps, and a first-difference transform on the target.
fcst = MLForecast(
    models=[LinearRegression(),XGBRegressor()],
    freq='D',  # 'D' = daily frequency
    lags=[1,7,28,60],
    target_transforms=[Differences([1])],
)
# Fit on everything except the last `fh` rows, which are held back so the forecast can be
# compared with actual values.
fcst.fit(df_mlf.iloc[0:-fh])

In [None]:
# Forecast the next `fh` steps with every fitted model and display the result.
preds = fcst.predict(fh)
preds

In [None]:
# Attach the observed values (y) to the forecasts by joining on timestamp and series id,
# then drop the series id column, which the comparison plot does not use.
df_res = (
    preds
    .merge(df_mlf, how='left', on=['ds', 'unique_id'])
    .drop(columns='unique_id')
)

In [None]:
# Compare both model forecasts with the actual values over the forecast window.
df = df_res.set_index('ds')

plt.figure(figsize=(10, 6))

# One line per series: (column in df, legend label).
for column_name, legend_label in (
    ('LinearRegression', 'Linear Regression'),
    ('XGBRegressor', 'XGB Regressor'),
    ('y', 'Actual Values'),
):
    plt.plot(df.index, df[column_name], label=legend_label)

# Title, axis labels and legend
plt.title('Time Series Plot')
plt.xlabel('Date')
plt.ylabel('Values')
plt.legend()

# Display the plot
plt.show()


### Testing with Sktime Locally

In [None]:
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.theta import ThetaForecaster
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error

In [None]:
# Build the univariate series used by sktime: VALUE indexed by date.
# df_data keeps its timestamps in the 'DATE' column (it has no 'TIMESTAMP' column, so the
# previous lookup raised a KeyError).
df_input = pd.Series(df_data['VALUE'].values, index=df_data['DATE'])
df_input

In [None]:
# Put the series on a regular daily grid. resample('D').asfreq() inserts any missing days
# (as NaN) and returns an index whose frequency is 'D'. Assigning index.freq = 'D' first
# raised a ValueError whenever the data had gaps, i.e. exactly when resampling was needed.
df_input = df_input.resample('D').asfreq()

In [None]:
# Split the series chronologically into train and test parts, fit a Theta forecaster on
# the training part and predict the test timestamps.
y_train, y_test = temporal_train_test_split(df_input)
# Absolute forecasting horizon made of the test timestamps. Note: this rebinds `fh`,
# which earlier in the notebook held the integer horizon.
fh = ForecastingHorizon(y_test.index, is_relative=False)
# NOTE(review): sp=12 is a monthly seasonal period, but the series was resampled to daily
# frequency above — confirm the intended seasonality.
forecaster = ThetaForecaster(sp=12)  # monthly seasonal periodicity
forecaster.fit(y_train)
y_pred = forecaster.predict(fh)
y_pred

In [None]:
# Re-generate the forecast and score it against the held-out test values with MAPE.
y_pred = forecaster.predict(fh)
mean_absolute_percentage_error(y_test, y_pred)

### Testing with Darts Locally

In [None]:
# !pip install darts

In [None]:
import darts
from darts import TimeSeries
from darts.models import FFT
from darts.metrics import mae, mape

In [None]:
# Train + Forecast Length
train_length = 600      # days of history used for fitting
forecast_horizon = 30   # days held out at the end of the series and then forecast

# df_data keeps its timestamps in the 'DATE' column (it has no 'TIMESTAMP' column).
# The window bounds are derived from the constants above instead of repeating 30 / 600.
train_end = df_data['DATE'].max() - pd.Timedelta(days=forecast_horizon)
train_start = train_end - pd.Timedelta(days=train_length)

in_window = (df_data['DATE'] >= train_start) & (df_data['DATE'] < train_end)
# Keep only the target column so Darts builds a univariate series; SERIES_ID must not
# become a second component of the series.
df_input = df_data.loc[in_window, ['DATE', 'VALUE']].set_index('DATE')
df_input.index.name = 'time'

# Convert to a Darts TimeSeries on a daily grid, filling in any missing dates.
ts_train = TimeSeries.from_dataframe(df_input, fill_missing_dates=True, freq='D')
ts_train

In [None]:
# Fit a Darts XGBModel (lags=10, 100 trees of depth 5) on the training window.
from darts.models import XGBModel
my_model = XGBModel(lags = 10, n_estimators=100, max_depth=5)
my_model.fit(ts_train)

In [None]:
# Forecast the held-out horizon and reshape the result into a two-column frame.
ts_forecast = my_model.predict(forecast_horizon)
# Build the frame directly from the Darts output. Round-tripping through `.values` turned
# both columns into object dtype, which breaks datetime joins and numeric operations on
# the forecast later on.
df_forecasted = ts_forecast.pd_dataframe().reset_index()
df_forecasted.columns = ['TIMESTAMP', 'FORECAST']
df_forecasted

In [None]:
# Join the forecasts onto the observed data. df_data's date column is 'DATE' while the
# forecast frame's is 'TIMESTAMP'; the forecast columns are coerced to datetime / numeric
# so the join keys have matching dtypes and the values can be plotted.
forecast_typed = df_forecasted.assign(
    TIMESTAMP=pd.to_datetime(df_forecasted['TIMESTAMP']),
    FORECAST=pd.to_numeric(df_forecasted['FORECAST']),
)
df_compare = (
    df_data
    .merge(forecast_typed, how='left', left_on='DATE', right_on='TIMESTAMP')
    .set_index('DATE')[['VALUE', 'FORECAST']]
)
# Plot actuals against the forecast for the last 90 observations.
df_compare.iloc[-90:].plot()

In [None]:
# Mean absolute percentage error over the rows that have both an actual and a forecast.
df_res = df_compare.dropna(subset=['VALUE', 'FORECAST'])
actual = df_res['VALUE'].astype(float)
predicted = df_res['FORECAST'].astype(float)
# The percentage error is undefined where the actual value is 0, so skip those rows.
nonzero = actual != 0
# Take the absolute error: without abs() over- and under-forecasts cancel each other and
# the printed figure is a signed mean percentage error rather than MAPE.
df_res = ((actual[nonzero] - predicted[nonzero]) / actual[nonzero]).abs() * 100
print('MAPE= ',df_res.mean())

# Creating UDTF for multi-node parallelized model training

In [None]:
# Output schema of the UDTF: one row per (series, forecast date) with both model forecasts
# plus the training-window metadata.
schema = T.StructType([
    T.StructField("ID", T.IntegerType()),
    T.StructField("TIMESTAMP", T.DateType()),
    T.StructField("LINREG", T.FloatType()),
    T.StructField("XGB", T.FloatType()),
    T.StructField("TRAIN_START", T.DateType()),
    T.StructField("TRAIN_END", T.DateType()),
    T.StructField("FORECAST_HORIZON", T.IntegerType())
                  ])

@F.udtf(output_schema = schema,
        input_types = [T.VariantType()],
        name = "TSF_MLFORECAST", is_permanent=True, stage_location= "@DEMO_DB.PUBLIC.ML_MODELS", session=session,
        packages=['pandas', 'mlforecast' ,'xgboost', 'scikit-learn'],
        replace=True
       )
class forecast:
    """UDTF handler that trains MLForecast models on one partition and emits forecasts.

    ``process`` only buffers the incoming rows; all modelling happens in
    ``end_partition`` once the whole partition has been received. Each input row is a
    VARIANT holding the keys DATE, SERIES_ID and VALUE.
    """

    def __init__(self):
        self.rows=[]   # rows received since the last flush into a DataFrame
        self.dfs=[]    # DataFrame chunks built from flushed rows

    def process(self, data):
        """Buffer one input row; no output rows are produced per input row."""
        self.rows.append(data)

        # Merge rows into a dataframe
        if len(self.rows) >= 16000:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Merge dataframes into a single dataframe
        # Minimizes memory footprint
        if len(self.dfs) >= 100:
            self.dfs = [pd.concat(self.dfs)]

        yield None

    def end_partition(self):
        """Fit LinearRegression and XGBRegressor on the partition and yield the forecast rows.

        Yields tuples in output-schema order:
        (ID, TIMESTAMP, LINREG, XGB, TRAIN_START, TRAIN_END, FORECAST_HORIZON).
        """
        from mlforecast import MLForecast
        from mlforecast.target_transforms import Differences
        from xgboost import XGBRegressor
        from sklearn.linear_model import LinearRegression

        # Merge any remaining rows
        if self.rows:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Nothing was buffered for this partition: emit no rows instead of failing in concat.
        if not self.dfs:
            return

        # Process Input
        df_input = pd.concat(self.dfs)
        df_input['DATE'] = pd.to_datetime(df_input['DATE'])
        # Collapse duplicate (DATE, SERIES_ID) rows by summing VALUE. The result is assigned
        # (the earlier code discarded it) and the rows are put in chronological order.
        df_input = (
            df_input
            .groupby(['DATE', 'SERIES_ID'], as_index=False)['VALUE'].sum()
            .sort_values(['SERIES_ID', 'DATE'])
        )
        df_input = df_input[['DATE','SERIES_ID','VALUE']]
        df_input.columns = ['ds','unique_id','y']

        #Train + Forecast Length
        train_length = 600  # days of history used for fitting
        fh = 30 # Forecast Horizon
        train_end = df_input['ds'].max()
        train_start = train_end - pd.Timedelta(days = train_length)

        # NOTE(review): the strict '<' leaves the most recent observation (ds == train_end)
        # out of the training window — confirm this is intended.
        df_input = df_input.loc[(df_input['ds'] >= train_start) &
                                (df_input['ds'] < train_end)].reset_index(drop=True)

        fcst = MLForecast(models=[LinearRegression(),XGBRegressor()],
                          freq='D',
                          lags=[1,7,28,60],
                          target_transforms=[Differences([1])])

        # Fit on this partition's window. The earlier code fitted on `df_mlf`, a variable
        # from the local-testing cells that is not defined inside this handler (and would
        # be a different series in any case).
        fcst.fit(df_input)

        ts_forecast = fcst.predict(fh)

        # Processing: order the columns explicitly so the tuples line up with the schema.
        ts_forecast = ts_forecast[['unique_id', 'ds', 'LinearRegression', 'XGBRegressor']]
        ts_forecast.columns = ['ID','TIMESTAMP','LINREG','XGB']
        ts_forecast['TRAIN_START'] = train_start
        ts_forecast['TRAIN_END'] = train_end
        ts_forecast['FORECAST_HORIZON'] = fh

        yield from ts_forecast.itertuples(index=False, name=None)

In [None]:
# Preview five rows of the raw input table as a reminder of its layout.
sdf_raw.limit(5).to_pandas()

In [None]:
# Pack every row of the source table into a single object column (ROW), keeping SERIES_ID
# alongside it as the partition key.
# NOTE(review): this reads 'TIME_SERIES' while the exploration cells use 'TIME_SERIES_1K' —
# confirm which table is intended.
df = session.table('TIME_SERIES') \
        .with_column('ROW', F.object_construct_keep_null('*')) \
        .select(F.col('SERIES_ID'), F.col('ROW'))

# Handle to the registered UDTF, looked up by its registered name.
store_forecast_test = F.table_function("TSF_MLFORECAST")

# The UDTF takes one VARIANT argument: the packed row.
variant_column = F.parse_json(df.col('ROW').cast(T.VariantType()))

# Run the UDTF once per SERIES_ID partition. Note that this rebinds the name `forecast`,
# which previously referred to the UDTF handler class.
forecast = df.select(
                F.col('SERIES_ID'), 
                store_forecast_test(variant_column).over(partition_by=['SERIES_ID'])
                )

# Stamp the rows with the run time and append them to the results table
# (mode="append": re-running this cell adds another batch of rows).
forecast = forecast.with_column('FORECAST_DATETIME', F.current_timestamp())
forecast.write.save_as_table("DEMO_DB.PUBLIC.FORECAST_USING_MLFORECAST", mode="append")

In [None]:
# Read back a few rows from the results table to check what was written.
sdf_ref = session.table('DEMO_DB.PUBLIC.FORECAST_USING_MLFORECAST')
sdf_ref.limit(5).to_pandas()

### Upload library to Snowflake Stage
We are uploading to a stage as this library is not available through the Snowflake Anaconda Channel.

In [None]:
# Print the directory of the locally installed nfoursid package.
import nfoursid
nfoursid_path = nfoursid.__path__[0]
print(nfoursid_path)

In [None]:
# Print the directory of the locally installed darts package.
import darts
darts_path = darts.__path__[0]
print(darts_path)

In [None]:
# Upload the zipped nfoursid package to the ML_MODELS stage (uncompressed, overwriting any
# previous upload). The archive is expected next to the installed package, i.e.
# <site-packages>/nfoursid.zip; deriving the path from the package location replaces the
# machine-specific absolute path.
zip_file_path = os.path.join(os.path.dirname(nfoursid.__path__[0]), "nfoursid.zip")
session.file.put(zip_file_path, "@ML_MODELS", auto_compress=False, overwrite=True)

In [None]:
# Upload the zipped darts package to the ML_MODELS stage (uncompressed, overwriting any
# previous upload). The archive is expected next to the installed package, i.e.
# <site-packages>/darts.zip; deriving the path from the package location replaces the
# machine-specific absolute path.
zip_file_path = os.path.join(os.path.dirname(darts.__path__[0]), "darts.zip")
session.file.put(zip_file_path, "@ML_MODELS", auto_compress=False, overwrite=True)

In [None]:
# Output schema of the UDTF: one row per forecast date plus training-window metadata.
# NOTE(review): FORECAST is declared as an integer while TSF_MLFORECAST declares its
# forecast columns as floats — confirm whether FloatType is intended here.
schema = T.StructType([
    T.StructField("TIMESTAMP", T.DateType()),
    T.StructField("FORECAST", T.IntegerType()),
    T.StructField("TRAIN_START", T.DateType()),
    T.StructField("TRAIN_END", T.DateType()),
    T.StructField("FORECAST_HORIZON", T.IntegerType())
])

@F.udtf(output_schema = schema,
        input_types = [T.VariantType()],
        name = "PUMP_TS_DARTS_RANDOMFOREST", is_permanent=True, stage_location="@ML_MODELS", session=session,
        packages=['pandas', 'fsspec==2023.4.0','holidays==0.18',
                  'joblib==1.2.0','lightning-utilities==0.7.1','matplotlib==3.7.1',
                  'plotly==5.9.0','pmdarima==2.0.3','pytorch==2.0.1',
                  'pytorch-lightning==2.0.3','pyyaml==6.0','scikit-learn==1.2.2',
                  'scipy==1.10.1','snowflake-snowpark-python==1.4.0','statsmodels',
                  'tbats==1.1.3','torchmetrics==0.11.4','tqdm','xarray'
                 ], replace=True,
        imports = ["@ML_MODELS/nfoursid.zip", "@ML_MODELS/darts.zip"])
class forecast:
    """UDTF handler that fits a Darts RandomForest on one partition and emits forecasts.

    ``process`` only buffers the incoming rows; all modelling happens in
    ``end_partition``. Each input row is a VARIANT holding the keys TIMESTAMP and VALUE.
    """

    def __init__(self):
        self.rows=[]   # rows received since the last flush into a DataFrame
        self.dfs=[]    # DataFrame chunks built from flushed rows

    def process(self, data):
        """Buffer one input row; no output rows are produced per input row."""
        self.rows.append(data)

        # Merge rows into a dataframe
        if len(self.rows) >= 16000:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Merge dataframes into a single dataframe
        # Minimizes memory footprint
        if len(self.dfs) >= 100:
            self.dfs = [pd.concat(self.dfs)]

        yield None

    def end_partition(self):
        """Train on the buffered partition and yield the forecast rows.

        Yields tuples in output-schema order:
        (TIMESTAMP, FORECAST, TRAIN_START, TRAIN_END, FORECAST_HORIZON).
        """
        from darts import TimeSeries
        from darts.models import RandomForest

        # Merge any remaining rows
        if self.rows:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Nothing was buffered for this partition: emit no rows instead of failing in concat.
        if not self.dfs:
            return

        # Process Input
        df_input = pd.concat(self.dfs)
        df_input['TIMESTAMP'] = pd.to_datetime(df_input['TIMESTAMP'])
        # One row per timestamp, summing VALUE. The column is selected explicitly;
        # sum('VALUE') passed 'VALUE' as the numeric_only flag rather than a column name.
        df_input = df_input.groupby('TIMESTAMP', as_index=False)['VALUE'].sum()

        #Train + Forecast Length
        train_length = 600      # days of history used for fitting
        forecast_horizon = 30   # number of daily steps to forecast
        train_end = df_input['TIMESTAMP'].max()
        train_start = train_end - pd.Timedelta(days = train_length)

        # NOTE(review): the strict '<' leaves the most recent observation out of the
        # training window — confirm this is intended.
        df_input = df_input.loc[(df_input['TIMESTAMP'] >= train_start) &
                                (df_input['TIMESTAMP'] < train_end)].reset_index(drop=True)
        df_input = df_input.set_index('TIMESTAMP')
        df_input.index.name = 'time'

        # Convert DataFrame to Darts TS Object
        ts_train = TimeSeries.from_dataframe(df_input, fill_missing_dates=True, freq='D')

        # Initialize Model
        my_model = RandomForest(lags = 10, n_estimators=100, max_depth=5)

        # Fit Model and Predict
        my_model.fit(ts_train)
        ts_forecast = my_model.predict(forecast_horizon)

        # Processing: rows are emitted positionally, so the second column fills FORECAST.
        data = ts_forecast.pd_dataframe().reset_index().values
        df_forecast = pd.DataFrame(data, columns = ['TIMESTAMP','VALUE'])
        df_forecast['TRAIN_START'] = train_start
        df_forecast['TRAIN_END'] = train_end
        df_forecast['FORECAST_HORIZON'] = forecast_horizon

        yield from df_forecast.itertuples(index=False, name=None)

In [None]:
# Pack every row of the source table into a single object column (ROW), keeping the
# partition keys alongside it. GENDER has to be part of this projection because it is
# selected and used as a partition key below.
df = session.table('TIME_SERIES_DATA') \
        .with_column('ROW', F.object_construct_keep_null('*')) \
        .select(F.col('PUMP_ID'), F.col('GENDER'), F.col('ROW'))

# Handle to the registered UDTF, looked up by its registered name.
store_forecast_test = F.table_function("PUMP_TS_DARTS_RANDOMFOREST")

# The UDTF takes one VARIANT argument: the packed row.
variant_column = F.parse_json(df.col('ROW').cast(T.VariantType()))

# Run the UDTF once per (PUMP_ID, GENDER) partition.
# (The comma after F.col('GENDER') was missing, which made this cell a SyntaxError.)
forecast = df.select(
                F.col('PUMP_ID'),
                F.col('GENDER'),
                store_forecast_test(variant_column).over(partition_by=['PUMP_ID','GENDER'])
                )

# Tag the rows with the model name and run time, then replace the results table.
forecast = forecast.with_column('MODEL', F.lit('RANDOMFOREST'))
forecast = forecast.with_column('FORECAST_DATETIME', F.current_timestamp())
forecast.write.save_as_table("FORECAST_USING_DARTS_RANDOMFOREST", mode="overwrite")

In [None]:
# Read back a few rows from the RandomForest results table to check what was written.
df_temp = session.table('FORECAST_USING_DARTS_RANDOMFOREST')
df_temp.limit(5).to_pandas()

In [None]:
# Output schema of the UDTF: one row per forecast date plus training-window metadata.
# NOTE(review): FORECAST is declared as an integer while TSF_MLFORECAST declares its
# forecast columns as floats — confirm whether FloatType is intended here.
schema = T.StructType([
    T.StructField("TIMESTAMP", T.DateType()),
    T.StructField("FORECAST", T.IntegerType()),
    T.StructField("TRAIN_START", T.DateType()),
    T.StructField("TRAIN_END", T.DateType()),
    T.StructField("FORECAST_HORIZON", T.IntegerType())
])

@F.udtf(output_schema = schema,
        input_types = [T.VariantType()],
        name = "PUMP_TS_DARTS_FFT", is_permanent=True, stage_location="@ML_MODELS", session=session,
        packages=['pandas', 'fsspec==2023.4.0','holidays==0.18',
                  'joblib==1.2.0','lightning-utilities==0.7.1','matplotlib==3.7.1',
                  'plotly==5.9.0','pmdarima==2.0.3','pytorch==2.0.1',
                  'pytorch-lightning==2.0.3','pyyaml==6.0','scikit-learn==1.2.2',
                  'scipy==1.10.1','snowflake-snowpark-python==1.4.0','statsmodels',
                  'tbats==1.1.3','torchmetrics==0.11.4','tqdm','xarray'
                 ], replace=True,
        imports = ["@ML_MODELS/nfoursid.zip", "@ML_MODELS/darts.zip"])
class forecast:
    """UDTF handler that fits a Darts FFT model on one partition and emits forecasts.

    ``process`` only buffers the incoming rows; all modelling happens in
    ``end_partition``. Each input row is a VARIANT holding the keys TIMESTAMP and VALUE.
    """

    def __init__(self):
        self.rows=[]   # rows received since the last flush into a DataFrame
        self.dfs=[]    # DataFrame chunks built from flushed rows

    def process(self, data):
        """Buffer one input row; no output rows are produced per input row."""
        self.rows.append(data)

        # Merge rows into a dataframe
        if len(self.rows) >= 16000:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Merge dataframes into a single dataframe
        # Minimizes memory footprint
        if len(self.dfs) >= 100:
            self.dfs = [pd.concat(self.dfs)]

        yield None

    def end_partition(self):
        """Train on the buffered partition and yield the forecast rows.

        Yields tuples in output-schema order:
        (TIMESTAMP, FORECAST, TRAIN_START, TRAIN_END, FORECAST_HORIZON).
        """
        from darts import TimeSeries
        from darts.models import FFT

        # Merge any remaining rows
        if self.rows:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Nothing was buffered for this partition: emit no rows instead of failing in concat.
        if not self.dfs:
            return

        # Process Input
        df_input = pd.concat(self.dfs)
        df_input['TIMESTAMP'] = pd.to_datetime(df_input['TIMESTAMP'])
        # One row per timestamp, summing VALUE. The earlier code computed this aggregate but
        # discarded the result, so duplicate timestamps were never actually collapsed.
        df_input = df_input.groupby('TIMESTAMP', as_index=False)['VALUE'].sum()

        #Train + Forecast Length
        train_length = 600      # days of history used for fitting
        forecast_horizon = 30   # number of daily steps to forecast
        train_end = df_input['TIMESTAMP'].max()
        train_start = train_end - pd.Timedelta(days = train_length)

        # NOTE(review): the strict '<' leaves the most recent observation out of the
        # training window — confirm this is intended.
        df_input = df_input.loc[(df_input['TIMESTAMP'] >= train_start) &
                                (df_input['TIMESTAMP'] < train_end)].reset_index(drop=True)
        df_input = df_input.set_index('TIMESTAMP')
        df_input.index.name = 'time'

        # Convert DataFrame to Darts TS Object
        ts_train = TimeSeries.from_dataframe(df_input, fill_missing_dates=True, freq='D')

        # Initialize Model
        FFT_model = FFT(nr_freqs_to_keep=400,trend=None)

        # Fit Model and Predict
        FFT_model.fit(ts_train)
        ts_forecast = FFT_model.predict(forecast_horizon)

        # Processing: rows are emitted positionally, so the second column fills FORECAST.
        data = ts_forecast.pd_dataframe().reset_index().values
        df_forecast = pd.DataFrame(data, columns = ['TIMESTAMP','VALUE'])
        df_forecast['TRAIN_START'] = train_start
        df_forecast['TRAIN_END'] = train_end
        df_forecast['FORECAST_HORIZON'] = forecast_horizon

        yield from df_forecast.itertuples(index=False, name=None)

In [None]:
# Pack every row of the source table into a single object column (ROW), keeping PUMP_ID
# alongside it as the partition key.
df = session.table('TIME_SERIES_DATA') \
        .with_column('ROW', F.object_construct_keep_null('*')) \
        .select(F.col('PUMP_ID'), F.col('ROW'))

# Handle to the registered UDTF, looked up by its registered name.
store_forecast_test = F.table_function("PUMP_TS_DARTS_FFT")

# The UDTF takes one VARIANT argument: the packed row.
variant_column = F.parse_json(df.col('ROW').cast(T.VariantType()))

# Run the UDTF once per PUMP_ID partition. Note that this rebinds the name `forecast`,
# which previously referred to the UDTF handler class.
forecast = df.select(
                F.col('PUMP_ID'), 
                store_forecast_test(variant_column).over(partition_by=['PUMP_ID'])
                )

# Tag the rows with the model name and run time, then replace the results table.
forecast = forecast.with_column('MODEL', F.lit('FFT'))
forecast = forecast.with_column('FORECAST_DATETIME', F.current_timestamp())
forecast.write.save_as_table("FORECAST_USING_DARTS_FFT", mode="overwrite")

In [None]:
# Read back a few rows from the FFT results table to check what was written.
df_temp = session.table('FORECAST_USING_DARTS_FFT')
df_temp.limit(5).to_pandas()

## - Darts with XGBoost

In [None]:
# Output schema of the UDTF: one row per forecast date plus training-window metadata.
# NOTE(review): FORECAST is declared as an integer while TSF_MLFORECAST declares its
# forecast columns as floats — confirm whether FloatType is intended here.
schema = T.StructType([
    T.StructField("TIMESTAMP", T.DateType()),
    T.StructField("FORECAST", T.IntegerType()),
    T.StructField("TRAIN_START", T.DateType()),
    T.StructField("TRAIN_END", T.DateType()),
    T.StructField("FORECAST_HORIZON", T.IntegerType())
])

@F.udtf(output_schema = schema,
        input_types = [T.VariantType()],
        name = "PUMP_TS_DARTS_XGB", is_permanent=True, stage_location="@ML_MODELS", session=session,
        packages=['pandas', 'fsspec==2023.4.0','holidays==0.18',
                  'joblib==1.2.0','lightning-utilities==0.7.1','matplotlib==3.7.1',
                  'plotly==5.9.0','pmdarima==2.0.3','pytorch==2.0.1',
                  'pytorch-lightning==2.0.3','pyyaml==6.0','scikit-learn==1.2.2',
                  'scipy==1.10.1','snowflake-snowpark-python==1.4.0','statsmodels',
                  'tbats==1.1.3','torchmetrics==0.11.4','tqdm','xarray','xgboost'
                 ], replace=True,
        imports = ["@ML_MODELS/nfoursid.zip", "@ML_MODELS/darts.zip"])
class forecast:
    """UDTF handler that fits a Darts XGBModel on one partition and emits forecasts.

    ``process`` only buffers the incoming rows; all modelling happens in
    ``end_partition``. Each input row is a VARIANT holding the keys TIMESTAMP and VALUE.
    """

    def __init__(self):
        self.rows=[]   # rows received since the last flush into a DataFrame
        self.dfs=[]    # DataFrame chunks built from flushed rows

    def process(self, data):
        """Buffer one input row; no output rows are produced per input row."""
        self.rows.append(data)

        # Merge rows into a dataframe
        if len(self.rows) >= 16000:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Merge dataframes into a single dataframe
        # Minimizes memory footprint
        if len(self.dfs) >= 100:
            self.dfs = [pd.concat(self.dfs)]

        yield None

    def end_partition(self):
        """Train on the buffered partition and yield the forecast rows.

        Yields tuples in output-schema order:
        (TIMESTAMP, FORECAST, TRAIN_START, TRAIN_END, FORECAST_HORIZON).
        """
        from darts import TimeSeries
        import xgboost
        from darts.models import XGBModel

        # Merge any remaining rows
        if self.rows:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Nothing was buffered for this partition: emit no rows instead of failing in concat.
        if not self.dfs:
            return

        # Process Input
        df_input = pd.concat(self.dfs)
        df_input['TIMESTAMP'] = pd.to_datetime(df_input['TIMESTAMP'])
        # One row per timestamp, summing VALUE. The earlier code computed this aggregate but
        # discarded the result, so duplicate timestamps were never actually collapsed.
        df_input = df_input.groupby('TIMESTAMP', as_index=False)['VALUE'].sum()

        #Train + Forecast Length
        train_length = 600      # days of history used for fitting
        forecast_horizon = 30   # number of daily steps to forecast
        train_end = df_input['TIMESTAMP'].max()
        train_start = train_end - pd.Timedelta(days = train_length)

        # NOTE(review): the strict '<' leaves the most recent observation out of the
        # training window — confirm this is intended.
        df_input = df_input.loc[(df_input['TIMESTAMP'] >= train_start) &
                                (df_input['TIMESTAMP'] < train_end)].reset_index(drop=True)
        df_input = df_input.set_index('TIMESTAMP')
        df_input.index.name = 'time'

        # Convert DataFrame to Darts TS Object
        ts_train = TimeSeries.from_dataframe(df_input, fill_missing_dates=True, freq='D')

        # Initialize Model
        my_model = XGBModel(lags = 10, n_estimators=100, max_depth=5)

        # Fit Model and Predict
        my_model.fit(ts_train)
        ts_forecast = my_model.predict(forecast_horizon)

        # Processing: rows are emitted positionally, so the second column fills FORECAST.
        data = ts_forecast.pd_dataframe().reset_index().values
        df_forecast = pd.DataFrame(data, columns = ['TIMESTAMP','VALUE'])
        df_forecast['TRAIN_START'] = train_start
        df_forecast['TRAIN_END'] = train_end
        df_forecast['FORECAST_HORIZON'] = forecast_horizon

        yield from df_forecast.itertuples(index=False, name=None)

In [None]:
# Pack every row of the source table into a single object column (ROW), keeping PUMP_ID
# alongside it as the partition key.
df = session.table('TIME_SERIES_DATA') \
        .with_column('ROW', F.object_construct_keep_null('*')) \
        .select(F.col('PUMP_ID'), F.col('ROW'))

# Handle to the registered UDTF, looked up by its registered name.
store_forecast_test = F.table_function("PUMP_TS_DARTS_XGB")

# The UDTF takes one VARIANT argument: the packed row.
variant_column = F.parse_json(df.col('ROW').cast(T.VariantType()))

# Run the UDTF once per PUMP_ID partition. Note that this rebinds the name `forecast`,
# which previously referred to the UDTF handler class.
forecast = df.select(
                F.col('PUMP_ID'), 
                store_forecast_test(variant_column).over(partition_by=['PUMP_ID'])
                )

# Tag the rows with the model name and run time, then append them to the results table
# (mode="append": re-running this cell adds another batch of rows).
forecast = forecast.with_column('MODEL', F.lit('XGB'))
forecast = forecast.with_column('FORECAST_DATETIME', F.current_timestamp())
forecast.write.save_as_table("FORECAST_USING_DARTS_XGB", mode="append")

In [None]:
# df_temp = session.table('FORECAST_USING_DARTS_XGB')
# df_temp.limit(5).to_pandas()

# Deployment
Two options -
1. Create a task using SQL
2. Create a task using Task API (future improvement)

The code given below uses option 1 to create a task in SQL.
1. Get the command to run the forecast function
2. Create a task
3. Resume task to run on the schedule

In [None]:
# SQL that runs the PUMP_TS_DARTS_XGB UDTF once per PUMP_ID partition and inserts the
# forecasts into FORECAST_USING_DARTS_XGB. It contains no placeholders, so it is a plain
# string literal.
forecast_script = '''
INSERT  INTO FORECAST_USING_DARTS_XGB  SELECT "PUMP_ID", "TIMESTAMP", "FORECAST", "TRAIN_START", "TRAIN_END",
"FORECAST_HORIZON", 'XGB' AS "MODEL", current_timestamp() AS "FORECAST_DATETIME"
FROM ( SELECT T_LEFT.*, T_RIGHT."TIMESTAMP", T_RIGHT."FORECAST", T_RIGHT."TRAIN_START", T_RIGHT."TRAIN_END", T_RIGHT."FORECAST_HORIZON"
FROM ( SELECT "PUMP_ID", object_construct_keep_null(*) AS "ROW" FROM TIME_SERIES_DATA) AS T_LEFT
JOIN  TABLE (PUMP_TS_DARTS_XGB(parse_json( CAST ("ROW" AS VARIANT)))  OVER (PARTITION BY "PUMP_ID" )) AS T_RIGHT)
'''

# Task definition: wraps the INSERT above in a scheduled task on the SSK_RESEARCH warehouse.
task_script = (
    "\n"
    "CREATE OR REPLACE TASK POC_INVISTA.TASK_FORECAST_TS\n"
    "WAREHOUSE = SSK_RESEARCH\n"
    "SCHEDULE = '1 MINUTE'\n"
    "AS " + forecast_script + ";\n"
)

# Statement that resumes the task so it runs on its schedule.
resume_script = "ALTER TASK POC_INVISTA.TASK_FORECAST_TS RESUME;"

In [None]:
# Create (or replace) the scheduled forecasting task in Snowflake.
session.sql(task_script).collect()
# Resuming is left commented out; uncomment to start the task running on its schedule.
# session.sql(resume_script).collect()