# Steps in this Notebook

1. Imports
2. Snowflake Setup
3. Local testing with MLForecast
4. Snowflake testing with MLForecast using UDTF

# Imports

In [None]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
from snowflake.snowpark.functions import col

from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import IntegerType, FloatType, StringType,StructType, StructField

import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.preprocessing import KBinsDiscretizer, OrdinalEncoder, OneHotEncoder
from snowflake.ml.modeling.impute import SimpleImputer

import json
import os
import pandas as pd
from datetime import date, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the Snowflake connection parameters from a local JSON file and open a
# Snowpark session. The file is opened in a `with` block so the handle is
# closed as soon as the JSON has been parsed.
# NOTE(review): the path is absolute and user-specific; adjust it (or read it
# from configuration) before running this notebook on another machine.
with open('/Users/skhara/Documents/Code/creds.json', encoding='utf-8') as creds_file:
    connection_parameters = json.load(creds_file)
session = Session.builder.configs(connection_parameters).create()

In [None]:
# Show the notebook's current working directory.
# (`os` is already imported in the imports cell; repeating the import is harmless.)
import os
os.getcwd()

# Snowflake Setup: Create a Database and Schema

We will be using the PUBLIC schema.

In [None]:
# Reference the TIME_SERIES_1K table through the Snowpark session.
# NOTE(review): the original note said the data was loaded from the Store_Traffic
# database into ACCRUENT_TS_FORECASTING for testing; no such copy step is visible
# in this notebook -- confirm which database/schema the session resolves this
# table name in.
sdf_raw = session.table('TIME_SERIES_1K')

In [None]:
# Count the distinct SERIES_ID values in TIME_SERIES_1K, i.e. the number of
# individual time series (pumps) that have to be predicted.
session.sql('SELECT COUNT(DISTINCT SERIES_ID) FROM TIME_SERIES_1K').collect()

In [None]:
# For every SERIES_ID, show the earliest and the latest DATE present in the table.
session.sql('SELECT SERIES_ID, MIN(DATE), MAX(DATE) FROM TIME_SERIES_1K GROUP BY SERIES_ID').collect()

In [None]:
# Preview five rows of the raw table as a pandas DataFrame.
sdf_raw.limit(5).to_pandas()

In [None]:
# Compute summary statistics for the raw table in Snowflake and fetch the result rows.
sdf_raw.describe().collect()

# Local Testing

In [None]:
# Pull a single series (SERIES_ID 62) into pandas for local experimentation.
df_data = sdf_raw.filter((F.col("SERIES_ID") == 62)).to_pandas()

# Here onwards copy paste in UDTF
df_data['DATE'] = pd.to_datetime(df_data['DATE'])
# Collapse duplicate timestamps into one row per (DATE, SERIES_ID) by summing
# VALUE. The result has to be assigned back: groupby returns a new frame and
# leaves `df_data` untouched. `min_count=1` keeps an all-missing group as NaN
# instead of turning it into 0. The resulting column order is
# DATE, SERIES_ID, VALUE.
df_data = (
    df_data
    .groupby(['DATE', 'SERIES_ID'], as_index=False)['VALUE']
    .sum(min_count=1)
)
df_data = df_data.sort_values(by=['DATE']).reset_index(drop=True)

In [None]:
# Inspect the first rows of the prepared single-series frame.
df_data.head()

In [None]:
# Line plot of VALUE against DATE for the selected series.
df_data.set_index('DATE')['VALUE'].plot()

### Testing with Nixtla MLForecast

In [None]:
# Build the frame passed to MLForecast with the column names used throughout
# the rest of the notebook: `ds` (timestamp), `unique_id` (series id) and
# `y` (target). Columns are selected and renamed by name rather than by
# position, so the mapping cannot silently change if the source column order does.
df_mlf = (
    df_data[['DATE', 'SERIES_ID', 'VALUE']]
    .rename(columns={'DATE': 'ds', 'SERIES_ID': 'unique_id', 'VALUE': 'y'})
)
df_mlf.tail(5)

In [None]:
from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Forecast horizon for the local experiment: the number of trailing rows held
# out of training and the number of steps predicted afterwards.
fh = 20

In [None]:
# Configure an MLForecast pipeline with two regressors, four lag features and a
# first-difference transform of the target.
fcst = MLForecast(
    models=[LinearRegression(),XGBRegressor()],
    freq='D',  # 'D' is the pandas alias for calendar-day frequency
    lags=[1,7,28,60],
    target_transforms=[Differences([1])],
)
# Train on everything except the last `fh` rows, so the forecast can later be
# compared with the held-out actual values.
fcst.fit(df_mlf.iloc[0:-fh])

In [None]:
# Predict `fh` steps beyond the end of the training data with every fitted
# model, then display the resulting frame.
preds = fcst.predict(fh)
preds

In [None]:
# Left-join the observed values onto the predictions by timestamp and series
# id, then discard the series-id column.
df_res = (
    preds
    .merge(df_mlf, on=['ds', 'unique_id'], how='left')
    .drop(columns='unique_id')
)

In [None]:
# Work on a timestamp-indexed copy so `df_res` itself stays untouched.
df = df_res.set_index('ds')

# One figure holding both model forecasts and the actual series.
plt.figure(figsize=(10, 6))

# Title and axis labels
plt.title('Time Series Plot')
plt.xlabel('Date')
plt.ylabel('Values')

# Draw the three lines (the order determines the default colour assignment).
plt.plot(df.index, df['LinearRegression'], label='Linear Regression')
plt.plot(df.index, df['XGBRegressor'], label='XGB Regressor')
plt.plot(df.index, df['y'], label='Actual Values')

# Legend is built from the labels passed above.
plt.legend()

# Render the figure
plt.show()


# Creating UDTF for multi-node parallelized model training

In [None]:
# Output schema of the forecasting UDTF: one row per series and forecast
# timestamp, carrying both model predictions and the training metadata.
schema = T.StructType([
    T.StructField(column_name, column_type)
    for column_name, column_type in (
        ("ID", T.IntegerType()),
        ("TIMESTAMP", T.DateType()),
        ("LINREG", T.FloatType()),
        ("XGB", T.FloatType()),
        ("TRAIN_START", T.DateType()),
        ("TRAIN_END", T.DateType()),
        ("FORECAST_HORIZON", T.IntegerType()),
    )
])

@F.udtf(output_schema = schema,
        input_types = [T.VariantType()],
        name = "TSF_MLFORECAST", is_permanent=True, stage_location= "@DEMO_DB.PUBLIC.ML_MODELS", session=session,
        packages=['pandas', 'mlforecast' ,'xgboost', 'scikit-learn'],
        replace=True
       )
class forecast:
    """Snowpark UDTF handler that forecasts the series contained in one partition.

    ``process`` buffers every incoming row; ``end_partition`` assembles the
    buffered rows into a DataFrame, fits a ``LinearRegression`` and an
    ``XGBRegressor`` through MLForecast on the most recent training window and
    yields one output tuple per forecast step, in the column order of the
    registered output schema (ID, TIMESTAMP, LINREG, XGB, TRAIN_START,
    TRAIN_END, FORECAST_HORIZON).

    Each input value is expected to be a mapping with ``DATE``, ``SERIES_ID``
    and ``VALUE`` keys.
    """

    # Length of the training window, counted back from the latest date.
    TRAIN_LENGTH_DAYS = 600
    # Number of future steps to predict.
    FORECAST_HORIZON = 30
    # Buffered rows are turned into a DataFrame chunk once this many accumulate.
    ROWS_PER_CHUNK = 16000
    # Chunks are concatenated into a single frame once this many accumulate.
    MAX_CHUNKS = 100

    def __init__(self):
        self.rows = []  # rows received since the last chunk was built
        self.dfs = []   # DataFrame chunks built so far

    def process(self, data):
        """Buffer one input row; the forecasts are produced in ``end_partition``."""
        self.rows.append(data)

        # Convert the row buffer into a DataFrame chunk once it is large enough.
        if len(self.rows) >= self.ROWS_PER_CHUNK:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Merge the chunks into a single frame to bound the number of objects held.
        if len(self.dfs) >= self.MAX_CHUNKS:
            self.dfs = [pd.concat(self.dfs)]

        # NOTE(review): kept from the original -- presumably yields no output
        # row per input row; confirm against the Snowpark UDTF documentation.
        yield None

    def end_partition(self):
        """Fit both models on the buffered partition and yield the forecast rows."""
        # Imported here so the packages are resolved where the handler executes.
        from mlforecast import MLForecast
        from mlforecast.target_transforms import Differences
        from xgboost import XGBRegressor
        from sklearn.linear_model import LinearRegression

        # Flush any rows that have not yet been turned into a chunk.
        if self.rows:
            self.dfs.append(pd.DataFrame(self.rows))
            self.rows = []

        # Nothing was received for this partition: emit no rows instead of
        # letting pd.concat fail on an empty list.
        if not self.dfs:
            return

        # Assemble the input: one row per (DATE, SERIES_ID) with VALUE summed.
        # The aggregated frame must be assigned back -- groupby does not modify
        # its input. min_count=1 keeps an all-missing group as NaN rather than 0.
        df_input = pd.concat(self.dfs, ignore_index=True)
        df_input['DATE'] = pd.to_datetime(df_input['DATE'])
        df_input = (
            df_input
            .groupby(['DATE', 'SERIES_ID'], as_index=False)['VALUE']
            .sum(min_count=1)
            .rename(columns={'DATE': 'ds', 'SERIES_ID': 'unique_id', 'VALUE': 'y'})
        )

        # Training window: the TRAIN_LENGTH_DAYS days before the latest date.
        fh = self.FORECAST_HORIZON
        train_end = df_input['ds'].max()
        train_start = train_end - pd.Timedelta(days=self.TRAIN_LENGTH_DAYS)

        # NOTE(review): the upper bound is exclusive, as in the original, so
        # the row dated `train_end` itself is not used -- confirm this is intended.
        in_window = (df_input['ds'] >= train_start) & (df_input['ds'] < train_end)
        df_input = df_input.loc[in_window].reset_index(drop=True)

        fcst = MLForecast(models=[LinearRegression(), XGBRegressor()],
                          freq='D',
                          lags=[1, 7, 28, 60],
                          target_transforms=[Differences([1])])

        # Fit on this partition's data. The original fitted on `df_mlf`, a
        # variable of the notebook's local experiment, not on the rows received
        # by the UDTF.
        # NOTE(review): the local experiment held out the last `fh` rows for
        # comparison with actuals; no hold-out is applied here so that the model
        # trains on the whole window. Use `df_input.iloc[:-fh]` if a backtest
        # is what is wanted.
        fcst.fit(df_input)

        ts_forecast = fcst.predict(fh)

        # Map the prediction columns onto the output schema by name and put
        # them in schema order.
        ts_forecast = ts_forecast.rename(columns={
            'unique_id': 'ID',
            'ds': 'TIMESTAMP',
            'LinearRegression': 'LINREG',
            'XGBRegressor': 'XGB',
        })[['ID', 'TIMESTAMP', 'LINREG', 'XGB']]
        ts_forecast['TRAIN_START'] = train_start
        ts_forecast['TRAIN_END'] = train_end
        ts_forecast['FORECAST_HORIZON'] = fh

        yield from ts_forecast.itertuples(index=False, name=None)

In [None]:
# Preview five rows of the raw table again, as a reminder of the input columns.
sdf_raw.limit(5).to_pandas()

In [None]:
# Pack every row of the source table into a single ROW object column and keep
# it next to SERIES_ID, which is used as the partition key below.
# NOTE(review): this reads 'TIME_SERIES' whereas the exploration cells read
# 'TIME_SERIES_1K' -- confirm this is the intended table.
df = session.table('TIME_SERIES') \
        .with_column('ROW', F.object_construct_keep_null('*')) \
        .select(F.col('SERIES_ID'), F.col('ROW'))

# Handle to the UDTF registered under the name TSF_MLFORECAST.
store_forecast_test = F.table_function("TSF_MLFORECAST")

# The UDTF is declared with a single VARIANT input, so ROW is passed as a VARIANT.
variant_column = F.parse_json(df.col('ROW').cast(T.VariantType()))

# Call the UDTF once per SERIES_ID partition.
# NOTE(review): this rebinds the notebook name `forecast`, which until now
# referred to the UDTF handler class defined above.
forecast = df.select(
                F.col('SERIES_ID'), 
                store_forecast_test(variant_column).over(partition_by=['SERIES_ID'])
                )

# Stamp the rows with the time of this run and append them to the results table.
# NOTE(review): mode="append" adds a fresh set of rows on every execution of this cell.
forecast = forecast.with_column('FORECAST_DATETIME', F.current_timestamp())
forecast.write.save_as_table("DEMO_DB.PUBLIC.FORECAST_USING_MLFORECAST", mode="append")

In [None]:
# Read back the table written by the previous cell and preview five rows.
sdf_ref = session.table('DEMO_DB.PUBLIC.FORECAST_USING_MLFORECAST')
sdf_ref.limit(5).to_pandas()

# Deployment
Two options -
1. Create a task using SQL
2. Create a task using Task API
