In [2]:
import pandas as pd
import numpy as np

from statsforecast import StatsForecast
from statsforecast.models import (
    # HoltWinters,
    # CrostonClassic as Croston, 
    # HistoricAverage,
    DynamicOptimizedTheta as DOT,
    SeasonalNaive,
    # AutoARIMA
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import re

  from tqdm.autonotebook import tqdm


In [3]:
# *********************
### Testing DataFrame
# *********************
# Local test fixture: builds `pythonDF` from CSV exports instead of a live SS grid
# (SS presumably means SkySpark -- TODO confirm). The columns created here carry the
# same names as the ones extractData() produces.

## SS Query for master table
# The queries below are kept for reference only; judging by the file names in the
# commented-out ioWriteCsv calls, they appear to have produced the CSV files read below.
# /*
# mainDF:
# dataFillMaster().findAll(r=>r["method"] == "Python")
# //.ioWriteCsv(`io/masterTable1_.csv`)
# */

# dataFillMaster().findAll(r=>r["method"] == "Python")
# .keepCols(["data"])[1]->data.table
# //.ioWriteCsv(`io/masterTable1_data2.csv`)

# Master table: the loop below reads the columns id, unit, dqType, ts, dur, freq, featId.
df = pd.read_csv("inputs for testing (master table)/masterTable1_.csv")
# Two history tables, each indexed by its "ts" column
# (presumably one per master-table row -- verify against the exports).
df_data1 = pd.read_csv("inputs for testing (master table)/masterTable1_data1.csv", index_col= "ts")
df_data2 = pd.read_csv("inputs for testing (master table)/masterTable1_data2.csv", index_col="ts")
# Wrap the two history frames as cell values of a single "data" column.
df_data = pd.DataFrame({"data":[df_data1, df_data2] })  
# Attach the nested histories to the master table.
# NOTE(review): this relies on df and df_data sharing the same row labels -- confirm.
df.loc[:, "data"] = df_data

# Output frame, filled cell by cell; each new label enlarges the frame.
pythonDF = pd.DataFrame()
# loop over the master table (df) and extract the data from each row
for i in range(len(df)):
    pythonDF.loc[i, 'pointID'] = df['id'].iloc[i]
    pythonDF.loc[i, 'unit'] = df["unit"].iloc[i]
    pythonDF.loc[i, 'dqType'] = df["dqType"].iloc[i]
    # The format string expects an ISO-like timestamp with a UTC offset, followed by the literal text " Dubai".
    pythonDF.loc[i, 'dqStart'] = pd.to_datetime(df['ts'].iloc[i], format="%Y-%m-%dT%H:%M:%S%z Dubai")
    # No unit is passed to pd.Timedelta in this cell, whereas extractData() passes "min".
    # NOTE(review): confirm the CSV stores 'dur' and 'freq' in a form pd.Timedelta parses as intended.
    pythonDF.loc[i, 'dqDuration'] = pd.Timedelta(df['dur'].iloc[i])
    pythonDF.loc[i, 'pointInterval'] =  pd.Timedelta(df["freq"].iloc[i])
    pythonDF.loc[i, 'features'] =  df['featId'].iloc[i]

# The nested history frames are assigned as a whole column after the loop,
# not cell by cell inside it.
pythonDF.loc[:, 'his'] =  df['data']

# Last expression of the cell: display the assembled frame.
pythonDF


Unnamed: 0,pointID,unit,dqType,dqStart,dqDuration,pointInterval,features,his
0,@p:dmc_All:r:2ddf07d5-ef59ca94 DMC Building 1 ...,°C,Nulls,2023-03-12 01:05:00+04:00,1 days 11:10:00,0 days 00:05:00,[p:dmc_All:r:2de337c0-72b69972],DMC Building 1...
1,@p:dmc_All:r:2ddf07d5-ef59ca94 DMC Building 1 ...,°C,Nulls,2023-03-19 01:10:00+04:00,0 days 23:30:00,0 days 00:05:00,[p:dmc_All:r:2de337c0-72b69972],DMC Building 1...


In [None]:
def extractData(data):
    """
    Build the Python-side working table from a SS history grid.

    Input:
    - data: hisGrid (<class 'hxpy.haystack.grid.Grid>); the only thing this
      function calls on it is .to_dataframe()

    Output:
    - DataFrame with one row per grid row and the following columns
        - pointID       => taken from 'id'      (point id of target variable)
        - unit          => taken from 'unit'
        - dqType        => taken from 'dqType'  (type of data quality issue)
        - dqStart       => taken from 'ts'      (timestamp of start of data quality issue)
        - dqDuration    => taken from 'dur', converted with pd.Timedelta(value, "min")
        - pointInterval => taken from 'freq', converted with pd.Timedelta(value, "min")
        - features      => taken from 'featId'  (point ids of model features)
        - his           => taken from 'data'    (history to be used as training data)

    ** NOTE_: written to run with python on SS. It expects the Grid type that SS
    passes in, so it is not meant to be run locally.
    """

    # hxPy exposes the grid as a pandas DataFrame, which is easier to walk through
    grid_frame = data.to_dataframe()

    def _minutes(raw):
        # 'dur' and 'freq' are interpreted as a number of minutes
        return pd.Timedelta(raw, "min")

    # (output column, grid column, optional converter), in the order the cells are written
    column_plan = (
        ('pointID', 'id', None),
        ('unit', 'unit', None),
        ('dqType', 'dqType', None),
        ('dqStart', 'ts', None),
        ('dqDuration', 'dur', _minutes),
        ('pointInterval', 'freq', _minutes),
        ('features', 'featId', None),
        ('his', 'data', None),
    )

    # start from an empty frame; every .loc write below enlarges it as needed
    extracted = pd.DataFrame()

    for row_pos in range(len(grid_frame)):
        for target_col, grid_col, convert in column_plan:
            cell = grid_frame[grid_col].iloc[row_pos]
            extracted.loc[row_pos, target_col] = cell if convert is None else convert(cell)

    return extracted

In [None]:
def seasonalNaive(df, length_of_missing_data, data_logging_interval):
    """
    Forecast a gap of missing data with a seasonal-naive model using a 24-hour season.

    Inputs
    df: DataFrame used as the training set (from SS). Only its FIRST column is used
        as the target series; the caller's frame is not modified.
    length_of_missing_data: interval length of missing data (from SS). It is divided by
        data_logging_interval, so it is expected to be a pd.Timedelta.
    data_logging_interval: data logging interval - called from the hisDQInterval tag on
        the point (from SS). Expected to be a pd.Timedelta.

    Output
    forecasts_df: DataFrame with the predictions for the period of missing data. The first
        column of the model output is renamed to "predictions". This function does not
        attach a timestamp index to the result.
    """

    # number of predictions: whole logging intervals in the gap (int() truncates a partial one)
    horizon = int(length_of_missing_data / data_logging_interval)

    # season length: number of logging intervals in 24 hours
    season_length = int(pd.Timedelta(24, 'h') / data_logging_interval)

    # Select the target by position, so a later column already named "target" cannot
    # turn the selection into a DataFrame. Hand the model a plain 1-D float array:
    # statsforecast documents `y` as a NumPy array, and a pandas Series with a
    # non-integer index is not a safe substitute for positional indexing.
    target_values = df.iloc[:, 0].to_numpy(dtype=float)

    # The Model
    model = SeasonalNaive(season_length=season_length)

    # Model fitting
    model = model.fit(y=target_values)

    # Predictions (the model output is wrapped in a DataFrame as-is)
    forecasts_df = pd.DataFrame(model.predict(h=horizon))

    # expose the first output column under a stable name for the callers
    forecasts_df = forecasts_df.rename(columns={forecasts_df.columns[0]: "predictions"})

    return forecasts_df