In [44]:
import pandas as pd
from data_extraction.dummy_data_extractor import extract_dummy_data

from statsforecast import StatsForecast
from statsforecast.models import (
    # HoltWinters,
    # CrostonClassic as Croston, 
    # HistoricAverage,
    DynamicOptimizedTheta as DOT,
    SeasonalNaive,
    # AutoARIMA
)

In [45]:
master_table = extract_dummy_data("dummy_data")

### Models

In [46]:
#### SEASONAL NAIVE

def seasonal_naive(df, length_of_missing_data, data_logging_interval):
    """
    Inputs
    df: df used for training set (from SS)
    length_of_missing_data: interval length of missing data (from SS)
    data_logging_interval: data logging interval - called from the hisDQInterval tag on the point (from SS)

    Output
    forecasts_df: dataframe with predictions for the period missing data. Index names as ts, values column named as "v0
    """
    
    # step 1 convert the grid to a dataframe, and set first column as index
    #df = df.to_dataframe()
    #df.set_index(df.columns[0], inplace=True, drop=True)

    # rename the first column as "target"
    new_column_name = "target"
    df = df.rename(columns={df.columns[0]: new_column_name})

    # number of predictions
    horizon = int(length_of_missing_data/data_logging_interval)
    
    # season length
    season_length = int(pd.Timedelta(24, 'h') / data_logging_interval)      

    # frequency
    #freq = str(data_logging_interval.total_seconds()/3600)+"h"

    # The Model
    model = SeasonalNaive(season_length=season_length)
        
    # Model fitting
    model = model.fit(y=df["target"])
    
    # Predictions
    forecasts_df = model.predict(h=horizon)
    forecasts_df = pd.DataFrame(forecasts_df)

    forecasts_df = forecasts_df.rename(columns={forecasts_df.columns[0]: "predictions"})

    return forecasts_df#.reset_index()

In [56]:
master_table.at[0, "his"]

Unnamed: 0_level_0,DMC Building 1 Data Quality Tests Dup of AHU_04_B1 Return Air Temp,DMC Building 1 Data Quality Tests New-Point
ts,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-03-07T21:05:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
2023-03-07T21:10:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
2023-03-07T21:15:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
2023-03-07T21:20:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
2023-03-07T21:25:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
...,...,...
2023-03-12T00:40:00+04:00 Dubai,24.119338989257812°C,31.959213256835938%
2023-03-12T00:45:00+04:00 Dubai,24.119338989257812°C,33.179718017578125%
2023-03-12T00:50:00+04:00 Dubai,24.119338989257812°C,35.324806213378906%
2023-03-12T00:55:00+04:00 Dubai,24.119338989257812°C,33.5062370300293%


In [48]:
master_table

for i, row in master_table.iterrows():
    # run models
    # extract RMSE
    print(i, row["his"])

0                                 DMC Building 1 Data Quality Tests Dup of AHU_04_B1 Return Air Temp  \
ts                                                                                                   
2023-03-07T21:05:00+04:00 Dubai                               23.522281646728516°C                   
2023-03-07T21:10:00+04:00 Dubai                               23.522281646728516°C                   
2023-03-07T21:15:00+04:00 Dubai                               23.522281646728516°C                   
2023-03-07T21:20:00+04:00 Dubai                               23.522281646728516°C                   
2023-03-07T21:25:00+04:00 Dubai                               23.522281646728516°C                   
...                                                                            ...                   
2023-03-12T00:40:00+04:00 Dubai                               24.119338989257812°C                   
2023-03-12T00:45:00+04:00 Dubai                               24.119338989257812

In [49]:
def ensemble_model(pythonData):
    """
    Function to run all models, and return the one with lowest RMSE.
    
    """

    # dictionary to save predictions for each point
    scores_df_dict = {
    "pointID": [],
    "predictions": [],
    "rmse": [],
    "modelName": []
    }

    # Create a DataFrame from the dictionary
    scores_df = pd.DataFrame(scores_df_dict)

    for i, row in pythonData.iterrows():
        pointID = row["pointID"]
        df = row["his"].to_dataframe()
        df.set_index(df.columns[0], inplace=True, drop=True)
        length_of_missing_data = row["dqDuration"]
        data_logging_interval = row["pointInterval"]

        # number of predictions
        horizon = int(length_of_missing_data/data_logging_interval)

        # training set size (relative to the horizon/prediction size)
        training_set_size = horizon * 10
        #training_set_size = int(228)                                         ################## REMOVE THIS
        
        # training / testing set to evaluate the model (relative to horizon of prediction)
        train_data = df.iloc[-1*int(training_set_size):]#.reset_index(drop=True)
        test_data = df.iloc[:-1*int(training_set_size)]#.reset_index(drop=True)


        seasonal_naive_predictions = seasonal_naive(train_data, length_of_missing_data, data_logging_interval)
        model_name = "Seasonal Naive"
        
        #rmse_score = mean_squared_error(test_data[test_data.columns[0]].to_numpy(), seasonalNaivePredictions[seasonalNaivePredictions.columns[0]].to_numpy(), squared=False)

        # append data to the scored DF
        row_to_append = {'pointID': pointID, 'predictions': seasonal_naive_predictions, "rmse": "", "modelName": model_name}
        scores_df = pd.concat([scores_df, pd.DataFrame([row_to_append])], ignore_index=True)

        
    return scores_df

In [50]:
master_table.at[0, "his"]

Unnamed: 0_level_0,DMC Building 1 Data Quality Tests Dup of AHU_04_B1 Return Air Temp,DMC Building 1 Data Quality Tests New-Point
ts,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-03-07T21:05:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
2023-03-07T21:10:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
2023-03-07T21:15:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
2023-03-07T21:20:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
2023-03-07T21:25:00+04:00 Dubai,23.522281646728516°C,34.58906555175781%
...,...,...
2023-03-12T00:40:00+04:00 Dubai,24.119338989257812°C,31.959213256835938%
2023-03-12T00:45:00+04:00 Dubai,24.119338989257812°C,33.179718017578125%
2023-03-12T00:50:00+04:00 Dubai,24.119338989257812°C,35.324806213378906%
2023-03-12T00:55:00+04:00 Dubai,24.119338989257812°C,33.5062370300293%


In [55]:
data_logging_interval

Timedelta('0 days 00:05:00')

In [None]:
seasonal_naive(train_data, length_of_missing_data, data_logging_interval)

In [57]:

# dictionary to save predictions for each point
scores_df_dict = {
"pointID": [],
"predictions": [],
"rmse": [],
"modelName": []
}

# Create a DataFrame from the dictionary
scores_df = pd.DataFrame(scores_df_dict)

for i, row in master_table.iterrows():
    pointID = row["pointID"]
    df = row["his"]#.to_dataframe()
    # df.set_index(df.columns[0], inplace=True, drop=True)
    length_of_missing_data = row["dqDuration"]
    data_logging_interval = row["pointInterval"]

    # number of predictions
    horizon = int(length_of_missing_data/data_logging_interval)

    # training set size (relative to the horizon/prediction size)
    training_set_size = horizon * 10
    #training_set_size = int(228)                                         ################## REMOVE THIS
    
    # training / testing set to evaluate the model (relative to horizon of prediction)
    train_data = df.iloc[-1*int(training_set_size):]#.reset_index(drop=True)
    test_data = df.iloc[:-1*int(training_set_size)]#.reset_index(drop=True)

    seasonal_naive_predictions = seasonal_naive(train_data, length_of_missing_data, data_logging_interval)
    model_name = "Seasonal Naive"
    
    #rmse_score = mean_squared_error(test_data[test_data.columns[0]].to_numpy(), seasonalNaivePredictions[seasonalNaivePredictions.columns[0]].to_numpy(), squared=False)

    # append data to the scored DF
    row_to_append = {'pointID': pointID, 'predictions': seasonal_naive_predictions, "rmse": "", "modelName": model_name}
    scores_df = pd.concat([scores_df, pd.DataFrame([row_to_append])], ignore_index=True)



ValueError: could not convert string to float: '24.120765686035156°C'