In [1]:
!python --version

Python 3.10.14


In [2]:
import os
import torch
import matplotlib.pyplot as plt
import glob
import pandas as pd
from gluonts.dataset.pandas import PandasDataset
from gluonts.dataset.split import split
from tqdm.autonotebook import tqdm
import matplotlib.dates as mdates
from itertools import islice
from collections import defaultdict
import gluonts
from datetime import datetime
import sklearn.metrics

  from tqdm.autonotebook import tqdm


In [3]:
from gluonts.evaluation import make_evaluation_predictions, Evaluator
from autogluon.timeseries.metrics import TimeSeriesScorer
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [4]:
# NRMSE metric: RMSE normalised by the mean of the observed target.
# NOTE(review): the flag below is assigned on the base class itself, so every
# TimeSeriesScorer subclass that does not override it inherits True.
# Presumably this keeps the reported error metrics unsigned - confirm.
TimeSeriesScorer.greater_is_better_internal = True

class NRMSE(TimeSeriesScorer):
   """Root-mean-squared error divided by the mean of the true target values."""
   greater_is_better_internal = True
   optimum = 0.0

   def compute_metric(self, data_future, predictions, target, **kwargs):
      """Return RMSE(data_future[target], predictions["mean"]) / data_future[target].mean()."""
      observed = data_future[target]
      rmse = sklearn.metrics.root_mean_squared_error(y_true=observed, y_pred=predictions["mean"])
      # NOTE(review): no guard against a zero mean of the observed values.
      return rmse / observed.mean()

In [5]:
# Data pipelining: sliding-window extraction
def get_batched_data_fn(sub_df,
    batch_size: int = 128, 
    context_len: int = 168, 
    horizon_len: int = 24):
    """Cut a single series into (context, horizon) windows.

    Windows start at row 0 and advance by ``horizon_len`` rows, so consecutive
    horizons do not overlap. Every window that fits entirely inside ``sub_df``
    is returned, including the one that ends exactly on the last row.

    Args:
        sub_df: DataFrame with a ``"y"`` column; its index supplies the
            timestamps attached to each window.
        batch_size: Accepted for interface compatibility; not used.
        context_len: Number of rows in the input (context) part of a window.
        horizon_len: Number of rows in the output (horizon) part of a window,
            and the stride between window starts.

    Returns:
        A ``defaultdict(list)`` with the keys ``"inputs"`` and ``"outputs"``
        (lists of value lists) and ``"inputs_ts"`` and ``"outputs_ts"``
        (lists of index slices), aligned by position. No key is populated
        when the series is shorter than ``context_len + horizon_len``.
    """
    series = sub_df["y"]
    timestamps = sub_df.index
    window_len = context_len + horizon_len

    examples = defaultdict(list)
    # "+ 1": a start of len - window_len is still a complete window; without
    # it the final window (or the only one) is silently dropped.
    for start in range(0, len(sub_df) - window_len + 1, horizon_len):
        context_end = start + context_len
        horizon_end = context_end + horizon_len
        # .iloc makes the slicing positional regardless of the index type.
        examples["inputs"].append(series.iloc[start:context_end].tolist())
        examples["outputs"].append(series.iloc[context_end:horizon_end].tolist())
        examples["inputs_ts"].append(timestamps[start:context_end])
        examples["outputs_ts"].append(timestamps[context_end:horizon_end])

    return examples

In [6]:
def forecast_building(df, prediction_length: int = 24, context_length: int = 168):
    """Forecast the last ``prediction_length`` steps of every item with Chronos.

    The frame is converted to a ``TimeSeriesDataFrame``, the last
    ``prediction_length`` rows of each item are held out, and a
    ``TimeSeriesPredictor`` configured with the Chronos ``"small"`` model
    (model selection skipped) predicts them from the remaining rows.

    Args:
        df: Long-format frame; ``reset_index()`` of it is passed to
            ``TimeSeriesDataFrame``. Non-string columns are cast to float32 on
            an internal copy, so the caller's frame is left untouched.
        prediction_length: Forecast horizon in rows (24 by default).
        context_length: Context length handed to the Chronos model.

    Returns:
        ``(res_all_df, agg_metrics)``: a frame with the columns ``timestamp``,
        ``y_true`` and ``y_pred`` sorted by timestamp, and the value returned
        by ``predictor.evaluate`` for the requested metrics.
    """
    torch.cuda.empty_cache()

    # Cast numeric columns to float32 on a copy rather than mutating the
    # caller's DataFrame.
    df = df.copy()
    for col in df.columns:
        # Skip object and string-typed columns.
        if df[col].dtype != 'object' and not pd.api.types.is_string_dtype(df[col]):
            df[col] = df[col].astype('float32')

    # Named ts_data so it does not shadow the module-level `dataset` string.
    ts_data = TimeSeriesDataFrame(df.reset_index())

    train_data, test_data = ts_data.train_test_split(prediction_length)

    predictor = TimeSeriesPredictor(prediction_length=prediction_length).fit(
        train_data,
        hyperparameters={
            "Chronos": {
                "model_path": "small",
                "batch_size": 32,
                "device": "auto",
                "context_length": context_length,
            }},
        skip_model_selection=True,
        verbosity=0)
    predictions = predictor.predict(train_data)
    # "MSE" used to be listed twice; once is enough.
    agg_metrics = predictor.evaluate(
        ts_data,
        metrics=["RMSE", "MSE", "MAE", "MAPE", "SMAPE", NRMSE(), "SQL"])

    # Keep only the held-out rows that received a prediction; y_pred is
    # aligned to y_true through the shared index.
    res_all = pd.DataFrame(test_data[test_data.index.isin(predictions.index)].target)
    res_all.columns = ['y_true']
    res_all.insert(1, 'y_pred', predictions['mean'])
    res_all_df = res_all.reset_index().drop('item_id', axis = 1).sort_values('timestamp')

    return res_all_df, agg_metrics


In [7]:
def process_building(df): 
    """Window one building's series and forecast every window.

    Args:
        df: Single-column DataFrame; the column name is the building name and
            the index holds the timestamps. The frame is not modified.

    Returns:
        ``(res, agg_metrics)`` exactly as returned by ``forecast_building``.

    Raises:
        ValueError: If the series is too short to produce any window.
    """
    building_name = df.columns[0]
    # Rename on a new frame instead of assigning df.columns, which would
    # rename the caller's DataFrame in place.
    series_df = df.set_axis(['y'], axis=1)
    input_data = get_batched_data_fn(series_df, batch_size=500)

    window_parts = zip(input_data['inputs_ts'],
                       input_data['inputs'],
                       input_data['outputs_ts'],
                       input_data['outputs'])

    windows_all = []
    # Each window becomes its own item, numbered from 1.
    for counter, (inputs_ts, inputs, outputs_ts, outputs) in enumerate(window_parts, start=1):
        combined = pd.DataFrame({
            'timestamp': list(inputs_ts) + list(outputs_ts),
            'target': list(inputs) + list(outputs),
        })
        combined['item_id'] = str(building_name) + '_' + str(counter)
        combined['item_id_no'] = counter
        windows_all.append(combined)

    if not windows_all:
        # pd.concat([]) would raise a ValueError too, but without saying
        # which building was too short.
        raise ValueError(
            f"No forecasting windows could be built for {building_name!r}: "
            f"series has only {len(df)} rows")

    windows_all_df = pd.concat(windows_all)
    windows_all_df['timestamp'] = pd.to_datetime(windows_all_df['timestamp'])
    windows_all_df = windows_all_df.set_index('timestamp')

    res, agg_metrics = forecast_building(windows_all_df)
    return res, agg_metrics

In [8]:
# NOTE(review): these three module-level values are not read by any code
# visible in this notebook; kept because other cells may rely on them.
batch_size = 32
context_len = 168
horizon_len = 24

def _save_results(res_all, agg_metrics_all, filename):
    """Concatenate the per-building results gathered so far and write both CSVs.

    Reads the module-level ``dataset`` name to build the output paths.
    Returns ``(res_all_df, agg_metrics_all_df)`` as written to disk.
    """
    res_all_df = pd.concat(res_all).round(6)
    if "timestamp" in res_all_df.columns:
        # The index only holds leftover row positions. Promoting it and
        # renaming it to "timestamp" would write a second, junk "timestamp"
        # column next to the real one.
        res_all_df = res_all_df.reset_index(drop=True)
    else:
        res_all_df = res_all_df.reset_index()
        res_all_df = res_all_df.rename(columns={res_all_df.columns[0]: "timestamp" })
    res_all_df.to_csv(f'forecasts/{dataset}/{os.path.basename(filename)}', index=False)

    agg_metrics_all_df = pd.concat(agg_metrics_all).round(6)
    agg_metrics_all_df.to_csv(f'results/{dataset}/agg_metrics_{os.path.basename(filename)}', index=False)

    return res_all_df, agg_metrics_all_df

def process_file(filename):
    """Forecast every building column of one CSV file and save the results.

    The CSV must have a ``timestamp`` column; every other column is treated as
    one building. Results are checkpointed to disk after every fifth building
    and once more at the end. The module-level ``dataset`` name must be
    defined before calling, and the output directories must already exist.

    Args:
        filename: Path of the CSV file to process.

    Returns:
        ``None`` when the file has fewer than two building columns, otherwise
        ``(res_all_df, agg_metrics_all_df)`` as written to disk.
    """
    df = pd.read_csv(filename)
    df = df.set_index(['timestamp'])

    # NOTE(review): after set_index this skips files with a single building
    # column - confirm that is intended.
    if df.shape[1] < 2:
        return None

    print(datetime.now(), df.shape, flush=True)

    res_all = []
    agg_metrics_all = []

    for i, building_name in enumerate(df.columns):
        print(datetime.now(), i, '/', len(df.columns), building_name, flush=True)

        res, agg_metrics = process_building(df[[building_name]])
        res['building'] = building_name
        res['filename'] = filename
        res_all.append(res)

        agg_metrics_df = pd.DataFrame([agg_metrics])
        agg_metrics_df.insert(0, 'building', building_name)
        agg_metrics_df.insert(0, 'filename', filename)
        agg_metrics_all.append(agg_metrics_df)

        # Checkpoint so a crash does not lose everything computed so far.
        if (i + 1) % 5 == 0:
            print(datetime.now(), 'Saving...')
            _save_results(res_all, agg_metrics_all, filename)

    return _save_results(res_all, agg_metrics_all, filename)

In [10]:
# Location of the input CSVs; adjust for the local machine.
DATA_DIR = '/home/user/BuildingsBenchNREL/BuildingsBenchData/BuildingsBench/Electricity'
# glob returns matches in arbitrary order; sort so every run processes the
# files in the same sequence.
files_list = sorted(glob.glob(os.path.join(DATA_DIR, '*.csv')))

# Read by process_file when it builds its output paths.
dataset = 'Electricity'
os.makedirs(f'forecasts/{dataset}/', exist_ok = True)
os.makedirs(f'results/{dataset}/', exist_ok = True)

for filename in files_list:
    print(datetime.now(), filename)
    # process_file writes its own CSVs; the return value (a tuple, or None
    # for skipped files) is kept only for interactive inspection.
    results = process_file(filename)
    print('')

2024-08-29 17:13:58.923942 /home/user/BuildingsBenchNREL/BuildingsBenchData/BuildingsBench/Electricity/LD2011_2014_clean=2011.csv
2024-08-29 17:13:59.118879 (8759, 156)
2024-08-29 17:13:59.119486 0 / 156 MT_124
2024-08-29 17:14:09.668157 1 / 156 MT_156
2024-08-29 17:14:22.185069 2 / 156 MT_158
2024-08-29 17:14:34.235169 3 / 156 MT_159
2024-08-29 17:14:45.262334 4 / 156 MT_161
2024-08-29 17:14:56.080973 Saving...
2024-08-29 17:14:56.301182 5 / 156 MT_162
2024-08-29 17:15:08.167695 6 / 156 MT_163
2024-08-29 17:15:19.091254 7 / 156 MT_166
2024-08-29 17:15:30.474823 8 / 156 MT_168
2024-08-29 17:15:41.813046 9 / 156 MT_169
2024-08-29 17:15:52.359617 Saving...
2024-08-29 17:15:52.809479 10 / 156 MT_171
2024-08-29 17:16:03.966800 11 / 156 MT_172
2024-08-29 17:16:11.017339 12 / 156 MT_174
2024-08-29 17:16:21.871551 13 / 156 MT_175
2024-08-29 17:16:32.604543 14 / 156 MT_176
2024-08-29 17:16:42.936026 Saving...
2024-08-29 17:16:43.711944 15 / 156 MT_180
2024-08-29 17:16:52.219952 16 / 156 MT_182

In [11]:
# res_, agg_metrics_, ts_metrics_ = process_building(df[['Bear_education_Lidia']])

In [12]:
# res, ts_metrics, agg_metrics = process_file("Bear_clean=2016.csv")

In [13]:
# res

In [14]:
# ts_metrics

In [15]:
# agg_metrics