# Running Gift-Eval with FlowState

This notebook demonstrates how to evaluate FlowState on the [Gift-Eval Benchmark](https://huggingface.co/spaces/Salesforce/GIFT-Eval).

### Installation

The FlowState source code will be installed from the [granite-tsfm repository](https://github.com/ibm-granite/granite-tsfm).
Note that `granite-tsfm` installs `pandas==2.2.3` but GIFT-EVAL requires `pandas==2.0.0`.
Hence, after installing FlowState from `granite-tsfm`, we force a reinstall of `pandas==2.0.0`.

Run the following code once to install granite-tsfm in your working Python environment.

In [None]:
import os
import sys

# One-time setup: clone granite-tsfm into the current working directory and
# install it. The presence of the "granite-tsfm" folder is the only check made,
# so when the folder already exists BOTH the clone and the pip installs are skipped.
if not os.path.exists("granite-tsfm"):
    # NOTE(review): this is an SSH clone URL, so it needs GitHub SSH credentials
    # on the machine running the notebook — confirm whether HTTPS would be preferable.
    !git clone git@github.com:ibm-granite/granite-tsfm.git
    %cd granite-tsfm
    !pwd
    # Switch to the desired branch
    !git switch gift-flowstate
    # Install the package with its "notebooks" extra from the checked-out branch
    ! pip install ".[notebooks]"
    # Re-pin pandas afterwards (the installation notes above state GIFT-Eval requires 2.0.0)
    ! pip install pandas==2.0.0
    # Return to the directory the notebook was started from
    %cd ..
else:
    print("Folder 'granite-tsfm' already exists. Skipping git clone.")
# Put the repository root on sys.path so modules inside the clone can be imported
# by path (e.g. the `notebooks.hfdemo...` import used further down).
sys.path.append(os.path.realpath("granite-tsfm/"))

Import the necessary third-party dependencies

In [2]:
import csv
from gift_eval.data import Dataset
from gluonts.ev.metrics import (MAE,MAPE,MASE,MSE,MSIS,ND,NRMSE,RMSE,SMAPE,MeanWeightedSumQuantileLoss,)
from gluonts.model import evaluate_model
import json
import numpy as np
import pandas as pd
import random
import torch
import warnings
warnings.filterwarnings("ignore")

Import the FlowState model

In [None]:
from tsfm_public import FlowStateForPrediction
from notebooks.hfdemo.flowstate.gift_wrapper import FlowState_Gift_Wrapper

Prepare the configurations

In [None]:
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
model_dir = "./checkpoints"
config_dir = "./configs"
# Folder that receives all_results.csv when run_gift_eval(save=True) is used.
out_dir = "./results/FlowState-9.1M"
# To recreate the results of granite-flowstate-r1, uncomment the out_dir line
# below together with the alternative model_name line in the next section.
# out_dir = "./results/granite-flowstate-r1"

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
# Identifier handed to FlowStateForPrediction.from_pretrained in LoadFlowState.
model_name = "ibm-research/FlowState"
# model_name = "ibm-granite/granite-timeseries-flowstate-r1"

# ---------------------------------------------------------------------------
# Auxiliary settings
# ---------------------------------------------------------------------------
seed = 0  # passed to set_seed() before the run and before every dataset
device = "cuda"  # device the model is moved to
batch_size = 16  # used by the FlowState wrapper and by evaluate_model


In [5]:
# Experiment configurations
# Both dataset lists are whitespace-separated strings; run_gift_eval splits them
# with str.split(). An entry is either "<name>" or "<name>/<frequency>".
# Every dataset listed here is evaluated on the "short" term.
short_datasets = "m4_yearly m4_quarterly m4_monthly m4_weekly m4_daily m4_hourly electricity/15T electricity/H electricity/D electricity/W solar/10T solar/H solar/D solar/W hospital covid_deaths us_births/D us_births/M us_births/W saugeenday/D saugeenday/M saugeenday/W temperature_rain_with_missing kdd_cup_2018_with_missing/H kdd_cup_2018_with_missing/D car_parts_with_missing restaurant hierarchical_sales/D hierarchical_sales/W LOOP_SEATTLE/5T LOOP_SEATTLE/H LOOP_SEATTLE/D SZ_TAXI/15T SZ_TAXI/H M_DENSE/H M_DENSE/D ett1/15T ett1/H ett1/D ett1/W ett2/15T ett2/H ett2/D ett2/W jena_weather/10T jena_weather/H jena_weather/D bitbrains_fast_storage/5T bitbrains_fast_storage/H bitbrains_rnd/5T bitbrains_rnd/H bizitobs_application bizitobs_service bizitobs_l2c/5T bizitobs_l2c/H"
# Only the datasets listed here are additionally evaluated on the "medium" and "long" terms.
med_long_datasets = "electricity/15T electricity/H solar/10T solar/H kdd_cup_2018_with_missing/H LOOP_SEATTLE/5T LOOP_SEATTLE/H SZ_TAXI/15T M_DENSE/H ett1/15T ett1/H ett2/15T ett2/H jena_weather/10T jena_weather/H bitbrains_fast_storage/5T bitbrains_rnd/5T bizitobs_application bizitobs_service bizitobs_l2c/5T bizitobs_l2c/H"
# JSON file read by run_gift_eval; per dataset key it is queried for
# "frequency", "domain" and "num_variates". The path is relative to the working directory.
dataset_properties = 'notebooks/dataset_properties.json'

In [None]:
# Auxiliary functions
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU generator and all
    CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic
    mode with its benchmark auto-tuner switched off.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for each library-level generator.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Deterministic cuDNN kernels, no auto-tuning.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
def LoadFlowState(pred_length, n_ch, freq, device='cpu', domain=None, nd=False):
    """Load the pretrained FlowState model and wrap it for GIFT-Eval.

    The checkpoint is selected by the module-level ``model_name`` and the
    wrapper batch size by the module-level ``batch_size``.

    Args:
        pred_length: Forecast horizon handed to the wrapper.
        n_ch: Number of target channels handed to the wrapper.
        freq: Dataset frequency, forwarded to the wrapper as ``f``.
        device: Device the model is moved to; also stored on the model config.
        domain: Dataset domain forwarded to the wrapper.
        nd: Forwarded to the wrapper as ``no_daily``.

    Returns:
        The ``FlowState_Gift_Wrapper`` instance around the loaded model.
    """
    base_model = FlowStateForPrediction.from_pretrained(model_name)
    base_model = base_model.to(device)

    # Adjust the loaded configuration in place before wrapping.
    model_config = base_model.config
    model_config.min_context = 0
    model_config.device = device

    wrapper_kwargs = {
        "n_ch": n_ch,
        "batch_size": batch_size,
        "f": freq,
        "device": device,
        "domain": domain,
        "no_daily": nd,
    }
    return FlowState_Gift_Wrapper(base_model, pred_length, **wrapper_kwargs)

Experiment wrapper

In [7]:
# Column layout of all_results.csv: identification columns, one
# "eval_metrics/<name>" column per reported metric, then dataset metadata.
base_row = (
    ["dataset", "model"]
    + [
        f"eval_metrics/{metric_name}"
        for metric_name in (
            "MSE[mean]",
            "MSE[0.5]",
            "MAE[mean]",
            "MAE[0.5]",
            "MASE[0.5]",
            "MAPE[0.5]",
            "sMAPE[0.5]",
            "MSIS",
            "RMSE[mean]",
            "NRMSE[mean]",
            "ND[0.5]",
            "mean_weighted_sum_quantile_loss",
        )
    ]
    + ["domain", "num_variates"]
)

def _geo_mean(values):
    """Return the geometric mean ``prod(values) ** (1 / n)``; NaN for empty input."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0:
        return float("nan")
    return arr.prod() ** (1.0 / arr.size)


def _normalize_by_seasonal_naive(df_res, baseline_csv="results/seasonal_naive/all_results.csv"):
    """Return ``df_res`` sorted by dataset with normalized metrics added.

    Adds the columns ``freq`` and ``len`` (second and third "/"-separated part
    of the dataset name) and ``normalized MASE`` / ``normalized CRPS`` (metric
    divided by the seasonal-naive value of the same dataset configuration).

    Raises:
        ValueError: If a dataset in ``df_res`` has no row in ``baseline_csv``.
    """
    mase_col = "eval_metrics/MASE[0.5]"
    crps_col = "eval_metrics/mean_weighted_sum_quantile_loss"

    # One baseline row per dataset configuration, addressable by name.
    baseline = (
        pd.read_csv(baseline_csv)
        .sort_values("dataset")
        .drop_duplicates("dataset")
        .set_index("dataset")
    )
    df = df_res.sort_values(by="dataset").copy()

    missing = sorted(set(df["dataset"]) - set(baseline.index))
    if missing:
        raise ValueError(f"No seasonal-naive baseline in {baseline_csv} for: {missing}")

    name_parts = df["dataset"].str.split("/")
    df["freq"] = name_parts.str[1]
    df["len"] = name_parts.str[2]
    for out_col, metric_col in (("normalized MASE", mase_col), ("normalized CRPS", crps_col)):
        reference = df["dataset"].map(baseline[metric_col]).to_numpy(dtype=float)
        df[out_col] = df[metric_col].to_numpy(dtype=float) / reference
    return df


def run_gift_eval(zs=False, save=False, verbose=True):
    """Evaluate FlowState on GIFT-Eval and return the aggregated scores.

    Relies on the module-level configuration (``seed``, ``short_datasets``,
    ``med_long_datasets``, ``dataset_properties``, ``out_dir``, ``model_name``,
    ``device``, ``batch_size``, ``base_row``).

    Args:
        zs: If True, skip the datasets in the hard-coded exclusion list (the
            zero-shot subset without leakage from the Chronos pretraining corpus).
        save: If True, append each result row to ``<out_dir>/all_results.csv``
            and reuse rows already present in that file instead of recomputing
            them. ``out_dir`` is created when it does not exist.
        verbose: If True, print per-dataset progress and the final result table.

    Returns:
        Tuple ``(mase, crps)``: geometric means over all evaluated dataset
        configurations of MASE and of the mean weighted sum quantile loss, each
        divided by the seasonal-naive value from
        ``results/seasonal_naive/all_results.csv``.

    Raises:
        ValueError: If an evaluated configuration is missing from the
            seasonal-naive results file.
    """
    set_seed(seed)

    # Get union of short and med_long datasets
    med_long_names = set(med_long_datasets.split())
    all_datasets = sorted(set(short_datasets.split()) | med_long_names)
    with open(dataset_properties) as props_file:
        dataset_properties_map = json.load(props_file)

    # Instantiate the metrics
    metrics = [
        MSE(forecast_type="mean"),
        MSE(forecast_type=0.5),
        MAE(forecast_type="mean"),
        MAE(forecast_type=0.5),
        MASE(),
        MAPE(),
        SMAPE(),
        MSIS(),
        RMSE(),
        NRMSE(),
        ND(),
        MeanWeightedSumQuantileLoss(quantile_levels=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
    ]

    # ## Evaluation
    # Define the path for the CSV file
    csv_file_path = os.path.join(out_dir, "all_results.csv")

    pretty_names = {
        "saugeenday": "saugeen",
        "temperature_rain_with_missing": "temperature_rain",
        "kdd_cup_2018_with_missing": "kdd_cup_2018",
        "car_parts_with_missing": "car_parts",
    }

    if save:
        # The output folder may not exist yet on a fresh checkout.
        os.makedirs(out_dir, exist_ok=True)
        if not os.path.exists(csv_file_path):
            with open(csv_file_path, "a", newline="") as csvfile:
                # Write the header
                csv.writer(csvfile).writerow(base_row)
        df_res_done = pd.read_csv(csv_file_path)
        done_datasets = set(df_res_done["dataset"])
    else:
        df_res_done = None
        done_datasets = set()

    if zs:  # the zero-shot subset without data leakage for the chronos pretraining corpus (to fairly compare with tirex and chronos models)
        excluded = ["solar/H", "m4_monthly", "m4_weekly", "m4_daily", "m4_hourly", "electricity/15T", "electricity/H", "electricity/W", "kdd_cup_2018_with_missing/D", "kdd_cup_2018_with_missing/H", "temperature_rain_with_missing"]
    else:
        excluded = []

    # Result rows are collected as dicts and turned into a DataFrame once at the end.
    records = []
    for ds_name in all_datasets:
        if ds_name in excluded:
            continue
        set_seed(seed)
        terms = ["short", "medium", "long"]
        # terms = ["short"]
        # terms = ["medium", "long"]
        for term in terms:
            if term in ("medium", "long") and ds_name not in med_long_names:
                continue

            name_parts = ds_name.split("/")
            ds_key = name_parts[0].lower()
            ds_key = pretty_names.get(ds_key, ds_key)
            ds_props = dataset_properties_map[ds_key]
            # Frequency comes from the name when given, otherwise from the properties file.
            ds_freq = name_parts[1] if len(name_parts) > 1 else ds_props["frequency"]
            ds_config = f"{ds_key}/{ds_freq}/{term}"

            # Reuse stored results before doing any (expensive) dataset loading.
            if ds_config in done_datasets:
                stored = df_res_done.loc[df_res_done["dataset"] == ds_config]
                records.extend(stored.to_dict("records"))
                continue

            # Multivariate datasets are converted to univariate series.
            to_univariate = Dataset(name=ds_name, term=term, to_univariate=False).target_dim != 1
            dataset = Dataset(name=ds_name, term=term, to_univariate=to_univariate)

            # Channel count of the test targets (1 for 1-D targets).
            for test_entry in dataset.test_data:
                target = test_entry[0]["target"]
                num_channels = 1 if len(target.shape) == 1 else target.shape[0]

            if verbose:
                print(f"Dataset: {ds_name}, Freq = {dataset.freq}, H = {dataset.prediction_length}")

            no_daily = 'l2c' in ds_name  # necessary to get correct seasonality for bizitobs_l2c datasets (which have no daily cycles)
            flowstate = LoadFlowState(
                pred_length=dataset.prediction_length,
                n_ch=num_channels,
                freq=dataset.freq,
                device=device,
                domain=ds_props["domain"],
                nd=no_daily,
            )

            with torch.no_grad():
                # Evaluate
                res = evaluate_model(
                    flowstate,
                    test_data=dataset.test_data,
                    metrics=metrics,
                    batch_size=batch_size,
                    axis=None,
                    mask_invalid_label=True,
                    allow_nan_forecast=False,
                    # seasonality=season_length,
                )
            if verbose:
                print(f'MASE: {res["MASE[0.5]"][0]}')

            # Same column order as base_row.
            row = [
                ds_config,
                model_name,
                res["MSE[mean]"][0],
                res["MSE[0.5]"][0],
                res["MAE[mean]"][0],
                res["MAE[0.5]"][0],
                res["MASE[0.5]"][0],
                res["MAPE[0.5]"][0],
                res["sMAPE[0.5]"][0],
                res["MSIS"][0],
                res["RMSE[mean]"][0],
                res["NRMSE[mean]"][0],
                res["ND[0.5]"][0],
                res["mean_weighted_sum_quantile_loss"][0],
                ds_props["domain"],
                ds_props["num_variates"],
            ]
            if save:
                # Append the results to the CSV file
                with open(csv_file_path, "a", newline="") as csvfile:
                    csv.writer(csvfile).writerow(row)
                if verbose:
                    print(f"Results for {ds_name} have been written to {csv_file_path}")
            records.append(dict(zip(base_row, row)))

    df_res = pd.DataFrame(records, columns=base_row)

    # Normalize against the seasonal-naive baseline and aggregate.
    df = _normalize_by_seasonal_naive(df_res)
    mase = _geo_mean(df['normalized MASE'].to_numpy())
    crps = _geo_mean(df['normalized CRPS'].to_numpy())

    # Print Results
    if verbose:
        print(
            df[
                [
                    "dataset",
                    "freq",
                    'eval_metrics/MASE[0.5]',
                    'eval_metrics/mean_weighted_sum_quantile_loss',
                    "normalized MASE",
                    "normalized CRPS",
                ]
            ].to_markdown()
        )

    return mase, crps

Start the experiment

In [8]:
# Run the benchmark with progress output; save=True appends every result row
# to all_results.csv in out_dir and reuses rows already stored there.
mase, crps = run_gift_eval(save=True, verbose=True)
print(
    f'Final GIFT-Eval Performance of {model_name}:\nMASE = {mase}, CRPS = {crps}'
)

|    | dataset                          | freq   |   eval_metrics/MASE[0.5] |   eval_metrics/mean_weighted_sum_quantile_loss |   normalized MASE |   normalized CRPS |
|---:|:---------------------------------|:-------|-------------------------:|-----------------------------------------------:|------------------:|------------------:|
| 17 | bitbrains_fast_storage/5T/long   | 5T     |                 1.03746  |                                      0.825588  |          0.91274  |          0.701251 |
| 16 | bitbrains_fast_storage/5T/medium | 5T     |                 1.26089  |                                      0.779547  |          1.03329  |          0.650754 |
| 15 | bitbrains_fast_storage/5T/short  | 5T     |                 1.00325  |                                      0.511307  |          0.883125 |          0.422433 |
| 18 | bitbrains_fast_storage/H/short   | H      |                 1.10441  |                                      0.699613  |          0.850518 |          0.684393 