In [4]:
import pandas as pd

## Collect results of experiments from Metaflow

In [5]:
def get_results(run_id: int) -> pd.DataFrame:
    """Fetch the evaluation results of one ``ForecastEvaluation`` run.

    The aggregated ``results_full`` artifact of the run is returned when it can
    be read (i.e. the run has finished). Otherwise the ``results`` artifacts of
    the ``evaluate_dataset`` tasks that already expose one are concatenated.

    ``Run`` and ``Step`` (``from metaflow import Run, Step``) must be available
    in the notebook namespace when this function is called.

    Args:
        run_id: ID of the run. It is only interpolated into the Metaflow
            pathspec, so a string ID (such as ``run.id``) works as well.

    Returns:
        The results of the run as a single DataFrame.

    Raises:
        ValueError: If neither the aggregated artifact nor any per-task
            ``results`` artifact is available.
    """
    run_path = f"ForecastEvaluation/{run_id}"
    try:
        # Try loading the final results if the run has finished
        return Run(run_path).data.results_full
    except (KeyError, AttributeError):
        pass

    # Manually collect the results if some jobs are still in progress
    partial_results = []
    for task in Step(f"{run_path}/evaluate_dataset").tasks():
        try:
            partial_results.append(task.data.results)
        except (KeyError, AttributeError):
            # Same exceptions as for the run-level artifact above: this task
            # has not produced its results yet, so it is skipped.
            continue
    if not partial_results:
        # pd.concat([]) would raise a ValueError with a much less helpful message.
        raise ValueError(f"No results available yet for run '{run_path}'")
    return pd.concat(partial_results)

### Option 1: Provide IDs of runs for each model

In [6]:
# Switches read by the result-collection cells (Options 1 and 2).
collect_results = False  # True: fetch the results through Metaflow
save_to_disk = False  # True: write `results_all` to results/results_all.csv

In [7]:
# Make sure to replace these with your run IDs!
if collect_results:
    # Metaflow is imported lazily so that the notebook also runs without it
    # when the results are loaded from disk instead.
    from metaflow import Flow, Run, Step

    results_all = pd.concat([
        get_results(1712079941097970),  # SeasonalNaive
        get_results(1712079795572065),  # StatisticalEnsemble
        get_results(1712080010851589),  # chronos_mini
        get_results(1712081461874960),  # chronos_large
    ])
    # Saving is nested under the collection step: with collect_results=False
    # there is nothing new to write, and `results_all` may not even be defined.
    if save_to_disk:
        results_all.to_csv("results/results_all.csv", index=False)

### Option 2: Collect results from the most recent runs

In [8]:
if collect_results:
    # Imported here as well so that this cell does not depend on the Option 1
    # cell having been executed (get_results needs Run and Step).
    from metaflow import Flow, Run, Step

    # Only the first five runs returned by Flow.runs() are considered.
    recent_results = [
        get_results(run.id) for run in list(Flow("ForecastEvaluation").runs())[:5]
    ]
    # Drop rows without a value, then keep the first row seen for every
    # (dataset, model, metric) combination.
    results_all = (
        pd.concat(recent_results)
        .dropna(subset="value")
        .drop_duplicates(["dataset", "model", "metric"])
    )
    # Saving is nested under the collection step: with collect_results=False
    # there is nothing new to write, and `results_all` may not even be defined.
    if save_to_disk:
        results_all.to_csv("results/results_all.csv", index=False)

### Option 3: Load results from disk

In [43]:
# Read the saved results and keep only the rows whose model name starts with "amazon".
results_all = (
    pd.read_csv("results/results_all.csv")
    .query("model.str.startswith('amazon')")
)

## Patch in results obtained by modifying the context length seen by the Statistical Ensemble

In [44]:
# Append the patched results stored on disk.
# The cell reassigns `results_all` from itself, so re-running it would append
# the same rows again and the pivot in the next cell would fail on duplicate
# (dataset, metric, model) entries. Dropping duplicates keeps the cell safe to
# re-run; keep="last" lets the rows from the patch file take precedence.
results_all = pd.concat([
    results_all,
    pd.read_csv("results/complete-results.csv"),
]).drop_duplicates(["dataset", "model", "metric"], keep="last")

## Combine the results into a table

In [45]:
# Pivot the long-format results: one row per dataset, columns keyed by (metric, model).
table = (
    results_all
    .set_index(["dataset", "metric", "model"])
    .unstack(level=1)  # index level 1 ("metric") moves into the columns
    .unstack(1)  # remaining index level 1 ("model") moves into the columns
    .round(3)
    .droplevel(0, axis=1)  # drop the outermost column level (presumably just "value" - verify)
)

In [46]:
# StatisticalEnsemble takes >24 hours to forecast on `ett_small_15min`, so the result may be missing
# for this dataset.
# NOTE(review): the rendered table does show a StatisticalEnsemble value for `ett_small_15min`;
# confirm whether this remark still applies.
original_datasets = [
    "m1_monthly", "m1_quarterly", "m1_yearly",
    "m3_monthly", "m3_other", "m3_quarterly", "m3_yearly",
    "m4_quarterly", "m4_yearly",
    "tourism_monthly", "tourism_quarterly", "tourism_yearly",
]
dataset_order = [
    "australian_electricity_demand",
    "car_parts_without_missing",
    "cif_2016",
    "covid_deaths",
    "dominick",
    "ercot",
    "ett_small_15min",
    "ett_small_1h",
    "exchange_rate",
    "fred_md",
    "hospital",
    "m5",
    "nn5_daily_without_missing",
    "nn5_weekly",
    "traffic",
    "weather",
] + original_datasets
# Put the rows into the order above; a dataset that is absent from `table` becomes an all-NaN row.
table = table.reindex(dataset_order)
table

metric,mase,mase,mase,mase,scaled_crps,scaled_crps,scaled_crps,scaled_crps,smape,smape,smape,smape,time,time,time,time
model,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
australian_electricity_demand,1.34,1.115,1.184,0.882,0.098,0.057,0.054,0.042,0.059,0.051,0.051,0.04,0.36,2716.869,4.703,2.116
car_parts_without_missing,1.12,1.051,0.807,0.803,2.225,1.132,1.059,1.022,0.31,0.897,0.947,0.957,0.915,37.927,61.375,5.296
cif_2016,1.289,0.902,0.986,1.025,0.056,0.021,0.015,0.019,0.094,0.057,0.074,0.076,0.603,9.755,3.744,1.532
covid_deaths,7.762,5.248,6.54,6.555,0.116,0.024,0.05,0.072,0.093,0.054,0.205,0.204,0.815,22.031,32.049,3.871
dominick,0.828,0.848,0.786,0.782,2.21,0.529,0.414,0.399,0.16,0.782,0.809,0.817,12.371,874.045,8661.922,653.726
ercot,0.761,1.356,0.578,0.585,0.039,0.034,0.017,0.016,0.016,0.027,0.012,0.012,0.397,390.501,3.666,1.979
ett_small_15min,0.768,0.638,0.714,0.739,0.143,0.083,0.083,0.088,0.095,0.1,0.11,0.116,0.457,5034.763,4.933,2.05
ett_small_1h,0.932,0.852,0.737,0.805,0.153,0.109,0.083,0.085,0.103,0.117,0.091,0.1,0.209,332.13,4.607,1.688
exchange_rate,1.524,1.407,1.882,2.118,0.016,0.007,0.011,0.01,0.005,0.004,0.006,0.007,0.094,9.354,3.613,1.554
fred_md,1.101,0.482,0.571,0.564,0.082,0.033,0.029,0.029,0.073,0.052,0.052,0.052,0.847,49.065,15.487,2.359


### Compute average performance using geometric mean
1. For each dataset and each model, we compute the **relative score** by dividing the model score by the score of the baseline (SeasonalNaive). This makes the scores comparable across datasets.
2. We aggregate the relative scores of each model across all datasets by taking the **geometric mean** (as recommended by [Fleming & Wallace](https://dl.acm.org/doi/10.1145/5666.5673)).

In [47]:
from scipy.stats import gmean

def calculate_gmean(
    table: pd.DataFrame,
    baseline: str = "SeasonalNaive",
    metrics=("mase", "scaled_crps", "smape"),
) -> pd.DataFrame:
    """Aggregate relative scores across datasets with the geometric mean.

    For every metric, each model's score is divided row-wise by the score of
    ``baseline``; the resulting relative scores are then aggregated over all
    rows of ``table`` with ``scipy.stats.gmean``.

    Args:
        table: Scores with one row per dataset and two-level columns
            ``(metric, model)``.
        baseline: Model whose scores are used as the denominator.
        metrics: Metrics (first column level) to aggregate.

    Returns:
        A single-row DataFrame with ``(metric, model)`` columns holding the
        aggregated relative scores rounded to 3 decimals.
    """
    aggregated = {}
    for metric in metrics:
        scores = table[metric]
        # Missing relative scores are replaced by 1.0, i.e. treated as being on
        # par with the baseline, so they do not shift the geometric mean.
        relative = scores.divide(scores[baseline], axis=0).fillna(1.0)
        aggregated[metric] = relative.apply(gmean)
    # The dict keys become the outer (metric) level of the resulting index.
    return pd.concat(aggregated).round(3).to_frame().T

In [48]:
# Geometric mean of the scores relative to SeasonalNaive, per metric and model.
calculate_gmean(table)

Unnamed: 0_level_0,mase,mase,mase,mase,scaled_crps,scaled_crps,scaled_crps,scaled_crps,smape,smape,smape,smape
model,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini
0,1.0,0.809,0.81,0.845,1.0,0.483,0.472,0.485,1.0,0.964,1.034,1.085


## Create LaTeX table

In [51]:
# Build a table of formatted strings for LaTeX: for every dataset and metric the
# lowest score is set in bold and the second lowest is underlined.
model_columns = ["StatisticalEnsemble", "amazon/chronos-t5-large", "amazon/chronos-t5-mini", "SeasonalNaive"]
display_names = {"amazon/chronos-t5-large": "Chronos (large)", "amazon/chronos-t5-mini": "Chronos (mini)"}
highlight_macros = (r"\textbf{", r"\underline{")  # applied to the best and second-best score

metric_frames = []
for metric in ["mase", "scaled_crps", "smape", "time"]:
    decimals = 1 if metric == "time" else 3
    tab = table[metric][model_columns]
    if metric == "time":
        # Times are ranked on the values rounded to one decimal.
        tab = tab.round(1)
    tab = tab.rename(columns=display_names)
    formatted_rows = []
    for dataset, row in tab.iterrows():
        # nsmallest ignores NaN, so fewer than two labels may come back (e.g.
        # for a dataset without results); zip then applies only the highlights
        # that have a target instead of failing on tuple unpacking.
        ranked = row.nsmallest(2).index
        cells = row.apply(lambda x: f"{x:.{decimals}f}")
        for model, macro in zip(ranked, highlight_macros):
            cells.loc[model] = macro + cells.loc[model] + "}"
        formatted_rows.append(cells.to_frame().T)
    formatted_df = pd.concat(formatted_rows)
    formatted_df.columns = pd.MultiIndex.from_product([[metric], formatted_df.columns])
    metric_frames.append(formatted_df)
full_df = pd.concat(metric_frames, axis=1)
# Underscores in the dataset names are replaced by hyphens for the LaTeX output.
full_df.index = [x.replace("_", "-") for x in full_df.index]

In [52]:
# Print the LaTeX source of the formatted table; missing scores (formatted as "nan") are shown as "N/A".
# Note: the replacement applies to every occurrence of "nan" in the generated LaTeX string.
print(full_df.style.to_latex().replace("nan", "N/A"))

\begin{tabular}{lllllllllllllllll}
 & \multicolumn{4}{r}{mase} & \multicolumn{4}{r}{scaled_crps} & \multicolumn{4}{r}{smape} & \multicolumn{4}{r}{time} \\
model & StatisticalEnsemble & Chronos (large) & Chronos (mini) & SeasonalNaive & StatisticalEnsemble & Chronos (large) & Chronos (mini) & SeasonalNaive & StatisticalEnsemble & Chronos (large) & Chronos (mini) & SeasonalNaive & StatisticalEnsemble & Chronos (large) & Chronos (mini) & SeasonalNaive \\
australian-electricity-demand & \underline{1.115} & 1.184 & \textbf{0.882} & 1.340 & 0.057 & \underline{0.054} & \textbf{0.042} & 0.098 & \underline{0.051} & 0.051 & \textbf{0.040} & 0.059 & 2716.9 & 4.7 & \underline{2.1} & \textbf{0.4} \\
car-parts-without-missing & 1.051 & \underline{0.807} & \textbf{0.803} & 1.120 & 1.132 & \underline{1.059} & \textbf{1.022} & 2.225 & \underline{0.897} & 0.947 & 0.957 & \textbf{0.310} & 37.9 & 61.4 & \underline{5.3} & \textbf{0.9} \\
cif-2016 & \textbf{0.902} & \underline{0.986} & 1.025 & 1.289 & 0.021