In [1]:
import pandas as pd
from autogluon_benchmark.evaluation.evaluate_results import evaluate

In [2]:
# Path to the combined benchmark results CSV; replace this with the path to your own file.
# The file is loaded with pd.read_csv and must provide the columns referenced later in
# this notebook (e.g. "framework", "constraint", "metric", "task", "training_duration",
# "predict_duration", "mase", "wql").
path_to_combined_csv = "results/full_results.csv"

In [3]:
def framework_name(row):
    """Return the label under which a result row should be reported.

    Rows whose ``framework`` is anything other than "AutoGluon_bestquality"
    keep their framework name unchanged. For "AutoGluon_bestquality" rows the
    time constraint is inspected first ("10m16c" / "1h16c"), then the metric
    ("neg_wql" / "neg_mase"); if neither matches, the original framework name
    is returned.
    """
    if row.framework != "AutoGluon_bestquality":
        return row.framework

    # Order matters: a recognised constraint takes precedence over the metric,
    # and the metric is only read when the constraint did not match.
    lookup_order = (
        ("constraint", (("10m16c", "AutoGluon_10m"), ("1h16c", "AutoGluon_1h"))),
        ("metric", (("neg_wql", "AutoGluon_wql"), ("neg_mase", "AutoGluon_mase"))),
    )
    for attribute, labels in lookup_order:
        value = getattr(row, attribute)
        for expected, label in labels:
            if value == expected:
                return label
    return row.framework

In [4]:
# Load the combined results and derive the columns the rest of the notebook relies on.
# Written as one chain that returns a new frame at every step (no ``inplace=True``),
# so the cell has no partially-mutated intermediate state and is safe to re-run.
results = (
    pd.read_csv(path_to_combined_csv)
    .assign(
        # Replace the raw framework label with the per-configuration label.
        framework=lambda df: df.apply(framework_name, axis=1),
        # Total wall-clock time = training time + prediction time (same units as the inputs).
        time_total_s=lambda df: df["training_duration"] + df["predict_duration"],
    )
    # Rename to the column names used by the evaluation cells below.
    .rename(
        columns={
            "task": "dataset",
            "training_duration": "time_train_s",
            "predict_duration": "time_infer_s",
        }
    )
)

In [5]:
# Average the numeric columns over all rows sharing a (dataset, framework) pair,
# then tag every averaged row with the constant problem type "timeseries".
results_avg = (
    results
    .groupby(["dataset", "framework"], as_index=False)
    .mean(numeric_only=True)
    .assign(problem_type="timeseries")
)

## Aggregated results (Tables 3 & 4)

In [6]:
# Two views of the averaged results that differ only in which score is exposed
# as ``metric_error``: the "mase" column or the "wql" column.
results_mase = results_avg.assign(metric_error=results_avg["mase"])
results_wql = results_avg.assign(metric_error=results_avg["wql"])

# Baseline frameworks included in every comparison below.
baselines = [
    "AutoARIMA",
    "AutoETS",
    "AutoTheta",
    "StatEnsemble",
    "DeepAR",
    "TFT",
]

In [7]:
# Table 3
# Compare AutoGluon_mase, AutoPyTorch and the baselines, using the "mase" column
# as metric_error (results_mase), with AutoGluon_mase compared against all others.
# NOTE(review): only element [4] of evaluate's return value is kept; it is later
# indexed by framework name ("AutoGluon_mase"), so it is presumably the collection of
# per-framework comparison tables - confirm against autogluon_benchmark's evaluate().
table_mase = evaluate(
    results_mase, 
    frameworks=["AutoGluon_mase", "AutoPyTorch"] + baselines,
    frameworks_compare_vs_all=["AutoGluon_mase"],
)[4]

FOUND 7 unused columns, dropping... Unused columns: ['result', 'time_infer_s', 'models_count', 'seed', 'mase', 'wql', 'time_total_s']
Filtering to only valid columns: ['framework', 'dataset', 'fold', 'problem_type', 'metric_error', 'time_train_s']
Filtered to only valid frameworks: 8 frameworks
num_datasets: 29
num_folds: 1
errors: 9
################################################
framework: AutoARIMA
	datasets_framework_errors: ['electricity_hourly', 'kdd_cup_2018', 'pedestrian_counts']
	datasets_framework_errors_count: 3
	framework_fold_errors: 3
################################################
framework: AutoETS
	datasets_framework_errors: []
	datasets_framework_errors_count: 0
	framework_fold_errors: 0
################################################
framework: AutoGluon_mase
	datasets_framework_errors: []
	datasets_framework_errors_count: 0
	framework_fold_errors: 0
################################################
framework: AutoPyTorch
	datasets_framework_errors: []
	datasets_fr

In [8]:
# Columns shown in the summary table; "framework" becomes the row index.
cols = [
    "framework",
    ">",
    "<",
    "=",
    "error_count",
    "rank=1_count",
    "rank",
    "loss_rescaled",
]
(
    table_mase["AutoGluon_mase"]
    .loc[:, cols]
    .round(3)
    .set_index("framework")
)

Unnamed: 0_level_0,>,<,=,error_count,rank=1_count,rank,loss_rescaled
framework,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AutoGluon_mase,0,0,29,0,19,2.08,0.073
StatEnsemble,6,20,0,3,3,3.12,0.238
AutoPyTorch,4,25,0,0,2,4.12,0.257
AutoETS,4,25,0,0,1,4.64,0.374
AutoTheta,4,23,0,2,0,4.92,0.427
DeepAR,4,24,0,1,2,5.08,0.434
AutoARIMA,4,22,0,3,1,5.92,0.612
TFT,2,27,0,0,1,6.12,0.635


In [9]:
# Table 4
# Compare AutoGluon_wql and the baselines, using the "wql" column as metric_error
# (results_wql), with AutoGluon_wql compared against all others.
# AutoPyTorch is not part of this comparison.
# NOTE(review): only element [4] of evaluate's return value is kept; it is later
# indexed by framework name ("AutoGluon_wql"), so it is presumably the collection of
# per-framework comparison tables - confirm against autogluon_benchmark's evaluate().
table_wql = evaluate(
    results_wql, 
    frameworks=["AutoGluon_wql"] + baselines,
    frameworks_compare_vs_all=["AutoGluon_wql"],
)[4]

FOUND 7 unused columns, dropping... Unused columns: ['result', 'time_infer_s', 'models_count', 'seed', 'mase', 'wql', 'time_total_s']
Filtering to only valid columns: ['framework', 'dataset', 'fold', 'problem_type', 'metric_error', 'time_train_s']
Filtered to only valid frameworks: 7 frameworks
num_datasets: 29
num_folds: 1
errors: 9
################################################
framework: AutoARIMA
	datasets_framework_errors: ['electricity_hourly', 'kdd_cup_2018', 'pedestrian_counts']
	datasets_framework_errors_count: 3
	framework_fold_errors: 3
################################################
framework: AutoETS
	datasets_framework_errors: []
	datasets_framework_errors_count: 0
	framework_fold_errors: 0
################################################
framework: AutoGluon_wql
	datasets_framework_errors: []
	datasets_framework_errors_count: 0
	framework_fold_errors: 0
################################################
framework: AutoTheta
	datasets_framework_errors: ['electricity_hour

In [10]:
# Columns shown in the summary table; "framework" becomes the row index.
cols = [
    "framework",
    ">",
    "<",
    "=",
    "error_count",
    "rank=1_count",
    "rank",
    "loss_rescaled",
]
(
    table_wql["AutoGluon_wql"]
    .loc[:, cols]
    .round(3)
    .set_index("framework")
)

Unnamed: 0_level_0,>,<,=,error_count,rank=1_count,rank,loss_rescaled
framework,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AutoGluon_wql,0,0,29,0,19,1.8,0.086
StatEnsemble,3,23,0,3,0,3.36,0.33
DeepAR,5,23,0,1,1,4.08,0.455
TFT,5,24,0,0,5,4.24,0.487
AutoETS,3,26,0,0,2,4.4,0.489
AutoTheta,2,25,0,2,1,5.0,0.545
AutoARIMA,4,22,0,3,1,5.12,0.641


## Ablation studies

In [11]:
# AutoGluon variants evaluated in the ablation study: four component ablations
# followed by the two time-limit variants produced by framework_name.
ablation_names = [
    "AutoGluon_NoDeepModels",
    "AutoGluon_NoStatModels",
    "AutoGluon_NoTreeModels",
    "AutoGluon_NoEnsemble",
    "AutoGluon_10m",
    "AutoGluon_1h",
]

In [12]:
# For each AutoGluon variant (the ablations plus AutoGluon_mase), run the MASE
# comparison of that variant against AutoPyTorch and the baselines, and collect
# the variant's own summary row.
results_per_ablation = []
for abl in ablation_names + ["AutoGluon_mase"]:
    # NOTE(review): element [4] of evaluate's return value is indexed by framework
    # name below - presumably the per-framework comparison tables; confirm.
    abl_results = evaluate(
        results_mase, 
        frameworks=[abl, "AutoPyTorch"] + baselines,
        frameworks_compare_vs_all=[abl],
    )[4]
    # Index the variant's table by framework and keep only the variant's own row.
    results_per_ablation.append(abl_results[abl].set_index("framework").loc[abl])

FOUND 7 unused columns, dropping... Unused columns: ['result', 'time_infer_s', 'models_count', 'seed', 'mase', 'wql', 'time_total_s']
Filtering to only valid columns: ['framework', 'dataset', 'fold', 'problem_type', 'metric_error', 'time_train_s']
Filtered to only valid frameworks: 8 frameworks
num_datasets: 29
num_folds: 1
errors: 9
################################################
framework: AutoARIMA
	datasets_framework_errors: ['electricity_hourly', 'kdd_cup_2018', 'pedestrian_counts']
	datasets_framework_errors_count: 3
	framework_fold_errors: 3
################################################
framework: AutoETS
	datasets_framework_errors: []
	datasets_framework_errors_count: 0
	framework_fold_errors: 0
################################################
framework: AutoGluon_NoDeepModels
	datasets_framework_errors: []
	datasets_framework_errors_count: 0
	framework_fold_errors: 0
################################################
framework: AutoPyTorch
	datasets_framework_errors: []
	dat

In [13]:
# One row per AutoGluon variant (from results_per_ablation), sorted by ascending "rank".
abl_results = pd.DataFrame(results_per_ablation).sort_values(by="rank")
(
    abl_results
    .loc[:, ["rank=1_count", "rank", "loss_rescaled"]]
    .round(3)
)

Unnamed: 0,rank=1_count,rank,loss_rescaled
AutoGluon_1h,19.0,2.04,0.07
AutoGluon_mase,19.0,2.08,0.073
AutoGluon_NoStatModels,16.0,2.12,0.094
AutoGluon_NoTreeModels,15.0,2.12,0.085
AutoGluon_NoDeepModels,15.0,2.28,0.124
AutoGluon_10m,14.0,2.5,0.099
AutoGluon_NoEnsemble,7.0,3.52,0.177


## Individual results (Tables 9, 10, 11)

In [14]:
# Table 9
# Per-dataset MASE for every framework (datasets as rows, frameworks as columns).
columns = ["SeasonalNaive"] + baselines + ["AutoPyTorch", "AutoGluon_mase"]
# Pivot only the "mase" column. Without ``values``, pivot_table aggregates every
# remaining column of results_avg, including the string-valued "problem_type",
# which triggers a warning on older pandas and a TypeError on pandas >= 2.0.
results_avg.pivot_table(index="dataset", columns="framework", values="mase")[columns].round(3)

  results_avg.pivot_table(index="dataset", columns="framework")["mase"][columns].round(3)


framework,SeasonalNaive,AutoARIMA,AutoETS,AutoTheta,StatEnsemble,DeepAR,TFT,AutoPyTorch,AutoGluon_mase
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
car_parts,1.127,1.118,1.133,1.208,1.052,0.749,0.751,0.746,0.747
cif_2016,1.289,1.069,0.898,1.006,0.945,1.278,1.372,1.023,1.073
covid_deaths,8.977,6.029,5.907,7.719,5.884,7.166,5.192,4.911,5.805
electricity_hourly,1.405,,1.465,,,1.251,1.389,1.42,1.227
electricity_weekly,3.037,3.009,3.076,3.113,3.077,2.447,2.861,2.322,1.892
fred_md,1.101,0.478,0.505,0.564,0.498,0.634,0.901,0.682,0.656
hospital,0.921,0.82,0.766,0.764,0.753,0.771,0.814,0.77,0.741
kdd_cup_2018,0.975,,0.988,1.01,,0.841,0.844,0.764,0.709
m1_monthly,1.314,1.152,1.083,1.092,1.045,1.117,1.534,1.278,1.235
m1_quarterly,2.078,1.77,1.665,1.667,1.622,1.742,2.099,1.813,1.615


In [15]:
# Table 10
# Per-dataset wQL for every framework (datasets as rows, frameworks as columns).
columns = ["SeasonalNaive"] + baselines + ["AutoGluon_wql"]
# Pivot only the "wql" column. Without ``values``, pivot_table aggregates every
# remaining column of results_avg, including the string-valued "problem_type",
# which triggers a warning on older pandas and a TypeError on pandas >= 2.0.
results_avg.pivot_table(index="dataset", columns="framework", values="wql")[columns].round(3)

  results_avg.pivot_table(index="dataset", columns="framework")["wql"][columns].round(3)


framework,SeasonalNaive,AutoARIMA,AutoETS,AutoTheta,StatEnsemble,DeepAR,TFT,AutoGluon_wql
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
car_parts,1.717,1.589,1.338,1.367,1.324,0.963,0.878,0.923
cif_2016,0.031,0.017,0.039,0.027,0.028,0.114,0.01,0.019
covid_deaths,0.14,0.03,0.046,0.094,0.046,0.072,0.031,0.03
electricity_hourly,0.108,,0.1,,,0.081,0.097,0.076
electricity_weekly,0.141,0.138,0.144,0.146,0.141,0.123,0.118,0.088
fred_md,0.104,0.056,0.05,0.057,0.054,0.054,0.114,0.056
hospital,0.062,0.058,0.053,0.055,0.053,0.053,0.054,0.051
kdd_cup_2018,0.489,,0.55,0.553,,0.363,0.488,0.323
m1_monthly,0.153,0.146,0.163,0.159,0.152,0.136,0.224,0.135
m1_quarterly,0.119,0.088,0.081,0.082,0.083,0.084,0.093,0.09


In [17]:
# Table 11
# Per-dataset total runtime (training + prediction) for every framework, in minutes.
columns = ["SeasonalNaive"] + baselines + ["AutoPyTorch", "AutoGluon_mase"]
# Pivot only the "time_total_s" column. Without ``values``, pivot_table aggregates
# every remaining column of results_avg, including the string-valued "problem_type",
# which triggers a warning on older pandas and a TypeError on pandas >= 2.0.
# Dividing by 60 converts seconds to minutes.
(results_avg.pivot_table(index="dataset", columns="framework", values="time_total_s")[columns] / 60).round(1)

  (results_avg.pivot_table(index="dataset", columns="framework")["time_total_s"][columns] / 60).round(1)


framework,SeasonalNaive,AutoARIMA,AutoETS,AutoTheta,StatEnsemble,DeepAR,TFT,AutoPyTorch,AutoGluon_mase
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
car_parts,0.1,2.4,0.6,0.7,3.3,6.9,9.2,240.3,17.4
cif_2016,0.1,0.4,0.5,0.6,1.3,4.1,6.2,240.2,16.7
covid_deaths,0.1,1.4,0.5,0.7,2.3,7.9,8.8,240.4,29.3
electricity_hourly,0.2,,21.6,,,10.4,19.5,240.4,61.2
electricity_weekly,0.2,0.3,0.4,0.5,1.0,3.1,6.6,240.2,14.9
fred_md,0.1,2.4,0.7,0.6,3.4,6.8,5.5,240.2,16.8
hospital,0.1,0.9,0.7,0.7,2.1,4.6,7.6,240.2,17.4
kdd_cup_2018,0.1,,16.3,22.8,,12.4,11.9,240.3,56.0
m1_monthly,0.1,1.5,0.8,0.7,2.7,5.5,6.2,240.2,21.6
m1_quarterly,0.1,0.3,0.5,0.7,1.3,5.9,5.4,240.2,15.6
