In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

In [168]:
# Read test.csv from the solar_10_minutes_dataset directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/home/shchuro/data/af_amlb/solar_10_minutes_dataset/test.csv",
)

In [174]:
# Display the `target` values of the last 36 rows of each `item_id` group.
# NOTE(review): 36 is presumably the forecast horizon for this dataset — confirm.
df.groupby("item_id")["target"].tail(36)

52524      0.0
52525      0.0
52526      0.0
52527      0.0
52528      0.0
          ... 
7200715    0.0
7200716    0.0
7200717    0.0
7200718    0.0
7200719    0.0
Name: target, Length: 4932, dtype: float64

In [4]:
# Root results directory and the folder of the single benchmark run loaded below.
run_name = "autogluon_bestquality.ts2.8h16c.aws.20230707T202109"
results_dir = Path("/home/shchuro/workspace/amlb/results/")
run_dir = results_dir.joinpath(run_name)

In [227]:
# Collect one results frame per sub-directory of the run, in sorted path order.
# Every sub-directory is expected to contain output/results.csv; plain files
# directly under run_dir are ignored.
all_results = [
    pd.read_csv(task_dir / "output" / "results.csv")
    for task_dir in sorted(run_dir.iterdir())
    if task_dir.is_dir()
]

In [232]:
# Stack the per-directory frames, rename columns to the shared names
# (`dataset`, `total_duration`), index by (dataset, framework) and keep only
# the metric and timing columns.
results_ag = (
    pd.concat(all_results, ignore_index=True)
    .rename(columns={"task": "dataset", "duration": "total_duration"})
    .set_index(["dataset", "framework"])
    .loc[:, ["wql", "mase", "predict_duration", "total_duration"]]
)

In [233]:
# Load the Amazon Forecast results from af_results.csv and reshape them to the
# same (dataset, framework) index and metric columns used for `results_ag`.
#
# `.ffill()` replaces `fillna(method="ffill")`, which is deprecated since
# pandas 2.1 and emits a FutureWarning; the forward-fill behaviour is unchanged.
# NOTE(review): presumably the CSV leaves repeated values (e.g. the dataset
# name) blank on continuation rows, hence the forward fill — confirm.
results_af = (
    pd.read_csv("af_results.csv")
    .ffill()
    .drop(columns="Unnamed: 0")
    .rename(
        columns={
            "DatasetName": "dataset",
            "Service": "framework",
            "AVG_WQL": "wql",
            "MASE": "mase",
            "ForecastTime": "predict_duration",
            "TotalTime": "total_duration",
        }
    )
    .set_index(["dataset", "framework"])[["wql", "mase", "predict_duration", "total_duration"]]
)

In [234]:
# Combine both result tables, drop the 'NewHorizon' framework rows, pivot the
# framework level into columns and keep only datasets with no missing entries.
results = (
    pd.concat([results_ag, results_af])
    .sort_index()
    .query("framework != 'NewHorizon'")
    .unstack()
    .dropna()
)

## Setup
- Evaluate AutoGluon (AG) and Amazon Forecast (AF) on 69 datasets using the WQL metric computed on the P10, P50 and P90 quantiles

## Results
- AutoGluon fails on 13 out of 69 datasets for the following reasons:
    - 5x irregular DateTimeIndex (`airline_delay`, `iowa_liqour`, `iowa_liqour_subset`, `Kaggle_retail`, `online_retail_I`)
    - 5x missing values represented by NaNs not supported by AG out of the box (`bitcoin_dataset_with_missing_values`, `car_parts_dataset_with_missing_values`, `kdd_cup_2018_dataset_with_missing_values`, `nn5_daily_dataset_with_missing_values`, `sunspot_dataset_with_missing_values`)
    - 1x time series with length <= 2 in training data (`algo`)
    - 1x OOM error for `DirectTabular` model (`m5`)
    - 1x Installation error (???) (`pedestrian_counts_dataset`) - investigating

For the remaining 56 datasets:
- AutoGluon achieves better WQL than AF for 29 (52%) of the datasets with median WQL difference of 0.001
    - TODO: Make sure that the metrics are computed consistently between the frameworks
- AutoGluon is faster than AF for 56 (100%) of the datasets, with a median speedup of 3.2x (defined as `total_time_af / total_time_ag`) and a median runtime difference of 3823 seconds (`total_time_af - total_time_ag`).

In [253]:
# Framework labels used as column keys, plus one per-metric view of `results`.
af, ag = "AmazonForecast", "AutoGluon_bestquality"
wql, mase, time = (
    results[metric] for metric in ("wql", "mase", "total_duration")
)

In [262]:
# Median over datasets of (AF WQL - AG WQL).
wql[af].sub(wql[ag]).median()

0.0009562500000000404

In [259]:
# Median over datasets of (AF total duration - AG total duration).
time[af].sub(time[ag]).median()

3823.8

In [258]:
# Median over datasets of the ratio AF total duration / AG total duration.
time[af].div(time[ag]).median()

3.22265209364153

In [3]:
# Load the scores table written by the autoets run.
ets = pd.read_csv(
    filepath_or_buffer="../results/autoets.timeseries.test.local.20230717T105456/scores/results.csv",
)

In [4]:
# Load the scores table written by the seasonalnaiveforecast run.
snaive = pd.read_csv(
    filepath_or_buffer="../results/seasonalnaiveforecast.timeseries.test.local.20230717T110853/scores/results.csv",
)

In [9]:
# Show the fold index and the error metrics for the seasonal-naive scores.
snaive.loc[:, ["fold", "rmse", "mase", "smape", "mape", "sql", "wql"]]

Unnamed: 0,fold,rmse,mase,smape,mape,sql,wql
0,0,1901.15,1.19321,0.139123,0.15612,0.665828,0.026355
1,1,2794.11,1.27521,0.151116,0.192171,0.716536,0.031972


In [10]:
# Show the fold index and the error metrics for the ETS scores.
ets.loc[:, ["fold", "rmse", "mase", "smape", "mape", "sql", "wql"]]

Unnamed: 0,fold,rmse,mase,smape,mape,sql,wql
0,0,2600.08,1.60941,0.172009,0.292844,1.0966,0.044856
1,1,3180.03,1.66247,0.183552,0.317726,1.23566,0.056119
