## Loading TabPFN-TS results

If you have already evaluated TabPFN-TS yourself and would like to use your own results for aggregation and visualization, uncomment the cell below; it is a quick routine that finds and loads them.

### Loading your results

In [1]:
# import glob
# import pandas as pd

# RESULTS_ROOT_DIR = "gift_eval/results"

# def find_all_results_csv(root_dir):
#     return glob.glob(f"{root_dir}/**/results.csv", recursive=True)

# all_baseline_results_files = find_all_results_csv(RESULTS_ROOT_DIR)

# # Merge all results
# all_results = pd.concat([pd.read_csv(f) for f in all_baseline_results_files])
# all_results.head()

### Loading results provided by us

We have also provided the results for all the benchmarking tasks in the `gift_eval/tabpfn_ts_results` directory.

In [2]:
import pandas as pd

# Location of the provided TabPFN-TS results file
ALL_RESULTS_PATH = "gift_eval/tabpfn_ts_results/all_results.csv"

# Load the results into a dataframe and preview the first rows
all_results = pd.read_csv(ALL_RESULTS_PATH)
all_results.head()


Unnamed: 0,dataset,model,eval_metrics/MSE[mean],eval_metrics/MSE[0.5],eval_metrics/MAE[0.5],eval_metrics/MASE[0.5],eval_metrics/MAPE[0.5],eval_metrics/sMAPE[0.5],eval_metrics/MSIS,eval_metrics/RMSE[mean],eval_metrics/NRMSE[mean],eval_metrics/ND[0.5],eval_metrics/mean_weighted_sum_quantile_loss,domain,num_variates
0,bitbrains_fast_storage/5T/long,TabPFN-TS,5094671.0,5094671.0,483.162473,1.152664,6.841835,0.805314,70.219287,2257.13783,5.964941,1.276854,0.885016,Web/CloudOps,2
1,bitbrains_fast_storage/5T/medium,TabPFN-TS,4622063.0,4622063.0,441.637621,1.307634,6.80794,0.805071,98.995552,2149.898456,6.531985,1.341817,0.948632,Web/CloudOps,2
2,bitbrains_fast_storage/5T/short,TabPFN-TS,2246919.0,2246919.0,263.976427,0.998051,4.243834,0.746523,83.52859,1498.972668,4.706403,0.828821,0.661819,Web/CloudOps,2
3,bitbrains_fast_storage/H/short,TabPFN-TS,3140360.0,3140360.0,318.895975,1.184095,4.487001,0.558861,23.623996,1772.106184,5.051078,0.908957,0.669886,Web/CloudOps,2
4,bitbrains_rnd/5T/long,TabPFN-TS,2963662.0,2963662.0,265.860295,3.874638,4.921696,0.749694,253.212313,1721.528843,6.594789,1.018451,0.819124,Web/CloudOps,2


## Loading Baseline results

In [3]:
from pathlib import Path

BASELINE_RESULTS_ROOT_DIR = Path("gift_eval/baseline_results")

# Recursively find all "all_results.csv" files under BASELINE_RESULTS_ROOT_DIR.
# The matches are sorted so the row order of the combined dataframe does not
# depend on the order in which the filesystem happens to list the files.
all_baseline_results_files = sorted(BASELINE_RESULTS_ROOT_DIR.glob("**/all_results.csv"))

# Read every CSV file into its own dataframe
baseline_results_dfs = [pd.read_csv(file_path) for file_path in all_baseline_results_files]

# Combine all dataframes if any were found; otherwise fall back to an empty dataframe
if baseline_results_dfs:
    all_baseline_results_df = pd.concat(baseline_results_dfs, ignore_index=True)
    print(f"Found and combined {len(baseline_results_dfs)} 'all_results.csv' files")
else:
    all_baseline_results_df = pd.DataFrame()
    print("No 'all_results.csv' files found")

print("all_baseline_results_df.shape", all_baseline_results_df.shape)

Found and combined 10 'all_results.csv' files
all_baseline_results_df.shape (970, 15)


In [4]:
# Stack the TabPFN-TS rows on top of the baseline rows, then give the two
# metric columns used in the rest of the notebook shorter names.
all_results_df = pd.concat(
    [all_results, all_baseline_results_df],
    ignore_index=True,
).rename(
    columns={
        "eval_metrics/MASE[0.5]": "MASE_0.5",
        "eval_metrics/mean_weighted_sum_quantile_loss": "wQL_mean",
    }
)
all_results_df.shape

# 10 Baselines + TabPFN-TS = 11 models
# 11 models * 97 tasks = 1067 rows

(1067, 15)

## Utility Functions

In [5]:
import scipy.stats as st
import numpy as np

def gmean_and_ci(x, confidence=0.95):
    """
    Geometric mean of ``x`` with a t-based confidence interval computed on the log scale.

    Parameters:
    -----------
    x : pandas.Series or array-like
        Non-negative values. Lists, tuples and NumPy arrays are wrapped in a
        Series first, so the function is not limited to ``groupby`` output.
    confidence : float, default=0.95
        Confidence level passed to ``scipy.stats.t.interval``.

    Returns:
    --------
    pandas.Series
        A series with the entries 'mean', 'ci_lower' and 'ci_upper'.

    Raises:
    -------
    ValueError
        If any value is negative.

    Notes:
    ------
    The point estimate uses every entry of ``x``, whereas the interval is
    built from the strictly positive entries only (the log of 0 is undefined).
    """
    # Accept plain sequences / arrays; an existing Series keeps its values and index
    x = pd.Series(x)

    if np.any(x < 0):
        raise ValueError("Input contains negative values which are not valid for geometric mean")

    # If all values are the same, return that value for the mean and both bounds
    # (the standard error of a constant sample is zero)
    if x.nunique() == 1:
        constant = x.values[0]
        return pd.Series({'mean': constant, 'ci_lower': constant, 'ci_upper': constant})

    gmean = st.gmean(x)

    # Build the interval around the mean of the logs, then map it back with exp
    log_x = np.log(x[x > 0])
    se = st.sem(log_x)
    ci_lower, ci_upper = np.exp(
        st.t.interval(confidence, len(log_x) - 1, loc=np.mean(log_x), scale=se)
    )

    return pd.Series({'mean': gmean, 'ci_lower': ci_lower, 'ci_upper': ci_upper})


def amean_and_ci(x, confidence=0.95):
    """
    Arithmetic mean of ``x`` with a t-based confidence interval.

    Parameters:
    -----------
    x : pandas.Series or array-like
        The sample. Lists, tuples and NumPy arrays are wrapped in a Series
        first, so the function is not limited to ``groupby`` output.
    confidence : float, default=0.95
        Confidence level passed to ``scipy.stats.t.interval``.

    Returns:
    --------
    pandas.Series
        A series with the entries 'mean', 'ci_lower' and 'ci_upper'.
    """
    # Accept plain sequences / arrays; an existing Series keeps its values and index
    x = pd.Series(x)

    # If all values are the same, return that value for the mean and both bounds
    # (the standard error of a constant sample is zero)
    if x.nunique() == 1:
        constant = x.values[0]
        return pd.Series({'mean': constant, 'ci_lower': constant, 'ci_upper': constant})

    amean = np.mean(x)

    # Interval centred on the mean, scaled by the standard error of the mean
    se = st.sem(x)
    ci_lower, ci_upper = st.t.interval(confidence, len(x) - 1, loc=amean, scale=se)

    return pd.Series({'mean': amean, 'ci_lower': ci_lower, 'ci_upper': ci_upper})

def normalize_metric_by_baseline(main_df, metric='MASE_0.5', baseline_model='Seasonal_Naive'):
    """
    Normalize a metric by dividing each model's value by the baseline model's value for each dataset.

    Parameters:
    -----------
    main_df : pandas.DataFrame
        The dataframe containing the results. It is read through its
        'dataset' and 'model' columns and the column named by ``metric``.
    metric : str, default='MASE_0.5'
        The metric to normalize
    baseline_model : str, default='Seasonal_Naive'
        The model to use as baseline for normalization

    Returns:
    --------
    pandas.DataFrame
        A long-format dataframe with the columns 'dataset', 'model' and
        ``metric``, where ``metric`` holds the normalized values.

    Raises:
    -------
    KeyError
        If ``baseline_model`` has no values for ``metric`` in ``main_df``.
    """
    # Create pivot table with datasets as rows and models as columns
    # (pivot_table aggregates duplicate dataset/model pairs with its default, the mean)
    main_df_pivot = main_df.pivot_table(
        index='dataset',
        columns='model',
        values=metric,
    )

    # Fail with an actionable message instead of a bare column-lookup KeyError
    if baseline_model not in main_df_pivot.columns:
        raise KeyError(
            f"Baseline model {baseline_model!r} has no {metric!r} results; "
            f"available models: {list(main_df_pivot.columns)}"
        )

    # Divide each model's metric by the baseline model's metric for the same dataset
    normalized_pivot = main_df_pivot.div(main_df_pivot[baseline_model], axis=0)

    # Convert back to long format
    normalized_df = normalized_pivot.reset_index().melt(
        id_vars='dataset',
        var_name='model',
        value_name=metric
    )

    return normalized_df


# Rank the rows of a dataframe by one of its metric columns
def compute_ranking(df, metric_column):
    """
    Return a copy of ``df`` with an added 'rank' column.

    The rank is ``df[metric_column].rank()`` with pandas' default settings;
    the dataframe passed in is left unmodified.
    """
    return df.assign(rank=df[metric_column].rank())

## Normalize MASE and wQL by Seasonal Naive

In [6]:
# Express both metrics relative to the default baseline of
# normalize_metric_by_baseline, one normalized dataframe per metric.
normalized_mase_df, normalized_wql_df = (
    normalize_metric_by_baseline(all_results_df, metric=metric_name)
    for metric_name in ("MASE_0.5", "wQL_mean")
)

In [7]:
# Compute rankings for each individual dataset based on wQL_mean
# (each dataset's rows are ranked separately, then only model, dataset and rank are kept)
rankings = all_results_df.groupby('dataset').apply(
    lambda x: compute_ranking(x, 'wQL_mean')[['model', 'dataset', 'rank']]
).reset_index(drop=True)

# Arithmetic mean of each model's per-dataset rank, with its confidence interval
ranking_mean_and_ci = rankings.groupby('model').apply(
    lambda x: amean_and_ci(x['rank'])
).reset_index()

# Geometric mean of each model's normalized wQL, with its confidence interval
norm_wql_mean_and_ci = normalized_wql_df.groupby('model').apply(
    lambda x: gmean_and_ci(x['wQL_mean'])
).reset_index()

# Geometric mean of each model's normalized MASE, with its confidence interval
norm_mase_mean_and_ci = normalized_mase_df.groupby('model').apply(
    lambda x: gmean_and_ci(x['MASE_0.5'])
).reset_index()

## The final results

### Mean WQL Ranking

In [8]:
# List the models from the smallest mean rank to the largest
ranking_mean_and_ci.sort_values(by='mean', ascending=True)

Unnamed: 0,model,mean,ci_lower,ci_upper
6,TabPFN-TS,3.680412,3.24063,4.120195
10,timesfm_2_0_500m,3.865979,3.363271,4.368688
9,chronos_bolt_base,3.907216,3.390026,4.424407
7,chronos-bolt-small,4.494845,4.095846,4.893845
3,PatchTST,4.93299,4.463778,5.402201
8,chronos-bolt-tiny,5.010309,4.51194,5.508678
5,TFT,5.407216,4.851001,5.963432
2,DeepAR,7.603093,7.082516,8.123669
0,Auto_Arima,8.35567,7.960543,8.750797
1,Auto_Theta,9.097938,8.55397,9.641906


### Relative WQL

In [9]:
# List the models from the smallest mean normalized wQL to the largest
norm_wql_mean_and_ci.sort_values(by='mean', ascending=True)

Unnamed: 0,model,mean,ci_lower,ci_upper
6,TabPFN-TS,0.459651,0.412526,0.51216
10,timesfm_2_0_500m,0.464993,0.415508,0.520371
9,chronos_bolt_base,0.485164,0.430311,0.54701
7,chronos-bolt-small,0.486897,0.43478,0.545262
3,PatchTST,0.495931,0.443727,0.554277
5,TFT,0.510881,0.453434,0.575605
8,chronos-bolt-tiny,0.516447,0.459316,0.580683
2,DeepAR,0.720685,0.618764,0.839393
0,Auto_Arima,0.770179,0.718023,0.826124
4,Seasonal_Naive,1.0,1.0,1.0


### Relative MASE

In [10]:
# List the models from the smallest mean normalized MASE to the largest
norm_mase_mean_and_ci.sort_values(by='mean', ascending=True)

Unnamed: 0,model,mean,ci_lower,ci_upper
10,timesfm_2_0_500m,0.680467,0.559042,0.828267
6,TabPFN-TS,0.691968,0.567182,0.844208
9,chronos_bolt_base,0.724925,0.60112,0.874229
7,chronos-bolt-small,0.737635,0.612263,0.888681
3,PatchTST,0.761765,0.62269,0.931903
8,chronos-bolt-tiny,0.772482,0.641667,0.929966
5,TFT,0.821651,0.684951,0.985634
0,Auto_Arima,0.963724,0.927916,1.000912
1,Auto_Theta,0.978467,0.791824,1.209104
4,Seasonal_Naive,1.0,1.0,1.0
