In [2]:
import pandas as pd
import glob
import os
from autorank import autorank, plot_stats, create_report
from itertools import product
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, to_rgb
from matplotlib.lines import Line2D
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Global matplotlib styling shared by every figure in this notebook.
plt.rcParams.update({
    # figure geometry
    "figure.figsize": (9, 3),  # standard size for subplots
    # typeface: serif family with an explicit fallback chain
    "font.family": "serif",
    "font.serif": [
        'DejaVu Serif',
        'Bitstream Vera Serif',
        'Computer Modern Roman',
        'New Century Schoolbook',
        'Century Schoolbook L',
        'Utopia',
        'ITC Bookman',
        'Bookman',
        'Nimbus Roman No9 L',
        'Palatino',
        'Charter',
        'serif',
    ],
    # text sizes
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    # legend
    "legend.title_fontsize": 12,
    "legend.fontsize": 11,
    "legend.frameon": False,
    # lines and markers
    "lines.linewidth": 2,
    "lines.markersize": 6,
})

# Shared palette, plus a fixed color per sampling-method label so that a method
# is drawn in the same color wherever color_dict is used.
COLORS = [
    '#4c90b8',
    '#2ac3c1',
    '#f5b811',
    '#de653e',
    '#ff912a',
]
color_dict = {
    # baselines use neutral tones
    'BaselineAll': 'black',
    'BaselineNone': 'gray',
    # sampling methods take entries from COLORS
    'DisagreementSampling(DT)': COLORS[3],
    'DisagreementSampling(DS)': COLORS[4],
    'RandomSampling': COLORS[0],
    # add other sampling methods as needed
}

# 0. Load Result Data

In [3]:
# Load the downstream performance and time results, and relabel the two
# ConsensusSampling variants as DisagreementSampling so the method names match
# the keys used in color_dict.
df_all = pd.read_csv('results/tabnet.csv').assign(
    sampling_method=lambda frame: frame['sampling_method'].replace({
        'ConsensusSampling(DT)': 'DisagreementSampling(DT)',
        'ConsensusSampling(DS)': 'DisagreementSampling(DS)',
    })
)

In [4]:
# Invert the sign of the running-time metrics (presumably so that "larger is
# better" holds for every metric -- TODO confirm the intent).
# Written as -|value| instead of `*= -1` so that re-running this cell cannot
# flip the sign back to positive. NOTE(review): this assumes the raw durations
# in the CSV are non-negative, in which case the first run gives the same
# result as the previous `*= -1`.
time_metrics = ['total_time', 'sampling_time', 'model_time']
is_time_metric = df_all['metric'].isin(time_metrics)
df_all.loc[is_time_metric, 'value'] = -df_all.loc[is_time_metric, 'value'].abs()

# Replace the numeric bracket bounds with one readable label keyed on the upper
# bound; an upper bound of 0 or a missing one is labelled 'all'. The helper
# columns listed in the drop are not used by the cells shown here.
bracket_cleaner = {
    40: '0-40',
    60: '20-60',
    80: '40-80',
    100: '60-100',
    0: 'all',
    np.nan: 'all',
}
df_clean = (
    df_all
    .assign(bracket=df_all['bracket_max'].map(bracket_cleaner))
    .drop(columns=['bracket_min', 'bracket_max', 'mean', 'sampling_strategy', 'n_selected'])
)

# Attach the matching baseline scores to every sampled run.
# The BaselineNone / BaselineAll rows are split off, stripped of the columns
# that only describe a sampling configuration, and left-joined back on
# (dataset, n_labeled, metric, seed) as the columns baseline_none / baseline_all.
join_keys = ['dataset', 'n_labeled', 'metric', 'seed']
baseline_none = (
    df_clean.loc[df_clean['sampling_method'].eq('BaselineNone')]
    .drop(columns=['bracket', 'sampling_method', 'n_sample'])
    .rename(columns={'value': 'baseline_none'})
)
baseline_all = (
    df_clean.loc[df_clean['sampling_method'].eq('BaselineAll')]
    .drop(columns=['bracket', 'sampling_method', 'n_sample'])
    .rename(columns={'value': 'baseline_all'})
)
# NOTE(review): the joins assume one baseline row per join key; duplicates
# would multiply the sampled rows -- verify against the result file.
df_clean = (
    df_clean.loc[~df_clean['sampling_method'].isin(['BaselineNone', 'BaselineAll'])]
    .merge(baseline_none, on=join_keys, how='left')
    .merge(baseline_all, on=join_keys, how='left')
)

# Average over the remaining dimension(s), e.g. seed, for every configuration.
df_clean_avg = (
    df_clean
    .groupby(['dataset', 'n_labeled', 'n_sample', 'sampling_method', 'metric', 'bracket'])[
        ['value', 'baseline_none', 'baseline_all']
    ]
    .mean()
    .reset_index()
)

# Select the best configurations by averaged macro AUPRC.
df_agg = df_clean_avg[df_clean_avg['metric'].eq('auprc_macro')]

# Best bracket for each (dataset, n_labeled, sampling_method, n_sample).
df_best_bracket = df_agg.loc[
    df_agg.groupby(['dataset', 'n_labeled', 'sampling_method', 'n_sample'])['value'].idxmax()
]

# Best (n_sample, bracket) combination for each (dataset, n_labeled, sampling_method).
idx_max_value = (
    df_agg
    .groupby(['dataset', 'n_labeled', 'sampling_method'])['value']
    .idxmax()
)
df_best_nsample_bracket = df_agg.loc[idx_max_value]

# 1. Time Compared to Baselines

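How much faster is sampling than the baselines? Total running time is reported as a fraction of the `BaselineAll` running time, so values below 1 mean sampling is faster.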

In [5]:
# Step 1: Filter to only AUPRC and total_time
df_eff = df_clean_avg[df_clean_avg['metric'].isin(['auprc_macro', 'total_time'])].copy()

# Optional: leave sampling methods out of the comparison.
# Labels must be the names present in `sampling_method` at this point. The
# previous filter compared against 'ConsensusSampling(DS)', a label that is
# renamed to 'DisagreementSampling(DS)' right after loading, so it never
# removed any row. The list is empty by default to keep the reported results
# (which include both DS and DT) unchanged.
excluded_methods = []  # e.g. ['DisagreementSampling(DS)'] or ['DisagreementSampling(DT)']
df_eff = df_eff[~df_eff['sampling_method'].isin(excluded_methods)]

# Step 2: Pivot to wide format with value + baselines
# (one row per configuration, one column per (field, metric) pair)
df_eff = df_eff.pivot_table(
    index=['dataset', 'n_labeled', 'n_sample', 'sampling_method', 'bracket'],
    columns='metric',
    values=['value', 'baseline_all', 'baseline_none']
).reset_index()

# Flatten the multi-index columns to '<field>_<metric>', e.g. 'value_auprc_macro'.
# Index columns have an empty second level after reset_index, hence the strip('_').
df_eff.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col for col in df_eff.columns]

# Step 3: Get best row per group (highest value_auprc_macro)
# GroupBy.first() returns the first *non-null value of each column*, so a best
# row with a missing entry (e.g. no total_time) would silently borrow that
# entry from a different bracket. drop_duplicates keeps whole rows instead.
# Sorting by the group keys and resetting the index reproduces the row order
# and RangeIndex that groupby(..., as_index=False) produced.
best_group_cols = ['dataset', 'n_labeled', 'n_sample', 'sampling_method']
df_best = (
    df_eff
    .sort_values('value_auprc_macro', ascending=False)  # NaN scores sort last
    .drop_duplicates(subset=best_group_cols, keep='first')
    .sort_values(best_group_cols)
    .reset_index(drop=True)
)

In [7]:
# Running time of each best configuration as a fraction of BaselineAll's.
# Both columns had their sign inverted when the results were prepared, so the
# signs cancel and the ratio is the plain time ratio.
df_best['time_ratio_to_baseline_all'] = df_best['value_total_time'].div(
    df_best['baseline_all_total_time']
)

In [9]:
# Mean time ratio per sample size and sampling method, averaged over the
# remaining rows of df_best (one per dataset and n_labeled).
df_time = (
    df_best
    .groupby(['n_sample', 'sampling_method'])['time_ratio_to_baseline_all']
    .mean()
)
df_time

n_sample  sampling_method         
100       DisagreementSampling(DS)    0.198008
          DisagreementSampling(DT)    0.199271
          RandomSampling              0.205311
250       DisagreementSampling(DS)    0.218954
          DisagreementSampling(DT)    0.218949
          RandomSampling              0.215452
500       DisagreementSampling(DS)    0.255799
          DisagreementSampling(DT)    0.255688
          RandomSampling              0.252686
750       DisagreementSampling(DS)    0.295843
          DisagreementSampling(DT)    0.296200
          RandomSampling              0.293007
1000      DisagreementSampling(DS)    0.329250
          DisagreementSampling(DT)    0.329841
          RandomSampling              0.331858
Name: time_ratio_to_baseline_all, dtype: float64

In [10]:
# Render the time-ratio table as LaTeX; print() is used so the markup appears
# verbatim (with real line breaks) and can be copied from the cell output.
latex_table = df_time.to_latex()
print(latex_table)

\begin{tabular}{llr}
\toprule
 &  & time_ratio_to_baseline_all \\
n_sample & sampling_method &  \\
\midrule
\multirow[t]{3}{*}{100} & DisagreementSampling(DS) & 0.198008 \\
 & DisagreementSampling(DT) & 0.199271 \\
 & RandomSampling & 0.205311 \\
\cline{1-3}
\multirow[t]{3}{*}{250} & DisagreementSampling(DS) & 0.218954 \\
 & DisagreementSampling(DT) & 0.218949 \\
 & RandomSampling & 0.215452 \\
\cline{1-3}
\multirow[t]{3}{*}{500} & DisagreementSampling(DS) & 0.255799 \\
 & DisagreementSampling(DT) & 0.255688 \\
 & RandomSampling & 0.252686 \\
\cline{1-3}
\multirow[t]{3}{*}{750} & DisagreementSampling(DS) & 0.295843 \\
 & DisagreementSampling(DT) & 0.296200 \\
 & RandomSampling & 0.293007 \\
\cline{1-3}
\multirow[t]{3}{*}{1000} & DisagreementSampling(DS) & 0.329250 \\
 & DisagreementSampling(DT) & 0.329841 \\
 & RandomSampling & 0.331858 \\
\cline{1-3}
\bottomrule
\end{tabular}



# 2. Time vs. AUPRC

A closer look at the time/performance trade-off.