In [3]:
import pandas as pd
import glob
import os
from autorank import autorank, plot_stats, create_report
from itertools import product
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, to_rgb
import seaborn as sns
import numpy as np
from scipy.stats import ttest_ind
# from pymfe.mfe import MFE
from dataset import load_data
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

# Notebook-wide matplotlib defaults, applied once so every figure below shares
# the same look.
plt.rcParams.update({
    # Typography: serif family; the list is tried in order until a font is found.
    "font.family": "serif",
    "font.serif": [
        'DejaVu Serif',
        'Bitstream Vera Serif',
        'Computer Modern Roman',
        'New Century Schoolbook',
        'Century Schoolbook L',
        'Utopia',
        'ITC Bookman',
        'Bookman',
        'Nimbus Roman No9 L',
        'Palatino',
        'Charter',
        'serif',
    ],
    # Axes text sizes
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    # Legend
    "legend.title_fontsize": 12,
    "legend.fontsize": 11,
    "legend.frameon": False,
    # Line / marker geometry
    "lines.linewidth": 2,
    "lines.markersize": 6,
    # Default canvas: standard size for subplots
    "figure.figsize": (9, 3),
})

# Shared hex colour palette for the plots in this notebook.
COLORS = [
    '#4c90b8',
    '#2ac3c1',
    '#f5b811',
    '#de653e',
    '#ff912a',
]

# 0. Load Results Data

In [59]:
# Dataset metafeatures, built in a single chain:
#   1. read the CSV,
#   2. keep only the rows whose `seed` column equals 'average',
#   3. drop the `seed` column (constant after the filter),
#   4. drop every column holding a single distinct value (NaN counts as a value).
df_meta = (
    pd.read_csv('results/metafeatures.csv')
    .loc[lambda frame: frame['seed'] == 'average']
    .drop(columns=['seed'])
    .loc[:, lambda frame: frame.nunique(dropna=False) > 1]
)

In [45]:
# Raw long-format results table (one `value` per row, identified by `metric`);
# holds both the downstream performance metrics and the running-time metrics.
df_all = pd.read_csv(filepath_or_buffer='results/tabnet.csv')

In [46]:
# Store running times with a negative sign.
# NOTE(review): presumably so that "larger is better" holds for every metric
# when ranking — confirm.
# The sign is forced via -abs(value) instead of `*= -1` so that re-running this
# cell does not flip the times back to positive: `df_all` is modified in place
# and is only re-created by the loading cell. For non-negative durations the
# first run gives exactly the same result as a multiplication by -1.
TIME_METRICS = ['total_time', 'sampling_time', 'model_time']
is_time_metric = df_all['metric'].isin(TIME_METRICS)
df_all.loc[is_time_metric, 'value'] = -df_all.loc[is_time_metric, 'value'].abs()

# Cleaner presentation of brackets: derive a readable label from `bracket_max`.
# Both 0 and a missing value are labelled 'all'.
# NOTE: any `bracket_max` value not listed here maps to NaN, and rows with a
# NaN bracket are dropped by the groupby at the end of this cell.
df_clean = df_all.copy()
bracket_cleaner = {40: '0-40', 60: '20-60', 80: '40-80', 100: '60-100', 0: 'all', np.nan: 'all'}
df_clean['bracket'] = df_clean['bracket_max'].map(bracket_cleaner)
df_clean = df_clean.drop(['bracket_min', 'bracket_max', 'mean', 'sampling_strategy', 'n_selected'], axis=1)

# Joining corresponding baselines: pull the two baseline runs out of the table
# and attach their values as extra columns to every sampling-method row that
# shares the same dataset / n_labeled / metric / seed.
baseline_none = df_clean[df_clean['sampling_method'] == 'BaselineNone'].drop(['bracket', 'sampling_method', 'n_sample'], axis=1).rename(columns={'value': 'baseline_none'})
baseline_all = df_clean[df_clean['sampling_method'] == 'BaselineAll'].drop(['bracket', 'sampling_method', 'n_sample'], axis=1).rename(columns={'value': 'baseline_all'})
df_clean = df_clean[~df_clean['sampling_method'].isin(['BaselineNone', 'BaselineAll'])]
join_keys = ['dataset', 'n_labeled', 'metric', 'seed']
# Left joins keep every sampling-method row even when a baseline is missing
# (the baseline column is then NaN).
# NOTE(review): assumes at most one baseline row per join key; duplicates
# would multiply rows here — verify against the results file.
df_clean = pd.merge(df_clean, baseline_none, on=join_keys, how='left')
df_clean = pd.merge(df_clean, baseline_all, on=join_keys, how='left')

# Average over seeds: `seed` is the only join key absent from the grouping.
df_clean_avg = df_clean.groupby(['dataset', 'n_labeled', 'n_sample', 'sampling_method', 'metric', 'bracket'])[['value', 'baseline_none', 'baseline_all']].mean().reset_index()



In [61]:
# Categorical metafeatures (low/medium/high).
# Work on a copy so `df_meta` stays untouched and this cell can be re-run.
df_meta_cat = df_meta.copy()
# Keep only columns with more than two distinct values (NaN counts as a value).
df_meta_cat = df_meta_cat.loc[:, df_meta_cat.nunique(dropna=False) > 2]
feature_cols = [col for col in df_meta_cat.columns if col != "dataset_id"]

# Apply quantile-based (tercile) binning to each feature.
# Columns that cannot be binned are left unchanged and recorded in
# `unbinned_cols` instead of being skipped silently.
unbinned_cols = []
for col in feature_cols:
    try:
        df_meta_cat[col] = pd.qcut(df_meta_cat[col], q=3, labels=["low", "medium", "high"])
    except (ValueError, TypeError):
        # ValueError: the tercile edges are not unique (heavily tied values).
        # TypeError: presumably a non-numeric column — confirm.
        # Any other exception (including KeyboardInterrupt) now propagates
        # rather than being swallowed by a bare `except`.
        unbinned_cols.append(col)

# 1. Sampling vs. All Unlabeled Data vs. None (Rank)

Does sampling generally work?

# 2. Sampling vs. Baselines (AUPRC)

How large are the gaps?

# 3. n_sample

How much do we need to sample?

# 4. Brackets

Which disagreement bracket makes the most sense?

# 5. Bracket Recommender

Sampling can give good results, but which bracket should be used in which situation? Let's train a model on the dataset metafeatures to find out.

### 5a. Bracket Recommender: Performance

### 5b. Bracket Recommender: Feature Importance