In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import tukey_hsd

from pathlib import Path
from collections import OrderedDict, defaultdict
from functools import partial
from multiprocessing import Pool, Manager

import lib
from lib import env
from lib.prelude import *

In [3]:
DETAILS = ['task_type', 'n_objects', 'n_features']


def collect_outputs(dir_, output_filter_fn):
    # output_filter_fn: output_dir -> bool
    if isinstance(output_filter_fn, int):
        output_filter_fn = [str(x) for x in range(output_filter_fn)]
    if output_filter_fn is None:
        output_filter_fn = lambda x: True
    elif isinstance(output_filter_fn, list):
        assert all(isinstance(x, str) for x in output_filter_fn)
        valid_names = output_filter_fn
        output_filter_fn = lambda x: x.name in valid_names
    assert callable(output_filter_fn)

    outputs = []
    dir_ = lib.get_path(dir_)
    if dir_.exists():
        for config_path in dir_.rglob('**/*'):
            if config_path.name.endswith(".toml"): continue
            path = config_path.with_suffix('')
            if (
                path.is_dir()
                and (path / 'DONE').exists()
                and output_filter_fn(path.relative_to(lib.PROJ))
            ):
                outputs.append(path)
    return outputs


def load_record(output, key, subkey):
    output = lib.get_path(output)
    report = lib.load_report(output)
    if Path(report['program']).stem == 'tune':
        report = report['best']

    name = output.parent.name
    if "ensemble" in name:
        single_report = lib.load_report(output.parent.parent/name.replace("ensemble_5", "evaluation/0"))
    else:
        single_report = report


    dataset = Path(single_report["config"]["data"]["path"]).name


    dataset_info = lib.load_dataset_info(dataset)
    record = {
        'dataset': dataset_info['name'],
        'task_type': dataset_info['task_type'],
        'n_objects': dataset_info['size'],
        'n_features': dataset_info['n_num_features'] + dataset_info['n_cat_features'],
        'key': (
            f'{Path(program).stem} | {output.relative_to(lib.PROJ).parent.name}'
            if key is None
            else key
        ),
        'subkey': subkey,
    }
    for part in lib.Part:
        part = part.value
        if part in report['metrics']:
            record[f'{part}_score'] = report['metrics'][part]['score']
        try:
            record[f'val_out_score'] = report['metrics']['val_out']['score']
            record[f'test_out_score'] = report['metrics']['test_out']['score']
        except:
            pass
    return record


def sort(df, by):
    if isinstance(by, str):
        by = [by]
    return df.sort_values(
        ['n_objects', 'dataset'] + by,
        ascending=[True, True] + ['score' not in x for x in by],
    ).reset_index(drop=True)


def make_df(records):
    df = pd.DataFrame(records)
    return df


def format_scores(df, precision):
    def f(record):
        if record['task_type'] == lib.TaskType.REGRESSION.value:
            for part in lib.Part:
                for suffix in 'best', 'score':
                    if isinstance(part, str): key = f'{part}_{suffix}'
                    else: key = f'{part.value}_{suffix}'
                    if key in record:
                        record[key] *= -1
        for k, v in list(record.items()):
            if isinstance(v, float):
                if record['dataset'] == 'House 16H': v = v / 10_000
                record[k] = round(v, precision)
        return record

    return df.apply(f, axis=1)


def drop_details(df):
    return df.drop(columns=DETAILS)


def drop_std(df):
    return df.drop(columns=[x for x in df.columns if x.endswith('_std')])


def build_df(records_info, precision=None, details=True):
    # (sub_)key_fn: Union[None, str, Callable[[output_dir], str]]
    records = []
    for dir_, output_filter_fn, key_fn, subkey_fn in records_info:
        for output in collect_outputs(dir_, output_filter_fn):
            key, subkey = [
                None if fn is None else fn if isinstance(fn, str) else fn(output)
                for fn in [key_fn, subkey_fn]
            ]
            record = load_record(output, key, subkey)
            if record is not None:
                records.append(record)

    if not records:
        raise RuntimeError('No records are available!')
    df = make_df(records)
    if precision is not None:
        df = format_scores(df, precision)
    if not details:
        df = drop_details(df)
    return df


def aggregate(df):
    aggrs = dict(
        test_score=('test_score', 'mean'),
        test_std=('test_score', 'std'),
        val_score=('val_score', 'mean'),
        val_std=('val_score', 'std'),
        train_score=('train_score', 'mean'),
        train_std=('train_score', 'std'),
        count=('test_score', 'count'),
    )
    for x in DETAILS:
        if x in df.columns:
            aggrs[x] = (x, 'first')
    df = df.groupby(['dataset', 'key']).agg(**aggrs)
    df['count'] = df['count'].astype(int)
    return df.reset_index().fillna(0.0)

def get_ranks_by_dataset(dataset, df, pvalue_threshold):
    dataset_results = df[df['dataset'] == dataset][['dataset', 'key', 'test_score']].\
                          groupby('key').apply(lambda x : np.array(x['test_score']))
    pvalues = tukey_hsd(*list(dataset_results.values)).pvalue
    key_to_idx = {}
    for i, key in enumerate(dataset_results.index):
        key_to_idx[key] = i

    means = df[df['dataset'] == dataset][['dataset', 'key', 'test_score']].\
            groupby('key')['test_score'].mean()

    sorted_res = means.sort_values(ascending=False)
    prev = None
    first = None
    rank = 1
    ranks = []

    for key in sorted_res.index:
        if first is not None:
            first_idx = key_to_idx[first]
            cur_idx = key_to_idx[key]
            if pvalues[cur_idx][first_idx] < pvalue_threshold:
                rank += 1
                first = key
        else:
            first = key
        ranks.append(rank)
    return ranks, sorted_res.index


def generate_ranks(df, pvalue_threshold=0.05):
    datasets = df.dataset.unique()
    results = defaultdict(lambda : defaultdict())
    
    f = partial(get_ranks_by_dataset, df=df, pvalue_threshold=pvalue_threshold)
    with Pool(len(datasets)) as p:
        result = p.map(f, datasets)
    for i in range(len(datasets)):
        for rank, key in zip(*result[i]):
            results[datasets[i]][key] = rank
    ranks = pd.DataFrame(results)
    ranks.columns = ranks.columns.map(lambda x: x.split()[0].split("-")[0])
    datasets_cols = list(ranks.columns.map(lambda x: x.split()[0].split("-")[0]))
    ranks['AVG'] = ranks.mean(1)
    ranks['STD'] = ranks.std(1)
    ranks = ranks[['AVG', 'STD'] + datasets_cols]
    return ranks.sort_values(by=['AVG'])

In [4]:
def display_results(pretrains, models, additional=None, ensemble=False, gbdt=True, test_ranks=True, datasets=DATASETS):
    results = []

    kind = "ensemble_5" if ensemble else "evaluation"

    for name, model in itertools.product(pretrains, models):
        p = lib.EXP/name/model
        if p.exists():
            results.append((p, f"{model.upper()} ({name})"))


    results = sum([
        [
            (r/d/f"3_{kind}", 15, n, None)
            for d in datasets
        ]
        for r,n in results
    ], [])

    if gbdt:
        results += [
            (lib.EXP/"catboost"/d/f"0_{kind}", 15, "Catboost", None)
            for d in datasets
        ]

    if additional:
        results += [
            (lib.EXP/a.format(d, kind), 15, n, None) for a,n in additional for d in datasets
        ]

    df = build_df(results)                       

    if test_ranks:
        df_ranks = generate_ranks(df)

    df = aggregate(df)
    df = sort(df, 'test_score')
    df = format_scores(df, 4)
    df = df.set_index(['dataset', 'key'])

    if not test_ranks:
        df_ranks = deepcopy(df).reset_index()
        df_ranks.loc[df_ranks["task_type"] == "regression", "test_score"] *= -1
        df_ranks = df_ranks.pivot("key", "dataset", "test_score")
        df_ranks.columns = df_ranks.columns.map(lambda x: x.split()[0].split("-")[0])
        df_ranks = df_ranks.rank(0, ascending=False)
        df_ranks["avg"] = df_ranks.mean(1)
        df_ranks["std"] = df_ranks.std(1)
        df_ranks.insert(0, "avg", df_ranks.pop("avg"))
        df_ranks.insert(1, "std", df_ranks.pop("std"))
        df_ranks = df_ranks.sort_values("avg")

    display(df_ranks)
    display(df)

# Results

In [6]:
# vanilla pretrains
display_results(
    pretrains = [
        "scratch",
        "mask",
        "rec",
        "sup",
        "contrastive",

        "rec-target",
        "mask-target",

        "mask-sup",
        "rec-sup",
    ],
    models = [
#         "resnet",
        "mlp",
#         "transformer",
        "mlp-p-lr",
        "mlp-t-lr",
    ],
    datasets = DATASETS,
    ensemble = True,
    test_ranks = True,
    gbdt = True
)

Unnamed: 0,AVG,STD,Gesture,Churn,California,House,Otto,Higgs,Facebook,Adult,Shifts,Covertype,MSLR
MLP-P-LR (rec-sup),2.454545,1.558766,1,2,3,2,2,7,2,1,2,2,3
MLP-P-LR (mask-target),2.545455,1.437399,2,1,1,2,4,1,4,1,5,3,4
MLP-P-LR (mask),2.636364,1.431638,1,2,3,1,3,1,5,2,5,2,4
MLP-P-LR (rec),2.636364,1.298442,3,4,2,2,3,6,2,2,2,1,2
MLP-T-LR (mask),2.727273,1.135454,4,1,2,1,2,2,4,3,4,3,4
MLP-T-LR (mask-target),2.909091,1.781447,4,1,1,1,4,1,5,1,5,4,5
MLP-T-LR (rec),2.909091,1.378705,3,3,2,3,4,6,1,4,1,3,2
MLP-T-LR (rec-sup),3.0,1.595448,2,1,3,3,1,6,5,3,2,2,5
MLP-P-LR (mask-sup),3.181818,1.402477,1,1,3,3,5,5,4,3,5,2,3
MLP (mask-target),3.272727,1.813631,2,3,2,2,3,1,6,5,1,6,5


Unnamed: 0_level_0,Unnamed: 1_level_0,test_score,test_std,val_score,val_std,train_score,train_std,count,task_type,n_objects,n_features
dataset,key,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Gesture Phase,MLP-P-LR (rec-sup),0.7369,0.0025,0.7479,0.0057,0.9988,0.0002,3,multiclass,9873,32
Gesture Phase,MLP-P-LR (mask-sup),0.7325,0.0025,0.7498,0.0052,0.9986,0.0006,3,multiclass,9873,32
Gesture Phase,MLP-P-LR (mask),0.7252,0.006,0.7464,0.0086,0.9958,0.0016,3,multiclass,9873,32
Gesture Phase,MLP (mask),0.7217,0.002,0.7525,0.0049,0.9916,0.0015,3,multiclass,9873,32
Gesture Phase,MLP-P-LR (mask-target),0.719,0.0044,0.7447,0.0079,0.9883,0.001,3,multiclass,9873,32
Gesture Phase,MLP (mask-sup),0.7158,0.007,0.7297,0.0022,0.9948,0.0003,3,multiclass,9873,32
Gesture Phase,MLP (mask-target),0.709,0.009,0.7329,0.007,0.9777,0.006,3,multiclass,9873,32
Gesture Phase,MLP (rec-sup),0.7089,0.0046,0.7293,0.006,0.9948,0.0016,3,multiclass,9873,32
Gesture Phase,MLP (contrastive),0.7082,0.0054,0.7241,0.0067,0.9836,0.004,3,multiclass,9873,32
Gesture Phase,MLP-P-LR (rec-target),0.7055,0.0029,0.7287,0.003,0.9836,0.002,3,multiclass,9873,32
