In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os.path
import numpy as np
import pandas as pd
import ipywidgets as widgets

from pyrejection.datasets import EXP_NOISY_DATASETS, DATASETS
from pyrejection.experiments import run_experiments
from pyrejection.evaluation import (save_tradeoff_reports, get_experiment_stats,
                                    ct_vs_nl_statistical_test, noise_feature_influence_plots)

## Configuration

In [None]:
# Single metric / classifier combination evaluated throughout this notebook.
metric = 'accuracy'
classifier = 'logreg'

# Datasets under test: every key of EXP_NOISY_DATASETS, followed by every
# DATASETS key whose name starts with the radial synthetic noise prefix.
noisy_datasets = [*EXP_NOISY_DATASETS.keys()]
radial_datasets = [
    name
    for name in DATASETS.keys()
    if name.startswith('radial-synthetic-exp-noise')
]
datasets = [*noisy_datasets, *radial_datasets]

# One experiment sample is run per random state.
sample_random_states = range(100)
# Passed to run_experiments and to the CT-vs-NL statistical test.
test_size = 0.3
# Directory handed to run_experiments as its cache location.
cache_dir = 'results_cache'

In [None]:
def do_experiments(random_states, discard_results=False, drop_test_preds=True,
                   worker_count=4):
    """Run the notebook's configured experiments for the given random states.

    Thin wrapper around ``run_experiments`` that fixes the metric, classifier,
    dataset list, test size and cache directory to the notebook-level
    configuration, so every cell runs the experiments the same way.

    Args:
        random_states: Iterable of random states, forwarded to
            ``run_experiments``.
        discard_results: Forwarded to ``run_experiments``. Presumably tells it
            not to keep/return the results (used here when only populating
            the cache) -- confirm against ``run_experiments``.
        drop_test_preds: Forwarded to ``run_experiments``. Presumably drops
            per-sample test predictions from the results -- confirm against
            ``run_experiments``.
        worker_count: Forwarded to ``run_experiments``. Defaults to 4, the
            value that was previously hard-coded.

    Returns:
        Whatever ``run_experiments`` returns for this configuration.
    """
    return run_experiments(
        [metric],
        [classifier],
        datasets,
        random_states=random_states,
        test_size=test_size,
        worker_count=worker_count,
        cache_dir=cache_dir,
        drop_test_preds=drop_test_preds,
        discard_results=discard_results,
    )

## Run All Experiments

In [None]:
# First random state only, keeping test predictions (drop_test_preds=False).
do_experiments(
    random_states=sample_random_states[:1],
    drop_test_preds=False,
    discard_results=True,
)
# Every random state, with the default drop_test_preds=True.
# The trailing semicolon keeps the cell from displaying a result.
do_experiments(random_states=sample_random_states, discard_results=True);

## Generate Experiment Reports

In [None]:
# Write the tradeoff reports from the first random state's results only.
reports_base_dir = 'tradeoff_reports'
first_results = do_experiments(sample_random_states[:1])
save_tradeoff_reports(first_results, reports_base_dir)
# Drop the reference so the loaded results can be garbage-collected.
del first_results

## Sampled Statistics Comparison

In [None]:
# Per dataset, one list of summary statistics for each rejection method
# ('ct' and 'nl'), with one entry appended per sampled experiment.
sample_stats = {
    dataset: {'ct': [], 'nl': []}
    for dataset in datasets
}
# Load one random state at a time: the result list is only referenced by the
# inner loop, so it is released before the next sample is loaded.
for random_state in sample_random_states:
    for exp_result in do_experiments([random_state]):
        dataset = exp_result['config']['dataset']
        # File this experiment's statistics under its dataset and method.
        for rej_method, stats in get_experiment_stats(exp_result).items():
            sample_stats[dataset][rej_method].append(stats)

In [None]:
# Compare CT against NL for every statistic on every dataset.
# Statistic names are taken from the first 'ct' sample of the first dataset.
stat_names = sample_stats[datasets[0]]['ct'][0].keys()
statistical_tests = {
    stat: {
        dataset: dict(ct_vs_nl_statistical_test(
            test_size,
            [sample[stat] for sample in method_stats['ct']],
            [sample[stat] for sample in method_stats['nl']],
        ))
        for dataset, method_stats in sample_stats.items()
    }
    for stat in stat_names
}

### Full Summary Table

In [None]:
# Significance threshold applied to the test p-values below.
alpha = 0.05

# One row per dataset, with six columns for each statistic.
rows = []
for dataset in datasets:
    row = {'dataset': dataset}
    for stat in stat_names:
        res = statistical_tests[stat][dataset]
        larger_std = max(res['ct-std'], res['nl-std'])
        row.update({
            f'{stat}-ct-mean': res['ct-mean'],
            f'{stat}-nl-mean': res['nl-mean'],
            f'{stat}-max-stddev': larger_std,
            # True only when NL's standard deviation is strictly below CT's.
            f'{stat}-nl-stddev-less': res['nl-std'] < larger_std,
            f'{stat}-t-test-significant': res['ct-vs-nl-2t-p'] < alpha,
            f'{stat}-wilcox-significant': res['ct-vs-nl-2w-p'] < alpha,
        })
    rows.append(row)
summary_df = pd.DataFrame(rows)
summary_df

### Formatted Summary Table

In [None]:
import re

def format_bold(text):
    """Wrap ``text`` (a string) in a LaTeX ``\\textbf{...}`` command."""
    return ''.join((r'\textbf{', text, '}'))

# Per-statistic display settings for the LaTeX table, keyed by statistic name
# (every name in `stat_names` is looked up here, so each needs an entry).
#   'formatter':    format string applied to the CT mean, the NL mean and the
#                   σ value of that statistic.
#   'bold_greater': when the CT/NL difference is significant, True bolds the
#                   method with the greater mean and False bolds the method
#                   with the lesser mean.
stat_formatting = {
    'Capacity': {'formatter': '{:.3f}', 'bold_greater': True},
    'E^u at 80% C': {'formatter': '{:.1%}', 'bold_greater': False},
    'C at 50% of Original E^u': {'formatter': '{:.1%}', 'bold_greater': True},
}

# Build one table row per dataset with the formatted CT mean, NL mean and the
# larger of the two standard deviations for every statistic. When the t-test
# p-value is below alpha, the preferred method's mean is wrapped in \textbf.
latex_table_rows = []
for dataset in datasets:
    row = {'Dataset': dataset}
    for stat in stat_names:
        res = statistical_tests[stat][dataset]
        settings = stat_formatting[stat]
        num_formatter = settings['formatter']
        ct_col = f'{stat} - CT'
        nl_col = f'{stat} - NL'
        row[ct_col] = num_formatter.format(res['ct-mean'])
        row[nl_col] = num_formatter.format(res['nl-mean'])
        row[f'{stat} - σ'] = num_formatter.format(max(res["ct-std"], res["nl-std"]))
        if res['ct-vs-nl-2t-p'] < alpha:
            if res['ct-mean'] > res['nl-mean']:
                ct_is_greater = True
            elif res['ct-mean'] < res['nl-mean']:
                ct_is_greater = False
            else:
                raise Exception('Difference should not be significant if mean is equal.')
            # CT is bolded when it is greater and greater is preferred, or when
            # it is lesser and lesser is preferred; otherwise NL is bolded.
            best_col = ct_col if ct_is_greater == bool(settings['bold_greater']) else nl_col
            row[best_col] = format_bold(row[best_col])
    latex_table_rows.append(row)
latex = pd.DataFrame(latex_table_rows).to_latex(index=False)
# Undo the escaping of backslashes and braces in the generated LaTeX so the
# \textbf{...} markup inserted by format_bold is emitted as a real command.
for escaped, unescaped in (('\\textbackslash ', '\\'), ('\\{', '{'), ('\\}', '}')):
    latex = latex.replace(escaped, unescaped)
# Collapse every run of spaces (column padding) into a single space.
latex = re.sub(' +', ' ', latex)
print(latex)

## Noise Feature Influence Histograms

In [None]:
# Load the results for the first random state and offer one dropdown entry
# per dataset; the selected experiment result is passed to
# noise_feature_influence_plots as `exp_result`.
first_results = do_experiments(sample_random_states[:1])
dataset_options = {exp_result['config']['dataset']: exp_result
                   for exp_result in first_results}
# Trailing semicolon: keeps the value returned by interact() from being
# displayed as the cell's output underneath the widget.
widgets.interact(noise_feature_influence_plots,
                 exp_result=widgets.Dropdown(description='Dataset: ', options=dataset_options));