# Stacking upper limit

This notebook tries to figure out the upper limit on ensemble performance when the ensemble is a simple weighted combination of the component probability distributions.

As of now, the stacked ensemble models based on degenerate EM return new probability distributions that are convex combinations of the component distributions. A convex combination can never assign more probability to the observed outcome than the best component does, so at each time step the best-performing component model sets the upper limit on how well a weight-based ensemble can do.

In this notebook, we explore these limits.

In [76]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
# Seed NumPy's global RNG so that any NumPy-based randomness is repeatable across runs.
np.random.seed(1234)

import sys
# Make modules under ../src importable from this notebook
# (presumably where the utils, models and losses modules live; confirm).
sys.path.append("../src")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
import matplotlib.pyplot as plt
import pandas as pd
import utils.data as udata
import utils.dists as udists
import utils.misc as u
import models
import os
import losses
import yaml
import keras.backend as K

from functools import partial
from jrun import jin
from tqdm import tqdm
from copy import deepcopy

## Setup notebook parameters

In [78]:
# Name of the experiment; selects both the processed-data directory and the
# experiment-specific entry in the config file.
EXP_NAME = "collaborative"
data_dir = "../data"
exp_dir = os.path.join(data_dir, "processed", EXP_NAME)

# Use safe_load: a bare yaml.load(fp) without an explicit Loader can construct
# arbitrary Python objects, emits a warning on PyYAML >= 5.1 and raises a
# TypeError on PyYAML >= 6. The config is only read as a plain mapping here.
with open("../config.yaml") as fp:
    CONFIG = yaml.safe_load(fp)

# Split threshold configured for this experiment; passed to
# Target.get_training_data in the cells below.
TEST_SPLIT_THRESH = CONFIG["TEST_SPLIT_THRESH"][EXP_NAME]

# One Component per model available in the experiment directory.
COMPONENTS = [udata.Component(exp_dir, name) for name in u.available_models(exp_dir)]
ACTUAL_DL = udata.ActualDataLoader(data_dir)

# National region, the ten HHS regions, and a trailing None entry.
# The scoring loops below skip falsy entries, so None is not scored.
REGIONS = ["nat", *[f"hhs{i}" for i in range(1, 11)], None]
TARGETS = [udata.Target(t) for t in [1, 2, 3, 4, "peak", "peak_wk", "onset_wk"]]

In [81]:
def do_target(target):
    """
    Compute the best-case score for one target, region by region.

    For every non-empty entry of REGIONS the training data is fetched, the
    categorical cross-entropy of each component against the one-hot truth is
    evaluated, and the smallest value across components is taken at each
    position (presumably one position per time step) before averaging.

    Parameters
    ----------
    target : object exposing ``get_training_data``, ``bins`` and ``name``
        (the notebook builds these with ``udata.Target``).

    Returns
    -------
    pandas.DataFrame
        One row per region with the columns "region", "target" and "score".
    """
    region_col, target_col, score_col = [], [], []
    named_regions = [r for r in REGIONS if r]

    for region in tqdm(named_regions):
        # The third returned value is not needed for scoring.
        y, Xs, _ = target.get_training_data(
            ACTUAL_DL, COMPONENTS, region, TEST_SPLIT_THRESH
        )
        truth = udists.actual_to_one_hot(y, bins=target.bins)

        # One row of cross-entropy values per component.
        component_scores = np.array(
            [K.categorical_crossentropy(truth, X).eval() for X in Xs]
        )
        # Lowest value among the components at each position, then averaged.
        best_mean = component_scores.min(axis=0).mean()

        region_col.append(region if region else "all")
        target_col.append(target.name)
        score_col.append(best_mean)

    return pd.DataFrame(
        {"region": region_col, "target": target_col, "score": score_col}
    )

In [86]:
# Score every target on the unmodified component distributions and stack the
# per-target tables into one frame with a fresh index.
limits = pd.concat(list(map(do_target, TARGETS)), ignore_index=True)

100%|██████████| 11/11 [02:32<00:00, 13.90s/it]
100%|██████████| 11/11 [02:36<00:00, 14.25s/it]
100%|██████████| 11/11 [02:38<00:00, 14.42s/it]
100%|██████████| 11/11 [02:53<00:00, 15.81s/it]
100%|██████████| 11/11 [02:58<00:00, 16.25s/it]
100%|██████████| 11/11 [00:57<00:00,  5.24s/it]
100%|██████████| 11/11 [01:02<00:00,  5.68s/it]


## Correcting bias
Some of the models have a systematic bias, i.e. their errors are not centered on zero. We now apply the error correction and recompute the limits here.

In [103]:
# Read in bias file
# `biases` is filtered below on its "model", "target" and "region" columns to
# fetch the matching "mean-error" value.
biases = pd.read_csv(f"{data_dir}/processed/biases.csv")
# `model_id_map` is used below to translate a "model-id" into the "model-dir"
# value that the "model" column of `biases` is compared against.
model_id_map = pd.read_csv(f"{data_dir}/processed/model-id-map.csv")

In [117]:
def get_error(model_name, target_name, region_name):
    """
    Look up the mean error stored for a (model, target, region) combination.

    The model name is first translated through ``model_id_map`` (column
    "model-id" to column "model-dir"); the resulting value is then matched
    against the "model" column of ``biases`` together with the target and
    region names.

    Returns the "mean-error" value of the first matching row.

    Raises IndexError when the model id is not present in ``model_id_map`` or
    when no row of ``biases`` matches all three keys.
    """
    id_matches = model_id_map["model-id"] == model_name
    model_dir = model_id_map.loc[id_matches, "model-dir"].iloc[0]

    row_mask = (
        (biases["model"] == model_dir)
        & (biases["target"] == target_name)
        & (biases["region"] == region_name)
    )
    return biases.loc[row_mask, "mean-error"].iloc[0]

In [144]:
def do_target_with_bias(target):
    """
    Compute the best-case score for one target after bias correction.

    Same procedure as the uncorrected scoring, except that every component's
    distributions are first passed through ``udists.shift_dists`` using that
    component's mean error for the given target and region (from
    ``get_error``), repeated once per row of the component's array.

    Parameters
    ----------
    target : object exposing ``get_training_data``, ``bins`` and ``name``
        (the notebook builds these with ``udata.Target``).

    Returns
    -------
    pandas.DataFrame
        One row per region with the columns "region", "target" and "score".
    """

    def shifted(component, dists, region):
        # Apply the same shift to every row of this component's distributions.
        offset = get_error(component.name, target.name, region)
        return udists.shift_dists(
            dists, np.repeat(offset, dists.shape[0]), bins=target.bins
        )

    region_col, target_col, score_col = [], [], []
    named_regions = [r for r in REGIONS if r]

    for region in tqdm(named_regions):
        # The third returned value is not needed for scoring.
        y, Xs, _ = target.get_training_data(
            ACTUAL_DL, COMPONENTS, region, TEST_SPLIT_THRESH
        )
        truth = udists.actual_to_one_hot(y, bins=target.bins)

        # Bias-corrected distributions, in the same order as COMPONENTS.
        corrected = [
            shifted(component, X, region) for component, X in zip(COMPONENTS, Xs)
        ]

        # One row of cross-entropy values per component.
        component_scores = np.array(
            [K.categorical_crossentropy(truth, X).eval() for X in corrected]
        )
        # Lowest value among the components at each position, then averaged.
        best_mean = component_scores.min(axis=0).mean()

        region_col.append(region if region else "all")
        target_col.append(target.name)
        score_col.append(best_mean)

    return pd.DataFrame(
        {"region": region_col, "target": target_col, "score": score_col}
    )

In [None]:
# Same as the uncorrected run, but every component is shifted by its mean
# error before scoring; the per-target tables are stacked with a fresh index.
limits_b = pd.concat(list(map(do_target_with_bias, TARGETS)), ignore_index=True)

In [153]:
# Side-by-side comparison: "score" comes from `limits` and "score_wo_bias"
# from `limits_b` (the bias-corrected run).
# The "score_wo_bias" values look suspicious. Things to check:
#   * for week-valued targets, the week errors may be wrong if they were
#     computed simply as truth - prediction;
#   * for everything else, the distribution-shifting code.
pd.merge(limits, limits_b, on=["region", "target"], suffixes=("", "_wo_bias"))

Unnamed: 0,region,score,target,score_wo_bias
0,nat,1.763561,1,2.004860
1,hhs1,1.676201,1,2.213853
2,hhs2,2.121429,1,2.642579
3,hhs3,2.025137,1,2.530678
4,hhs4,1.786016,1,2.092064
5,hhs5,1.681135,1,1.951591
6,hhs6,2.326385,1,2.680873
7,hhs7,2.068342,1,2.395886
8,hhs8,1.803134,1,2.393586
9,hhs9,2.397755,1,2.994532
