# Upper limit

This notebook explores the best achievable performance for a weight-based ensemble if it is allowed to peek into the future at each time step.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
np.random.seed(1234)

import sys
sys.path.append("../src")
from utils import data as udata
from utils import dists as udists
from utils import misc as u
from truth import mask_truths, impute, FILL_SPACE
from predictors import make_predictor
import losses
from pymmwr import Epiweek
from tqdm import tqdm, trange
import ledge.merge as merge
import ledge.update as update
import ledge.fill as fill
from functools import partial
import os.path as path

In [2]:
# Directory the components and the actual data are read from
EXP_DIR = "../data/processed/cdc-flusight-ensemble/"
# Output directory for this experiment (not referenced by the cells shown here)
OUTPUT_DIR = "../models/cdc-flusight-ensemble/"

# Target passed to every component / truth lookup
TARGET = "4-ahead"
# Truth is requested for every lag in 0..MAX_LAG (inclusive)
MAX_LAG = 29

# National level plus the ten HHS regions
REGIONS = ["nat"] + [f"hhs{n}" for n in range(1, 11)]
# Seasons 2010 through 2016
SEASONS = [*range(2010, 2017)]

# Loss used to score a component prediction against the truth
LOSS_FN = losses.ploss

In [3]:
# One Component per model available under EXP_DIR
components = [
    udata.Component(EXP_DIR, model_name)
    for model_name in udata.available_models(EXP_DIR)
]
# Accessor for the (lagged) truth values under EXP_DIR
ad = udata.ActualData(EXP_DIR)

In [4]:
def evaluate(region, season):
    """
    Compute the look-ahead ("oracle") limit for one region and season.

    For every epiweek of the first truth, the smallest LOSS_FN value over
    all components is taken; these per-week minima are then averaged.
    This is done once against the truth from `merge.earliest` and once
    against the truth from `merge.latest` of the lagged truths.

    Parameters
    ----------
    region : region identifier forwarded to the data accessors
    season : season identifier forwarded to the data accessors

    Returns
    -------
    dict with keys "first_loss" and "final_loss" holding the mean of the
    per-week minimum losses against the first and final truth.
    """

    def _min_component_loss(truth, week_slice):
        # Smallest loss any single component attains on this slice
        return min([
            LOSS_FN(pred.loc[week_slice], truth.loc[week_slice]).values[0]
            for pred in component_preds
        ])

    # Truth at every lag 0..MAX_LAG, then the prediction of each component
    lagged_truths = [
        ad.get(TARGET, region, season, lag=lag) for lag in range(MAX_LAG + 1)
    ]
    component_preds = [
        component.get(TARGET, region, season) for component in components
    ]

    first_truth = merge.earliest(lagged_truths)
    final_truth = merge.latest(lagged_truths)

    first_losses, final_losses = [], []

    for week in first_truth.epiweek:
        # Label slice starting at this week; only the first selected value is used
        week_slice = slice(week, (week + 1))
        first_losses.append(_min_component_loss(first_truth, week_slice))
        final_losses.append(_min_component_loss(final_truth, week_slice))

    return {
        "first_loss": np.mean(first_losses),
        "final_loss": np.mean(final_losses)
    }

In [5]:
# Per-season lists of limits; each list gets one entry per region, in REGIONS order
first_limits = {season: list() for season in SEASONS}
final_limits = {season: list() for season in SEASONS}

for region in tqdm(REGIONS):
    for season in SEASONS:
        limits = evaluate(region, season)
        first_limits[season].append(limits["first_loss"])
        final_limits[season].append(limits["final_loss"])

# Tabulate: rows are regions, columns are seasons
first_limits = pd.DataFrame(first_limits, index=REGIONS)
final_limits = pd.DataFrame(final_limits, index=REGIONS)

100%|██████████| 11/11 [16:37<00:00, 90.65s/it]


In [7]:
# Rows are regions, columns are seasons; values are the "first_loss" returned by evaluate
first_limits

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016
nat,0.749975,0.75828,0.803748,0.793652,0.766138,0.806086,0.797714
hhs1,0.803038,0.756632,0.818286,0.766821,0.829103,0.805997,0.815081
hhs2,0.890311,0.798394,0.882833,0.923522,0.882587,0.897622,0.930528
hhs3,0.85497,0.832902,0.893658,0.865091,0.824799,0.872445,0.846141
hhs4,0.836839,0.813969,0.871149,0.819082,0.811999,0.866695,0.855813
hhs5,0.783027,0.843969,0.839232,0.753315,0.882879,0.849035,0.806548
hhs6,0.912526,0.91337,0.917886,0.878438,0.917597,0.878042,0.907438
hhs7,0.853877,0.877813,0.879289,0.842137,0.897785,0.844012,0.861027
hhs8,0.812601,0.86061,0.859842,0.82671,0.844122,0.751133,0.838429
hhs9,0.908005,0.90917,0.921731,0.913309,0.878797,0.879665,0.882705


In [8]:
# Rows are regions, columns are seasons; values are the "final_loss" returned by evaluate
final_limits

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016
nat,0.752436,0.786466,0.85946,0.757843,0.762317,0.820441,0.813682
hhs1,0.789193,0.751731,0.808583,0.809139,0.837802,0.803557,0.806256
hhs2,0.898689,0.823262,0.886129,0.913629,0.884849,0.88585,0.904216
hhs3,0.856271,0.825243,0.890147,0.724439,0.865898,0.879739,0.891068
hhs4,0.840791,0.834148,0.883261,0.812626,0.829444,0.861615,0.858437
hhs5,0.809216,0.821301,0.847365,0.781961,0.83864,0.814445,0.784706
hhs6,0.870326,0.904911,0.920036,0.883548,0.896201,0.903204,0.907967
hhs7,0.835916,0.885548,0.858858,0.813109,0.875682,0.869211,0.861164
hhs8,0.81339,0.857594,0.852276,0.824274,0.831763,0.769347,0.838799
hhs9,0.891324,0.908698,0.917522,0.866936,0.835691,0.900442,0.843935
