# Residual fitting

This notebook trains a neural network to fit residuals left after a component model is trained. The ensemble model here takes a prediction distribution from a single component model and tries to fit the error using the actual data and the peak of the distribution.

NOTE: This is an old notebook and will not work with the current structure of the repo.

In [149]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append("../src")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [150]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import utils.data as udata
import utils.dists as udists
import utils.misc as u
import os
import losses
import pymmwr

from functools import partial
from jrun import jin
from keras.layers import (Activation, Dense, Dropout, Embedding, Flatten, Merge, Reshape)
from keras.models import Sequential
from keras.callbacks import EarlyStopping
import keras.backend as K

## Setup notebook parameters

In [151]:
# Notebook parameters, each read through jrun's `jin(name, default)` helper.
# NOTE(review): presumably `jin` lets an external runner override these defaults — confirm.
EXP_NAME = jin("exp", "residual/seasons-14-to-5")  # sub-directory of ../results where outputs are saved
COMPONENT = jin("component", "sarima")  # name of the component model whose data is loaded
WEEK_NUMBER = jin("week", 4)  # week-ahead target passed to the training-data loader
TEST_SPLIT_THRESH = jin("splitweek", 201143)  # epiweek (YYYYWW); rows strictly before it form the training set

## Load data

In [152]:
class Component:
    """
    Helper class for working with components

    Bundles a component's name with its data loader and a slot for the
    component's loaded data.
    """
    
    def __init__(self, name):
        """
        Create the helper for the component called `name`.

        A `udata.ComponentDataLoader` rooted at "../data" is built for it.
        """
        # Name of the component model (e.g. "sarima")
        self.name = name
        # Loader for this component's data under ../data
        self.loader = udata.ComponentDataLoader("../data", name)
        # Filled in by the notebook once the training data is loaded; declared
        # here so the attribute always exists instead of appearing only after
        # an external assignment.
        self.data = None

In [153]:
# Helper for the selected component model, and the loader for the actual data
component = Component(COMPONENT)
actual_dl = udata.ActualDataLoader("../data")

## Working on week ahead predictions

In [154]:
REGION = None # Specify None for using all the data

# Load week-ahead training data for the single component loader.
# NOTE(review): the exact contents of y, Xs and yi are defined by
# udata.get_week_ahead_training_data — confirm there. Later cells use y as the
# target, read yi[:, 0] as an epiweek and compare yi[:, 1] against region names.
y, Xs, yi = udata.get_week_ahead_training_data(
    WEEK_NUMBER, REGION,
    actual_dl, [component.loader]
)

# Presumably Xs has one entry per loader passed above; keep this component's entry
component.data = Xs[0]

### Getting error using the distribution max

In [155]:
# Reduce the component's data to one scalar feature per row and keep it as a 1-D array.
# NOTE(review): the reducer used is udists.dist_median, while the notebook prose
# describes the distribution peak/max — confirm which point estimate is intended.
component_out = udists.get_merged_features(
    Xs,
    [udists.dist_median]
)[:, 0]

# Residual the network is trained to predict: target minus the component's point estimate
component_error = y - component_out

## Network inputs and outputs

The network takes the component's point estimate (`component_out`) and a cyclical encoding of the epiweek as inputs, and is fit on the residual `component_error`.

In [156]:
def encode_epiweek(epiweek: int):
    """
    Encode an epiweek of the form YYYYWW as a point on the unit circle.

    The week is mapped to an angle proportional to its position within its
    MMWR year, and the result is returned as ``np.array([sin, cos])``.
    """

    year, week = divmod(epiweek, 100)

    # Scale by the number of MMWR weeks in that particular year
    angle = 2 * np.pi * week / pymmwr.mmwr_weeks_in_year(year)

    return np.array([np.sin(angle), np.cos(angle)])

# Encode the epiweek held in the first column of yi, one [sin, cos] pair per row
weeks = np.array([encode_epiweek(ew) for ew in yi[:, 0]])

### Split based on year
We take items before a certain epiweek (`TEST_SPLIT_THRESH`) as the training set and the rest as the test set.

In [157]:
# Boolean mask: True for rows whose epiweek is strictly before the split threshold
train_indices = yi[:, 0] < TEST_SPLIT_THRESH

## Model

In [158]:
def res_mod(n_input):
    """
    Build the (uncompiled) network used to fit the component's error.

    The input has `n_input` features. It passes through Dense layers of
    10, 5 and 5 units, each followed by a ReLU activation, and ends in a
    single-unit Dense layer with no activation that outputs the error.
    """

    net = Sequential()

    # First hidden layer also declares the input shape
    net.add(Dense(10, input_shape=(n_input,)))
    net.add(Activation("relu"))

    # Two further hidden layers of identical width
    for _ in range(2):
        net.add(Dense(5))
        net.add(Activation("relu"))

    # Return the error
    net.add(Dense(1))

    return net

### Training

In [None]:
# model generator
def gen_model(n_input=3):
    """
    Return a fresh, untrained residual model.

    `n_input` is the number of input features. The default of 3 matches the
    inputs built in this notebook: one component point estimate plus the
    two-value (sin, cos) week encoding. Calling with no arguments behaves
    exactly as before.
    """
    return res_mod(n_input)

def train_model(model, train_data, val_data, batch_size=64, epochs=100):
    """
    Compile `model` (rmsprop optimizer, MSE loss) and fit it.

    `train_data` is indexed as (inputs, targets). `val_data` is passed to
    `fit` as validation data; when it is not None, training stops early once
    `val_loss` has not improved for 4 epochs. With `val_data=None` no
    callbacks are used and training runs for the full `epochs`.

    Returns the history object produced by `model.fit`.
    """
    model.compile(optimizer="rmsprop", loss="mse")

    # Early stopping only makes sense when there is a validation loss to watch
    stoppers = (
        [EarlyStopping(monitor="val_loss", patience=4, mode="auto")]
        if val_data is not None
        else []
    )

    return model.fit(
        train_data[0],
        train_data[1],
        batch_size=batch_size,
        epochs=epochs,
        verbose=0,
        callbacks=stoppers,
        validation_data=val_data,
    )

In [None]:
# Training inputs: the component's point estimate as the first column,
# followed by the two week-encoding columns
X_train = np.column_stack([component_out[train_indices], weeks[train_indices]])
y_train = component_error[train_indices]
yi_train = yi[train_indices]

# Cross-validate the model generator / trainer pair on the training rows only
cv_metadata = u.cv_train_loso(gen_model, train_model, X_train, y_train, yi_train)
u.cv_plot(cv_metadata)

cv_report = u.cv_report(cv_metadata)
cv_report

  0%|          | 0/15 [00:00<?, ?it/s]

## Final model training

In [None]:
# Train a fresh model on all training rows; no validation data is passed,
# so train_model uses no early stopping and runs for the full epoch budget
model = gen_model()
# NOTE(review): the epoch budget is cv_report["epochs"][-1] — confirm in
# u.cv_report what that last entry represents
final_epochs = int(cv_report["epochs"][-1])
final_history = train_model(model, (X_train, y_train), None, epochs=final_epochs)
# Training loss recorded for the last epoch
final_loss = final_history.history["loss"][-1]
plt.plot(final_history.history["loss"])
final_loss

## Evaluation

In [None]:
# Regions to score separately; the trailing None stands for all held-out rows
regions = ["nat"] + [f"hhs{i}" for i in range(1, 11)] + [None]

mdls = [component.name, "res-fit"]

# One list of losses per model, appended in the same order as `regions`
loss_by_model = {mdl: [] for mdl in mdls}

# Rows not used for training
held_out = ~train_indices

for region in regions:
    eval_indices = held_out if region is None else held_out & (yi[:, 1] == region)

    component_dist = component.data[eval_indices]
    model_in = np.column_stack([component_out[eval_indices], weeks[eval_indices]])

    # Shift the component's distributions by the residual the network predicts
    predicted_error = model.predict(model_in)[:, 0]
    rf_dist = udists.shift_dists(component_dist, predicted_error)

    dists = [component_dist, rf_dist]
    y_one_hot = udists.actual_to_one_hot(y[eval_indices])

    for name, output in zip(mdls, dists):
        loss_by_model[name].append(losses.mean_cat_cross(y_one_hot, output))

eval_df = pd.DataFrame(loss_by_model)
eval_df.index = regions[:-1] + ["all"]
eval_df = eval_df[mdls]

# Save results
output_dir = u.ensure_dir(f"../results/{EXP_NAME}")
final_stats = {
    "loss": final_loss,
    "epochs": final_epochs
}
u.save_exp_summary(model, cv_report, final_stats, f"{output_dir}/{WEEK_NUMBER}_summary.txt")
eval_df.to_csv(f"{output_dir}/{WEEK_NUMBER}_eval.csv")
eval_df