In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Print the current date/time via the shell `date` command.
!date
# Long listing of the competition input directory: sorted by modification time,
# reversed (-t -r), with human-readable sizes (-h).
!ls -ltrh ../input/covid19-global-forecasting-week-5/

In [None]:
# Both competition tables are read from the same input directory; missing
# entries are replaced by '' so later cells can filter with `== ''` / `!= ''`.
_input_dir = '/kaggle/input/covid19-global-forecasting-week-5'
test = pd.read_csv(f'{_input_dir}/test.csv').fillna('')
train = pd.read_csv(f'{_input_dir}/train.csv').fillna('')

# Weight

In [None]:
# Distinct (Population, Weight) pairs among the ConfirmedCases rows, ordered by
# increasing population.
is_confirmed = train.Target == 'ConfirmedCases'
plot_data = (
    train.loc[is_confirmed, ['Population', 'Weight']]
    .drop_duplicates()
    .sort_values('Population')
)
population = plot_data.Population.values

# Three side-by-side line plots: the raw population, its log1p, and the
# reciprocal of that log.
curves = [
    (131, 'Population', population),
    (132, 'log1p(Population)', np.log1p(population)),
    (133, '1/log1p(Population)', 1/np.log1p(population)),
]

plt.figure(figsize=(9,3))
for position, title, values in curves:
    plt.subplot(position)
    plt.title(title)
    plt.plot(values)

plt.tight_layout()

In [None]:
# Distinct (Population, Weight) pairs among the ConfirmedCases rows, ordered by
# increasing population.
plot_data = (
    train[train.Target == 'ConfirmedCases'][['Population', 'Weight']]
    .drop_duplicates()
    .sort_values('Population')
)
log_population = np.log1p(plot_data.Population.values)

# One histogram per transformation, keyed by the panel title.
panels = {
    'Population': plot_data.Population.values,
    'log1p(Population)': log_population,
    '1/log1p(Population)': 1/log_population,
}

fig, hist_axes = plt.subplots(1, 3, figsize=(9,3))
for ax, (title, values) in zip(hist_axes, panels.items()):
    ax.set_title(title)
    ax.hist(values)

plt.tight_layout();

In [None]:
train[train.Target == 'ConfirmedCases'].Weight.min(), train[train.Target == 'ConfirmedCases'].Weight.max()

In [None]:
train['W'] = 1 / np.log1p(train.Population)

In [None]:
(train['W'] == train['Weight'].values).mean()

In [None]:
train[train['W'] != train['Weight']].head()

In [None]:
(train['W'] - train['Weight']).max()

In [None]:
# Country-level rows: ConfirmedCases entries with neither a province nor a county.
country_rows = (
    (train.Target == 'ConfirmedCases')
    & (train.Province_State == '')
    & (train.County == '')
)
plot_data = train.loc[country_rows, ['Country_Region','Population','Weight']].drop_duplicates()

# One horizontal bar chart per metric, each sorted by the metric it displays.
_, axes = plt.subplots(1,2, figsize=(15,60))
for ax, column in zip(axes, ('Population', 'Weight')):
    plot_data.sort_values(column).plot(kind='barh', y=column, x='Country_Region', ax=ax)
plt.tight_layout()

In [None]:
country_data = plot_data.set_index('Country_Region').copy()
country_data.head()

In [None]:
plot_data = train[(train.Target == 'ConfirmedCases')
      & (train.Province_State != '')
      & (train.County == '')][['Country_Region','Province_State','Population','Weight']].drop_duplicates()

plot_data = plot_data.groupby('Country_Region')[['Population', 'Weight']].sum()
plot_data

In [None]:
country_data = plot_data[['Population']].join(country_data, lsuffix='_left', rsuffix='_right')

In [None]:
country_data['diff'] = country_data.Population_left - country_data.Population_right

In [None]:
country_data

In [None]:
# Horizontal bars of the per-country totals held in `plot_data`, one panel per
# metric, each sorted by the metric it displays.
_, axes = plt.subplots(1,2, figsize=(15,6))
for ax, column in zip(axes, ('Population', 'Weight')):
    plot_data.sort_values(column).plot(kind='barh', y=column, ax=ax)
plt.tight_layout()

In [None]:
plot_data = train[(train.Target == 'ConfirmedCases')
      & (train.Province_State != '')
      & (train.County != '')][['Country_Region','Province_State','County','Population','Weight']]\
    .drop_duplicates()
plot_data.head()

In [None]:
plot_data.groupby('Country_Region')[['Population', 'Weight']].sum()

In [None]:
plot_data.shape[0] * plot_data.Weight.mean()

# Loss

$$ L_\tau(y,\hat{y}) = \begin{cases} 
      (y-\hat{y})\tau       & if\: y \ge \hat{y} \\
      (\hat{y}-y)(1 - \tau) & if\: y \lt \hat{y}
   \end{cases}
$$

In [None]:
def L(tau, y_true, y_pred):
    """Pinball (quantile) loss, evaluated element-wise.

    Returns ``(y_true - y_pred) * tau`` where ``y_true >= y_pred`` and
    ``(y_pred - y_true) * (1 - tau)`` otherwise.

    Parameters
    ----------
    tau : float
        Quantile level.
    y_true, y_pred : scalar or array-like
        Observed and predicted values; they are broadcast against each other.

    Returns
    -------
    Loss with the broadcast shape of the inputs (a NumPy scalar for scalar
    inputs).
    """
    residual = np.subtract(y_true, y_pred)
    # The two branches of the definition are tau * r (for r >= 0) and
    # (tau - 1) * r (for r < 0); their difference is exactly r, so the larger
    # one is always the applicable branch. This avoids `~` on a boolean mask,
    # which is an integer bitwise NOT for plain Python bools (~False == -1)
    # and produced wrong, even negative, losses for scalar inputs.
    return np.maximum(tau * residual, (tau - 1) * residual)

In [None]:
def plot_example(tau, y, p):
    """Show targets vs. predictions, their residuals, and the pinball loss.

    Draws one figure with three panels: the two series, a stem plot of
    ``y - p``, and ``L(tau, y, p)``.

    Parameters
    ----------
    tau : float
        Quantile level forwarded to ``L``.
    y : array-like
        True values.
    p : array-like
        Predicted values; must support ``y - p``.
    """
    plt.figure(figsize=(15,3))

    plt.subplot(131)
    plt.plot(y, label='y_true')
    plt.plot(p, label='y_pred')
    plt.legend()

    plt.subplot(132)
    # `use_line_collection` is no longer accepted by current matplotlib
    # (passing it raises TypeError), so rely on the default behaviour.
    plt.stem(y - p)
    plt.ylabel('y_true - y_pred')
    # The horizontal axis is the position in the series, not the target value.
    plt.xlabel('sample index')

    plt.subplot(133)
    plt.plot(L(tau, y, p));
    plt.ylabel(f'L({tau}, y_true, y_pred)')
    plt.xlabel('sample index')

    plt.show()

In [None]:
# An all-zero target against 21 evenly spaced predictions in [-1, 1],
# evaluated at tau = 0.95.
n = 21
flat_target = np.zeros(n)
prediction_sweep = np.linspace(-1, 1, n)
plot_example(tau=.95, y=flat_target, p=prediction_sweep)


In [None]:
from ipywidgets import interactive


def _loss_for_tau(t):
    """Redraw the zero-target example for the quantile level `t` chosen in the widget."""
    plot_example(tau=t, y=np.zeros(n), p=np.linspace(-1, 1, n))


# `t` is offered over the range 0.1 .. 0.9.
interactive(_loss_for_tau, t=(0.1, .9))

# Intuition about the Pinball Loss

The pinball loss function is named after its shape, which looks like the trajectory of a ball on a pinball table.

When you forecast a time series, you can estimate a lower and an upper bound such that the real (true, future) value is expected to fall inside the interval with a certain probability.

When you predict the quantile with `tau=0.05`, you are saying that this value is the lower bound of your model; with `tau=0.95`, the upper bound. With these two bounds you are saying that your model predicts that the true value has a 90% chance of being inside the bounds.

The pinball loss penalizes true values that are outside these bounds more than values that are inside.

So it is better to make this band thin while keeping the true values inside it.

# Example

In [None]:
# Country-level US series of the ConfirmedCases target (no province, no county).
us_country_rows = (
    (train.Target == 'ConfirmedCases')
    & (train.Country_Region == 'US')
    & (train.Province_State == '')
    & (train.County == '')
)
# Take an explicit copy: `X` gets a column reassigned in a later cell, and
# assigning into a frame obtained by chained indexing raises
# SettingWithCopyWarning.
X = train.loc[us_country_rows, ['Date', 'TargetValue']].copy()

In [None]:
X['Date'] = pd.to_datetime(X['Date'])

In [None]:
X.set_index('Date', inplace=True)

In [None]:
X.index.freq = 'D'

In [None]:
X.plot();

In [None]:
import statsmodels.tsa.ar_model

In [None]:
%psource model_result.plot_predict

In [None]:
from statsmodels.tsa.ar_model import ar_select_order, arma2ma
from scipy.stats import norm

In [None]:
# Hold out the last 7 rows for evaluation; everything before them is training data.
holdout_days = 7
X_train = X.iloc[:len(X) - holdout_days]
X_test = X.iloc[-holdout_days:]

In [None]:
# Choose the autoregressive order (second argument 13, seasonal terms enabled)
# on the training window, then fit the selected model.
max_lag = 13
selection = ar_select_order(X_train, max_lag, seasonal=True)
model_result = selection.model.fit()

In [None]:
# Predictions over the whole training window plus the 7 held-out days, with
# the held-out observations overlaid as crosses.
n_train = X_train.shape[0]
fig = plt.figure(figsize=(9,3))
# The last requested step is n_train + 7 - 1, the same `end` the following
# cells use for the 7-day hold-out; n_train + 7 asked for one extra forecast
# step that has no matching test observation.
fig = model_result.plot_predict(start=0, end=n_train + 7 - 1, alpha=.1, fig=fig)
plt.plot(X_test, marker='x', linestyle='none')

In [None]:
# Zoom on the last 7 training days and the 7 held-out days, with the held-out
# observations overlaid as crosses.
n_train = X_train.shape[0]
window_start = n_train - 7
window_end = n_train + 7 - 1
fig = plt.figure(figsize=(15,3))
fig = model_result.plot_predict(start=window_start, end=window_end, alpha=.1, fig=fig)
plt.plot(X_test, marker='x', linestyle='none')

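The cell below reproduces the internals of `plot_predict` to obtain the out-of-sample predictions for the test week together with their lower and upper bounds, so that they can be evaluated with the pinball loss.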

In [None]:
# Re-create, outside of `plot_predict`, the out-of-sample point forecasts and
# their (1 - alpha) interval so they can be scored with the pinball loss.
start       = X_train.shape[0] - 7      # 7 steps before the end of the training window
end         = X_train.shape[0] + 7 - 1  # 7th step after the training window
dynamic     = False
exog        = None
exog_oos    = None
alpha       = .1                        # interval covers 1 - alpha = 90%
in_sample   = True
predictions = model_result.predict(
    start=start,
    end=end,
    dynamic=dynamic,
    exog=exog,
    exog_oos=exog_oos
)
start = 0 if start is None else start
end = model_result.model._index[-1] if end is None else end
# The third returned value is used below as the number of out-of-sample steps.
_, _, oos, _ = model_result.model._get_prediction_index(start, end)

if not oos:
    # Previously the message lacked a space ("...there are noout-of-sample...")
    # and claimed `in_sample is False` although it is set to True above.
    raise ValueError('There are no out-of-sample forecasts to evaluate.')

# Keep only the out-of-sample tail of the predictions.
if isinstance(predictions, pd.Series):
    predictions = predictions.iloc[-oos:]
else:
    predictions = predictions[-oos:]

if alpha is not None:
    pred_oos = np.asarray(predictions)[-oos:]
    # Moving-average weights of the fitted model for the `oos` forecast steps
    # (presumably `_lag_repr` returns the AR lag polynomial -- private API).
    ar_params = model_result._lag_repr()
    ma = arma2ma(ar_params, [1], lags=oos)
    # The h-step forecast error variance is sigma2 * sum(ma[:h] ** 2), so the
    # standard error is the square root of the whole product. The earlier
    # expression left the cumulative sum outside the square root.
    fc_error = np.sqrt(model_result.sigma2 * np.cumsum(ma ** 2))
    quantile = norm.ppf(alpha / 2)
    lower = pred_oos + fc_error * quantile
    upper = pred_oos + fc_error * -quantile

In [None]:
# Panel 1: held-out observations, point forecasts and the shaded band between
# `lower` and `upper`.
horizon = np.arange(pred_oos.shape[0])
plt.figure(figsize=(12,3))
plt.plot(X_test.values, marker='o', color='r', alpha=.5, linestyle='none', label='y_true')
plt.plot(pred_oos, marker='x', linestyle='none', label='y_pred')
plt.fill_between(horizon, lower, upper, color='gray', alpha=.5, label='90% confidence interval')
plt.legend()
plt.show()

# Pinball loss of each quantile forecast: 0.05 -> lower bound,
# 0.50 -> point forecast, 0.95 -> upper bound.
observed = X_test.TargetValue.values
quantile_losses = [
    (L(0.05, observed, lower), '$L_{0.05}$(y_true, lower_bound)'),
    (L(0.50, observed, pred_oos), '$L_{0.50}$(y_true, predictions)'),
    (L(0.95, observed, upper), '$L_{0.95}$(y_true, upper_bound)'),
]

# Panel 2: the three losses per forecast step.
plt.figure(figsize=(12,3))
for loss, label in quantile_losses:
    plt.plot(loss, label=label)
plt.legend()
plt.show()

# Panel 3: the mean of the three losses at each forecast step.
plt.figure(figsize=(12,3))
plt.plot(np.mean(np.stack([loss for loss, _ in quantile_losses]), axis=0))
plt.show();

Let's change the lower bound on the first out-of-sample forecast day so that the true value gets outside the band (below the lower bound).

In [None]:
 X_test.TargetValue.values[0], lower[0]

In [None]:
lower[0] = X_test.values[0] + 100

In [None]:
 X_test.TargetValue.values[0], lower[0]

In [None]:
# Panel 1: held-out observations, point forecasts and the shaded band between
# `lower` and `upper`.
horizon = np.arange(pred_oos.shape[0])
plt.figure(figsize=(12,3))
plt.plot(X_test.values, marker='o', color='r', alpha=.5, linestyle='none', label='y_true')
plt.plot(pred_oos, marker='x', linestyle='none', label='y_pred')
plt.fill_between(horizon, lower, upper, color='gray', alpha=.5, label='90% confidence interval')
plt.legend()
plt.show()

# Pinball loss of each quantile forecast: 0.05 -> lower bound,
# 0.50 -> point forecast, 0.95 -> upper bound.
observed = X_test.TargetValue.values
quantile_losses = [
    (L(0.05, observed, lower), '$L_{0.05}$(y_true, lower_bound)'),
    (L(0.50, observed, pred_oos), '$L_{0.50}$(y_true, predictions)'),
    (L(0.95, observed, upper), '$L_{0.95}$(y_true, upper_bound)'),
]

# Panel 2: the three losses per forecast step.
plt.figure(figsize=(12,3))
for loss, label in quantile_losses:
    plt.plot(loss, label=label)
plt.legend()
plt.show()

# Panel 3: the mean of the three losses at each forecast step.
plt.figure(figsize=(12,3))
plt.plot(np.mean(np.stack([loss for loss, _ in quantile_losses]), axis=0))
plt.show();

One way of putting the true values outside the confidence interval is using the median predictions as quantile predictions ;)

In [None]:
# Use the point forecast for both bounds, i.e. a band of zero width.
lower = upper = pred_oos

In [None]:
# Panel 1: held-out observations, point forecasts and the shaded band between
# `lower` and `upper`.
horizon = np.arange(pred_oos.shape[0])
plt.figure(figsize=(12,3))
plt.plot(X_test.values, marker='o', color='r', alpha=.5, linestyle='none', label='y_true')
plt.plot(pred_oos, marker='x', linestyle='none', label='y_pred')
plt.fill_between(horizon, lower, upper, color='gray', alpha=.5, label='90% confidence interval')
plt.legend()
plt.show()

# Pinball loss of each quantile forecast: 0.05 -> lower bound,
# 0.50 -> point forecast, 0.95 -> upper bound.
observed = X_test.TargetValue.values
quantile_losses = [
    (L(0.05, observed, lower), '$L_{0.05}$(y_true, lower_bound)'),
    (L(0.50, observed, pred_oos), '$L_{0.50}$(y_true, predictions)'),
    (L(0.95, observed, upper), '$L_{0.95}$(y_true, upper_bound)'),
]

# Panel 2: the three losses per forecast step.
plt.figure(figsize=(12,3))
for loss, label in quantile_losses:
    plt.plot(loss, label=label)
plt.legend()
plt.show()

# Panel 3: the mean of the three losses at each forecast step.
plt.figure(figsize=(12,3))
plt.plot(np.mean(np.stack([loss for loss, _ in quantile_losses]), axis=0))
plt.show();

The objective is to make the confidence band narrow (having small width) but also keep the true values inside. Look above at the data points at day 1 and 4.