The Worldometer (WM) data is based on our public kernel: https://www.kaggle.com/philippsinger/covid-w5-worldometer-scraper

In [None]:
import os
import datetime
import numpy as np
import pandas as pd
import time
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

# Let pandas show up to 100 rows when a frame is displayed.
pd.set_option('display.max_rows', 100)

# Competition files: training rows, rows to forecast, and the submission template.
path = '../input/covid19-global-forecasting-week-5/'
train, test, sub = (
    pd.read_csv(path + filename)
    for filename in ('train.csv', 'test.csv', 'submission.csv')
)

In [None]:
# Additional rows that are appended to `train` further down.
train_extra = pd.read_csv("../input/wmscraperfinal2/train_extra.csv")
# Count of missing targets; the value is neither stored nor the cell's last expression.
train_extra.TargetValue.isna().sum()
# Missing geography levels get the literal 'N/A' placeholder.
for geo_col in ('Province_State', 'County'):
    train_extra.loc[train_extra[geo_col].isnull(), geo_col] = 'N/A'

In [None]:
# Second extra-data file; its TargetValue replaces the one in train_extra after the merge.
train_extra2 = pd.read_csv("../input/wmscraperfinal3/extra_data_11_5_2020_v2.csv")
# Same 'N/A' placeholder as used for the other frames so the merge keys line up.
for geo_col in ('Province_State', 'County'):
    train_extra2.loc[train_extra2[geo_col].isnull(), geo_col] = 'N/A'

In [None]:
# Rows whose country never appears in train_extra2 lose their target value.
countries_in_extra2 = train_extra2["Country_Region"]
uncovered_rows = ~train_extra["Country_Region"].isin(countries_in_extra2)
train_extra.loc[uncovered_rows, "TargetValue"] = np.nan

In [None]:
# Display the number of rows in train_extra at this point (before the merge with train_extra2).
len(train_extra)

In [None]:
# Display the first rows of train_extra for a visual check.
train_extra.head()

In [None]:
# Display the number of rows in train_extra2.
len(train_extra2)

In [None]:
# Left-join train_extra2's target onto train_extra by geography and target type.
# Both frames carry a TargetValue column, so pandas suffixes them _x (left) and _y (right).
merge_keys = ["County", "Province_State", "Country_Region", "Target"]
train_extra = train_extra.merge(
    train_extra2[merge_keys + ["TargetValue"]],
    on=merge_keys,
    how="left",
)

In [None]:
# Display rows where train_extra2 supplies a value that differs from the original one.
values_differ = train_extra.TargetValue_x != train_extra.TargetValue_y
has_replacement = ~train_extra.TargetValue_y.isna()
train_extra[values_differ & has_replacement]

In [None]:
# The value from train_extra2 (suffix _y) becomes the target; both suffixed columns are removed.
train_extra["TargetValue"] = train_extra["TargetValue_y"]
for suffixed_col in ("TargetValue_x", "TargetValue_y"):
    del train_extra[suffixed_col]

In [None]:
# Display how many rows of train_extra still have no target value after the merge.
train_extra.TargetValue.isna().sum()

In [None]:
# Stack the extra rows underneath the competition training rows.
train = pd.concat((train, train_extra), axis=0)

# Display the distinct per-date counts of non-null Ids.
ids_per_date = train.groupby("Date")["Id"].count()
ids_per_date.unique()

In [None]:
# Order rows by geography, then date, then target, and renumber the index.
sort_keys = ["Country_Region", "Province_State", "County", "Date", "Target"]
train = train.sort_values(by=sort_keys).reset_index(drop=True)

In [None]:
# Parse the ISO date strings in one vectorised call instead of a per-row strptime.
train['Date'] = pd.to_datetime(train['Date'], format='%Y-%m-%d')
test['Date'] = pd.to_datetime(test['Date'], format='%Y-%m-%d')

# Integer day index counted from the first training date. The test frame uses the
# same origin so that 'days' means the same thing in both frames. Subtracting the
# datetime64 columns directly avoids arithmetic on object-dtype date values.
first_train_date = train['Date'].min()
train['days'] = (train['Date'] - first_train_date).dt.days
test['days'] = (test['Date'] - first_train_date).dt.days

for frame in (train, test):
    # Missing geography levels get the literal 'N/A' placeholder ...
    for geo_col in ('Province_State', 'County'):
        frame.loc[frame[geo_col].isnull(), geo_col] = 'N/A'
    # ... so that every row has a complete country_state_county key.
    frame['Area'] = frame['Country_Region'] + '_' + frame['Province_State'] + '_' + frame['County']

print(train['Date'].max())
print(test['Date'].min())
print(train['days'].max())

# Sorted unique area keys.
AREAS = np.sort(train['Area'].unique())
#VAL_LEN = 28
#TRAIN_N = train['days'].max() - VAL_LEN

# Day index that separates the two date ranges printed below.
TRAIN_N = 95 + 14
# Number of days from TRAIN_N to the last training day, inclusive.
# NOTE: VAL_LEN is reassigned in a later cell before the models are tuned.
VAL_LEN = train['days'].max() - TRAIN_N + 1
print(TRAIN_N)

print(train[train['days'] < TRAIN_N]['Date'].max())
print()
print(train[train['days'] >= TRAIN_N]['Date'].min())
print(train[train['days'] >= TRAIN_N]['Date'].max())

# Quantiles for which a separate model is tuned and submitted.
TAU_LIST = [0.05, 0.5, 0.95]

# Snapshot of the prepared test frame; the submission cell merges against it.
test_orig = test.copy()
train.head()


In [None]:
# Display the last rows for Mexico as a spot check of the combined training data.
train[train["Country_Region"]=="Mexico"].tail()

In [None]:
# Split the long table by target type once and reuse the two subsets.
cases_rows = train[train['Target'] == 'ConfirmedCases']
fatal_rows = train[train['Target'] == 'Fatalities']

# One row per Area (sorted), one column per day index, cell = TargetValue.
train_c_raw = cases_rows.pivot(index='Area', columns='days', values='TargetValue').sort_index().values
train_f_raw = fatal_rows.pivot(index='Area', columns='days', values='TargetValue').sort_index().values

# No clipping at this stage: the working matrices are the raw arrays themselves
# (same objects, not copies).
train_c = train_c_raw
train_f = train_f_raw

# Mean Weight per Area as a column vector, in the same sorted-Area row order.
weights_c = cases_rows.groupby('Area')['Weight'].mean().sort_index().values.reshape(-1, 1)
weights_f = fatal_rows.groupby('Area')['Weight'].mean().sort_index().values.reshape(-1, 1)


In [None]:
# Display the cases matrix before the NaN handling below.
train_c

In [None]:
# Rows of the cases matrix whose last column (most recent day) is NaN.
idx_nan = np.isnan(train_c[:,-1])
# Rotate those rows one position to the right: np.roll wraps around, so the trailing
# NaN moves to column 0 and every other value shifts one day later. The assignment is
# in place, so train_c_raw (the same array object) changes as well.
# A later "shift back" cell rolls the predictions by -1 for the rows in idx_nan.
train_c[idx_nan] = np.roll(train_c[idx_nan], 1, axis=1)


In [None]:
# Display the cases matrix after the rows with a trailing NaN were rolled.
train_c

In [None]:
# Rows of the fatalities matrix whose last column is NaN.
# NOTE(review): this rebinds idx_nan, replacing the mask computed from train_c; the
# later "shift back" cell applies this one mask to both the cases and the fatalities
# predictions — confirm both matrices have the same rows with a trailing NaN.
idx_nan = np.isnan(train_f[:,-1])
# Same in-place one-step rotation to the right as applied to the cases matrix.
train_f[idx_nan] = np.roll(train_f[idx_nan], 1, axis=1)

In [None]:
# Display the fatalities matrix after the rows with a trailing NaN were rolled.
train_f

In [None]:
# train_c = train_c[:,1:]
# train_f = train_f[:,1:]

In [None]:
# Replace any remaining NaN with 0; np.nan_to_num returns new arrays by default.
train_c, train_f = np.nan_to_num(train_c), np.nan_to_num(train_f)

In [None]:
# Additional names for the same two arrays (no copy is made).
X_c, X_f = train_c, train_f

In [None]:
def pinball_loss_single(ytrue, pred, weight, tau=0.5):
    """Return the weighted pinball (quantile) loss for one quantile level.

    Cells where ``ytrue >= pred`` contribute ``tau * weight * (ytrue - pred)``;
    the remaining cells contribute ``(1 - tau) * weight * (pred - ytrue)``.
    The total is divided by both dimensions of ``ytrue``, which therefore has
    to be 2-D. ``weight`` must broadcast against ``ytrue - pred``.
    """
    # 1 where the prediction is at or below the truth, 0 where it overshoots.
    at_or_below = (ytrue >= pred).astype(int)
    weighted_gap = weight * (ytrue - pred)

    under_part = np.sum(weighted_gap * at_or_below * tau)
    # weighted_gap is negative on overshoot cells, hence the subtraction below.
    over_part = np.sum(weighted_gap * (1 - at_or_below) * (1 - tau))

    n_rows, n_cols = ytrue.shape[0], ytrue.shape[1]
    return (under_part - over_part) / n_rows / n_cols

def pinball_loss_many(ytrue, preds, weight, tau=[0.05, 0.50, 0.95]):
    """Return the mean of the single-quantile pinball losses.

    ``preds[k]`` is scored against ``ytrue`` at quantile level ``tau[k]``.
    """
    per_quantile = [
        pinball_loss_single(ytrue, preds[k], weight, level)
        for k, level in enumerate(tau)
    ]
    return np.mean(per_quantile)


from hyperopt import hp, space_eval, fmin, tpe, Trials, rand
import multiprocessing as mp
from hyperopt.pyll.base import scope
from joblib import Parallel, delayed

class ZmodelBase:
    """Base class for forecasters whose parameters are tuned with hyperopt.

    A subclass fills ``self.space`` with a hyperopt search space and implements
    ``_predict``. ``opt`` then searches that space by scoring forecasts of the
    last ``valid_horizon`` columns of the data with ``loss_fun``.

    Args:
        loss_fun: callable ``loss_fun(actual, predicted)`` returning the value
            to minimise; both arguments are the held-out columns.
    """

    def __init__(self, loss_fun):
        self.loss_fun = loss_fun
        self.space = {}

    def _predict(self, params, X, horizon):
        """Return ``X`` extended by ``horizon`` forecast columns.

        Must be overridden; the base class has no model of its own.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement _predict(params, X, horizon)"
        )

    def _objective(self, params, X, horizon):
        """Score ``params`` by forecasting the last ``horizon`` columns of ``X``.

        The model only sees ``X[:, :-horizon]``; the last ``horizon`` columns of
        its output are compared with the held-out columns using ``loss_fun``.
        """
        preds = self._predict(params, X[:, :-horizon], horizon)
        return self.loss_fun(X[:, -horizon:], preds[:, -horizon:])

    def opt(self, X, valid_horizon=14, rstate=42, max_trials=30, overrides=None):
        """Search ``self.space`` for the parameters with the lowest loss.

        Args:
            X: 2-D array; its last ``valid_horizon`` columns are held out.
            valid_horizon: number of trailing columns used for scoring.
            rstate: seed for the random state handed to hyperopt.
            max_trials: number of hyperopt evaluations.
            overrides: optional mapping of search-space entries that replace
                (or extend) the defaults. The entries are written into
                ``self.space``, so they persist on this instance.

        Returns:
            ``self``, with ``best_params`` and ``best_loss`` set.
        """
        if overrides:
            self.space.update(overrides)

        trials = Trials()
        # NOTE(review): newer hyperopt releases may expect a numpy Generator
        # here instead of a RandomState — confirm against the installed version.
        rstate = np.random.RandomState(rstate)

        best = fmin(lambda p: self._objective(p, X, valid_horizon),
                    self.space,
                    algo=tpe.suggest,
                    max_evals=max_trials,
                    trials=trials,
                    rstate=rstate,
                    show_progressbar=False,
                    verbose=0)
        # fmin's result is translated back into actual parameter values.
        self.best_params = space_eval(self.space, best)
        self.best_loss = self._objective(self.best_params, X, valid_horizon)

        print("best loss", self.best_loss)

        return self

    def predict(self, X, test_horizon=50):
        """Forecast ``test_horizon`` columns after ``X`` using the tuned parameters.

        Requires a prior call to ``opt`` (which sets ``best_params``).
        """
        return self._predict(self.best_params, X, test_horizon)



In [None]:
from hyperopt import hp, space_eval, fmin, tpe, Trials, rand
import multiprocessing as mp
from hyperopt.pyll.base import scope
from joblib import Parallel, delayed
import time

class Zmodel1(ZmodelBase):
    """Decay-plus-growth forecaster with a hyperopt search space.

    Every row of the input is forecast independently: the forecast starts from
    a quantile of the row's most recent values and each following step is the
    previous step multiplied by a decay factor plus a growth term whose
    influence is scaled by ``growth factor ** step``.
    """

    def __init__(self, loss_fun):
        super().__init__(loss_fun)
        self.space = {
            "min cases for local growth": hp.choice("min cases for local growth", [0,10,100,500]),
            "N days for local growth": scope.int(hp.quniform('N days for local growth', 1, 25, 1)),
            "growth default": hp.quniform('growth default', -0.1, 0.1, 0.01),

            #"N days for start": scope.int(hp.quniform('N days for start', 1, 25, 1)),
            "N days for start": hp.choice('N days for start', [1,2,3,4,5,6,7,8,9,12,13,14,15,16,19,20,21,22,23]),
            #"start function": hp.choice("start function", [np.min, np.mean, np.max]),
            "quantile": hp.quniform('quantile', 0, 1, 0.1),

            "growth factor": hp.quniform('growth factor', 0.4, 1.0, 0.01),
            "growth scale factor": hp.quniform('growth scale factor', 0.5, 10.0, 0.1),

            "delta factor": hp.quniform('delta factor', 0.9, 0.99, 0.001),
            "growth multiplier": hp.quniform('growth multiplier', 0.0, 1.0, 0.01),
        }

    def _predict(self, params, X, horizon):
        """Return the clipped history followed by ``horizon`` forecast columns.

        Args:
            params: mapping with one value for every key of ``self.space``.
            X: 2-D array, one row per series and one column per day.
            horizon: number of columns to forecast.

        Returns:
            Array with ``X.shape[0]`` rows and ``X.shape[1] + horizon`` columns.
            The history part is ``X`` with negative values replaced by 0.
        """
        # Negative entries are floored at zero before anything is estimated.
        X = np.clip(X, 0, None)
        if horizon <= 0:
            # Nothing to forecast; hand back the (clipped) history only.
            return X.copy()

        threshold = params['min cases for local growth']
        num_days = params['N days for local growth']

        # A row gets its own growth estimate only if it exceeds the threshold
        # on more than num_days days.
        has_history = (X > threshold).sum(axis=1) > num_days

        # Per-row mean of the last num_days day-over-day changes. The column
        # slice has to be applied on axis 1: slicing the differenced matrix on
        # axis 0 would pick the last num_days *rows* and yield one global
        # average instead of a local one per series.
        recent_changes = np.diff(X[has_history], axis=1)[:, -num_days:]
        local_growth = recent_changes.mean(axis=1)
        local_growth += np.abs(local_growth) * params['growth multiplier']

        gr_base = np.zeros(X.shape[0])
        gr_base[~has_history] = params['growth default']
        gr_base[has_history] = local_growth

        # Starting level: a quantile over each row's most recent values.
        start_window = X[:, -params['N days for start']:]
        level = np.quantile(start_window, axis=1, q=params['quantile'])

        steps = []
        for step in range(horizon):
            # Decay the previous (non-negative) level and add the growth term,
            # whose weight is growth factor ** step.
            level = np.clip(level, 0, None) * params['delta factor'] + params['growth scale factor'] * gr_base * params['growth factor'] ** step
            steps.append(level)

        forecast = np.vstack(steps).T
        return np.hstack((X, forecast))
    
s = time.time()

# Number of trailing days held out for tuning and for the scores printed below.
VAL_LEN = 7


def _fit_quantile_model(data, weights, tau, override):
    """Tune one Zmodel1 on `data` for quantile `tau` and return the fitted model.

    Taking every input as an argument binds the quantile, the weights and the
    search-space override when the task is created rather than when it runs.
    """
    model = Zmodel1(loss_fun=lambda actual, predicted: pinball_loss_single(actual, predicted, weights, tau=tau))
    return model.opt(data, valid_horizon=VAL_LEN, max_trials=1000, overrides=override)


# One model per quantile in TAU_LIST; overrides[i] can replace search-space entries
# for the i-th quantile (all empty here).
overrides = [{},{},{}]
zmodels_c = Parallel(n_jobs=1)(
    delayed(_fit_quantile_model)(train_c, weights_c, tau, overrides[i])
    for i, tau in enumerate(TAU_LIST))

overrides = [{},{},{}]
zmodels_f = Parallel(n_jobs=1)(
    delayed(_fit_quantile_model)(train_f, weights_f, tau, overrides[i])
    for i, tau in enumerate(TAU_LIST))


# Forecast from the history without the held-out days ...
preds_c = [m.predict(train_c[:, :-VAL_LEN]) for m in zmodels_c]
preds_f = [m.predict(train_f[:, :-VAL_LEN]) for m in zmodels_f]

# ... and keep only the forecast columns that line up with the held-out days.
# Each target uses the width of its own matrix.
n_days_c = train_c.shape[1]
n_days_f = train_f.shape[1]
val_preds_c = [p[:, n_days_c - VAL_LEN:n_days_c] for p in preds_c]
val_preds_f = [p[:, n_days_f - VAL_LEN:n_days_f] for p in preds_f]


for i, tau in enumerate(TAU_LIST):
    print(pinball_loss_single(train_c[:, -VAL_LEN:], val_preds_c[i], weights_c, tau=tau))

loss_c = pinball_loss_many(train_c[:, -VAL_LEN:], val_preds_c, weights_c)
print()
print(loss_c)
print()


for i, tau in enumerate(TAU_LIST):
    print(pinball_loss_single(train_f[:, -VAL_LEN:], val_preds_f[i], weights_f, tau=tau))

loss_f = pinball_loss_many(train_f[:, -VAL_LEN:], val_preds_f, weights_f)
print()
print(loss_f)
print()

# Average of the two targets' mean pinball losses.
print((loss_c + loss_f) / 2)

#print()
#print(time.time() - s)

In [None]:
# Forecast from the full history with each tuned model
# (predict's default horizon is used).
preds_c = [model.predict(train_c) for model, _tau in zip(zmodels_c, TAU_LIST)]
preds_f = [model.predict(train_f) for model, _tau in zip(zmodels_f, TAU_LIST)]

In [None]:
# Shift back: rows that were rolled one step to the right before fitting are rolled
# one step to the left again, in place, in every prediction matrix.
# NOTE(review): idx_nan is the mask last computed from train_f, and it is applied to
# the cases predictions too — confirm both targets have the same rows with a
# trailing NaN.
for cases_pred, fatal_pred in zip(preds_c, preds_f):
    cases_pred[idx_nan, :] = np.roll(cases_pred[idx_nan, :], -1, axis=1)
    fatal_pred[idx_nan] = np.roll(fatal_pred[idx_nan], -1, axis=1)


In [None]:
# Tuned parameters of the cases models, one line per quantile model.
for fitted in zmodels_c:
    print(fitted.best_params)

In [None]:
# Tuned parameters of the fatalities models, one line per quantile model.
for fitted in zmodels_f:
    print(fitted.best_params)

In [None]:
import matplotlib.pyplot as plt

plt.style.use(['default'])
fig = plt.figure(figsize=(20, 8))

# Only rows whose last training value exceeds 100 are eligible for plotting.
eligible_rows = np.where(train_c[:, -1] > 100)[0]

# One randomly chosen row per colour: solid line = history, dashed = the three
# quantile forecasts.
for colour in ['red', 'grey', 'green', 'purple']:
    idx = np.random.choice(eligible_rows)
    plt.plot(train_c[idx], label=AREAS[idx], color=colour)
    for quantile_preds in preds_c[:3]:
        plt.plot(quantile_preds[idx], linestyle='--', color=colour)

plt.title("Cases")
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

plt.style.use(['default'])
fig = plt.figure(figsize = (20, 8))

# Row positions of areas whose key does not contain 'US'. The mask is built over the
# full AREAS array so that the positions index AREAS / train_f / preds_f directly;
# filtering AREAS first would yield positions in the filtered list instead.
non_us_rows = np.where(['US' not in area for area in AREAS])[0]

# One randomly chosen non-US row per colour: solid line = history, dashed = the
# three quantile forecasts.
for col in ['red', 'grey', 'green', 'purple']:
    idx = np.random.choice(non_us_rows)
    plt.plot(train_f[idx], label=AREAS[idx], color=col)
    for i in range(3):
        plt.plot(preds_f[i][idx], linestyle='--', color=col)

plt.title("Fatalities")
plt.legend()
plt.show()

In [None]:
submission = sub.copy()
# -1 marks rows that have not been filled yet; the count is printed at the end.
submission['TargetValue'] = -1

# (target name, prediction matrix, quantile) for every model output.
forecast_sets = (
    [('ConfirmedCases', preds_c[k], tau) for k, tau in enumerate(TAU_LIST)]
    + [('Fatalities', preds_f[k], tau) for k, tau in enumerate(TAU_LIST)]
)
join_keys = ['Area', 'days', 'Target']

for target_name, forecast, quantile in forecast_sets:
    # Wide matrix (rows follow AREAS, column label = day index) -> long table,
    # with negative forecasts floored at 0.
    wide = pd.DataFrame(np.clip(forecast, 0, None))
    wide['Area'] = AREAS
    long = wide.melt(id_vars='Area', var_name='days', value_name="value")
    long['Target'] = target_name

    # Attach the forecast to the test rows and build the submission key
    # "<ForecastId>_<quantile>".
    matched = test_orig.merge(long, how='left', on=join_keys)[['ForecastId', 'Target', 'value']]
    key_suffix = '_' + str(quantile)
    matched['ForecastId_Quantile'] = matched['ForecastId'].apply(lambda fid: str(fid) + key_suffix)
    matched = matched[matched['Target'] == target_name]

    submission = submission.merge(matched[['ForecastId_Quantile', 'value']], how='left', on='ForecastId_Quantile')

    # Copy the values that were found into TargetValue and drop the helper column.
    found = ~submission['value'].isnull()
    submission.loc[found, 'TargetValue'] = submission.loc[found, 'value']
    del submission['value']

print(submission.shape)
# Rows still holding the -1 marker.
print((submission['TargetValue'] < 0).sum())


In [None]:
# Write the filled submission table to submission.csv (relative path) without the index column.
submission.to_csv("submission.csv", index=False)