Another approach (gradient boosting vs. LightGBM):
- https://www.kaggle.com/binhlc/sar-cov-2-week-5-gradientboosting-vs-light-gbm

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datetime import datetime

from sklearn.preprocessing import LabelEncoder

# Raw competition files. df_test is kept untouched (string dates, string Target)
# because later cells look up ForecastId / Target labels in it.
df_test = pd.read_csv("../input/covid19-global-forecasting-week-5/test.csv")
df_train = pd.read_csv("../input/covid19-global-forecasting-week-5/train.csv")
df_sub = pd.read_csv("../input/covid19-global-forecasting-week-5/submission.csv")

# Keep only training rows dated strictly before the first test date, so the
# forecast period is never seen during training.
# NOTE(review): Date is still a string here, so this is a lexicographic
# comparison - correct only for ISO "YYYY-MM-DD" dates; confirm the file format.
valid_date = df_test.Date.min()
df_train = df_train[df_train.Date < valid_date]

# ignore_index=True gives the stacked frame a unique index. Without it the
# train and test labels overlap, and every later label-aligned assignment
# only works when pandas happens to skip the alignment step.
df = pd.concat([df_train, df_test], ignore_index=True)
df["Date"] = pd.to_datetime(df["Date"])

# One key per time series: "Country_State_County" where all three parts exist,
# "Country_State" when County is missing, and just "Country" when
# Province_State is missing.
country = df["Country_Region"]
state = df["Province_State"]
county = df["County"]
geography = country + "_" + state + "_" + county
geography = geography.where(county.notna(), country + "_" + state)
geography = geography.where(state.notna(), country)
df["geography"] = geography

# Integer-encode the categorical columns. Missing values become the string
# "nan" and therefore get their own code. The encoder is refit per column, so
# after the loop `le` only holds the classes of the last column ("Target").
le = LabelEncoder()
for column in ["Country_Region", "Province_State", "County", "Target"]:
    df[column] = le.fit_transform(df[column].astype(str))



In [None]:
# Lag offsets (in rows within one geography/target series) and their column names.
lags = list(range(1, 10))
lag_cols = ["lag_" + str(offset) for offset in lags]

# Rolling-mean windows: each window length in `wins` is combined with each
# starting lag in `lag_wins`, giving columns named rmean_<start lag>_<window>.
wins = [3, 7]
lag_wins = [1, 2, 3]
win_cols = [
    f"rmean_{start_lag}_{window}"
    for window in wins
    for start_lag in lag_wins
]

def createfeature(df):
    """Add lag and rolling-mean columns of TargetValue to ``df`` and return it.

    The frame is sorted in place by geography, Date and Target, then for each
    (geography, Target) series:

    * ``lag_<k>`` is TargetValue shifted by ``k`` rows, for every ``k`` in the
      module-level ``lags`` (column names come from ``lag_cols``);
    * ``rmean_<s>_<w>`` is the row-wise mean of ``lag_<s>`` .. ``lag_<s+w-1>``
      for every window ``w`` in ``wins`` and start lag ``s`` in ``lag_wins``.

    The input frame is modified (sorted, columns added) and also returned.
    """
    df.sort_values(["geography", "Date", "Target"], inplace=True)

    # One grouped view of the target serves every lag column.
    target_by_series = df.groupby(["geography", "Target"])["TargetValue"]
    for shift_by, column_name in zip(lags, lag_cols):
        df[column_name] = target_by_series.shift(shift_by)

    for window in wins:
        for start_lag in lag_wins:
            source_cols = [f"lag_{k}" for k in range(start_lag, start_lag + window)]
            # DataFrame.mean skips NaN by default, so partially available
            # windows still produce a value.
            df[f"rmean_{start_lag}_{window}"] = df[source_cols].mean(axis=1)

    return df

In [None]:
# Build the lag / rolling-mean features on the combined frame.
df = createfeature(df)

# Model inputs: encoded identifiers, population, then all engineered columns.
categorical_features = ["Country_Region", "Province_State", "County", "Target"]
features = ["Country_Region", "Province_State", "County", "Population", "Target"] + lag_cols + win_cols

# Usable training rows have a known target and a full 9-row lag history.
has_target_and_history = df.TargetValue.notna() & df.lag_9.notna()
df_train = df[has_target_and_history]

# Time-based hold-out: everything from 2020-04-20 onwards is used for validation.
split_date = datetime(2020, 4, 20)
fit_rows = df_train[df_train.Date < split_date]
holdout_rows = df_train[df_train.Date >= split_date]

X_train = fit_rows[features]
y_train = fit_rows.TargetValue.values
X_test = holdout_rows[features]
y_test = holdout_rows.TargetValue.values

print(f"Train shape: {(X_train.shape, y_train.shape)}")
print(f"Test shape: {(X_test.shape, y_test.shape)}")

In [None]:
QUANTILE = [0.05, 0.5, 0.95]

def Weighted_Pinball_Loss(q, X_test, y_test, y_pred):
    """Return the weighted pinball (quantile) loss for quantile ``q``.

    Parameters
    ----------
    q : float
        Quantile level being scored (e.g. 0.05, 0.5, 0.95).
    X_test : pandas.DataFrame
        Must contain ``Population`` and the encoded ``Target`` column.
        Target == 1 is treated as the fatalities series.
        NOTE(review): this relies on the label encoding applied earlier in the
        notebook mapping "Fatalities" to 1 - confirm if the encoding changes.
    y_test, y_pred : array-like
        True values and predicted ``q``-quantiles, row-aligned with ``X_test``.

    Returns
    -------
    float
        Weight-normalised average of the per-row pinball losses.

    Notes
    -----
    Per-row weight is ``1 / log(Population + 1)``, multiplied by 10 for
    fatalities rows. The factor is applied to the weight, not to the
    population: scaling the population inside the logarithm would make
    fatalities rows count *less* than confirmed-case rows.
    A population of 0 yields an infinite weight; no guard is applied here.
    """
    population = X_test['Population'].to_numpy(dtype=float)
    is_fatalities = X_test['Target'].to_numpy() == 1

    weights = 1.0 / np.log(population + 1)
    weights = np.where(is_fatalities, 10 * weights, weights)

    residual = np.asarray(y_test) - np.asarray(y_pred)
    # Pinball loss: q * e for under-prediction, (q - 1) * e for over-prediction.
    loss = np.maximum(q * residual, (q - 1) * residual)
    return np.average(loss, weights=weights)

In [None]:
%%time

from sklearn.ensemble import RandomForestRegressor

LEARNING_RATE = 0.01
N_ESTIMATORS = 100

model_rf_save = {}
score_rf_save = {}

model = RandomForestRegressor(n_estimators=N_ESTIMATORS,n_jobs=-1)
model = model.fit(X_train,y_train)

rf_preds = []
for estimator in model.estimators_:
    rf_preds.append(estimator.predict(X_test))
rf_preds = np.array(rf_preds).transpose()

for alpha in QUANTILE:
    y_pred = np.percentile(rf_preds, alpha * 100, axis=1)
    score = Weighted_Pinball_Loss(alpha,X_test,y_test,y_pred)
    score_rf_save.update({alpha: score})
    print(f'{alpha}: Weighted Pinball Loss {score}')
print(f'Average Pinball Loss: {np.mean(list(score_rf_save.values()))}')    

model = RandomForestRegressor(n_estimators=N_ESTIMATORS,n_jobs=-1)
model = model.fit(df_train[features], df_train.TargetValue.values)
model_rf_save = model

In [None]:
model_save = model_rf_save

# Predict by day. Each day's point forecast is written back into df so that
# the lag features of the following day can be rebuilt from it.
daily_frames = []
for pred_date in df_test.Date.unique():
    print(f'Predict day {pred_date}')
    day_mask = df.Date == pd.to_datetime(pred_date)
    X_pred = df.loc[day_mask, features]

    # Take the ForecastIds from the same rows of df that produce the
    # predictions. df is sorted by geography, which need not match the row
    # order of df_test, so pairing predictions with df_test rows by position
    # could attach forecasts to the wrong ids.
    # ForecastId is float in df (NaN on training rows), hence the cast.
    forecast_ids = df.loc[day_mask, 'ForecastId'].astype(int).to_numpy()

    # One column per tree: shape (n_rows, n_trees).
    X_pred_values = X_pred.to_numpy()
    rf_preds = np.column_stack(
        [estimator.predict(X_pred_values) for estimator in model_save.estimators_]
    )

    for alpha in QUANTILE:
        y_pred = np.percentile(rf_preds, alpha * 100, axis=1)
        daily_frames.append(pd.DataFrame({
            'ForecastId': forecast_ids,
            'Quantile': alpha,
            'ForecastId_Quantile': [f'{forecast_id}_{alpha}' for forecast_id in forecast_ids],
            'TargetValue': y_pred}))

    # Feed the forest's point forecast back in and refresh the lag features.
    y_pred = model_save.predict(X_pred)
    df.loc[day_mask, 'TargetValue'] = y_pred
    df = createfeature(df)

# Create submission: concatenate once instead of growing a frame per step
# (DataFrame.append is quadratic and no longer exists in pandas 2.x).
if daily_frames:
    df_sub = pd.concat(daily_frames, ignore_index=True)
else:
    df_sub = pd.DataFrame(columns=['ForecastId', 'Quantile', 'ForecastId_Quantile', 'TargetValue'])
df_sub = df_sub.sort_values(['ForecastId', 'Quantile']).reset_index(drop=True)

In [None]:
# Write only the two columns kept for the submission file, without the index.
submission_columns = ['ForecastId_Quantile', 'TargetValue']
df_sub[submission_columns].to_csv("submission.csv", index=False)

In [None]:
# Bokeh pieces used by plotCountry: figure/show for the charts, a numeral
# formatter for the y-axis ticks and a palette for the quantile lines.
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import NumeralTickFormatter
from bokeh.palettes import Spectral11
# Display Bokeh charts inline in the notebook.
output_notebook()

In [None]:
def plotCountry(country):
    """Show forecast charts (one line per quantile) for a single country.

    Joins the test rows of ``country`` with the submission frame ``df_sub`` on
    ForecastId, sums TargetValue over all rows sharing a Date / Target /
    Quantile, and displays two Bokeh line charts: one for confirmed cases and
    one for fatalities.

    Parameters
    ----------
    country : str
        Value of ``Country_Region`` as it appears in ``df_test``.
    """
    country_rows = df_test[df_test['Country_Region'] == country]
    df_country = pd.merge(left=country_rows, right=df_sub, on='ForecastId')
    # Aggregate only the forecast value; summing every column would also
    # "sum" the string identifier columns.
    df_country = (
        df_country
        .groupby(['Date', 'Target', 'Quantile'])['TargetValue']
        .sum()
        .reset_index()
    )
    df_country.Date = pd.to_datetime(df_country.Date)

    # One colour per quantile line.
    mypalette = Spectral11[:len(QUANTILE)]

    # (Target value in df_test, human-readable label used in titles and legends)
    charts = (('ConfirmedCases', 'Confirmed Cases'), ('Fatalities', 'Fatalities'))
    for target, label in charts:
        p = figure(title=country + " " + label + " Forecast", x_axis_label='Date',
                   x_axis_type='datetime', y_axis_label=label)
        for alpha, color in zip(QUANTILE, mypalette):
            df_quantile = df_country[(df_country['Target'] == target) & (df_country['Quantile'] == alpha)]
            p.line(df_quantile['Date'], df_quantile['TargetValue'],
                   legend_label=f"{label} - Quantile {alpha}", line_width=2, line_color=color)
        p.legend.location = "top_left"
        # Abbreviated tick labels; the previous format string began with a
        # stray typographic quote character.
        p.yaxis.formatter = NumeralTickFormatter(format="0.0a")
        show(p)

In [None]:
# Show the confirmed-cases and fatalities forecast charts for the US.
plotCountry('US')

In [None]:
# Show the confirmed-cases and fatalities forecast charts for Vietnam.
plotCountry('Vietnam')