https://www.kaggle.com/c/covid19-global-forecasting-week-5

Due May 11

# Plan:
1. Look at each county, province, and country:

        > min(test.Date)
        '2020-04-27'
        > max(test.Date)
        '2020-06-10'
        > max(train.Date)
        '2020-05-08'


2. Add an intercept (Weight constant based on Population) and an x-squared term, then fit a ridge regression (planned alpha = 0.1; the sketch below uses alpha = 1.0):
        # add some noise
        tmp.y1 = tmp.y1 + np.random.normal(size = tmp.shape[0], scale=.05)
        tmp.y1[200:220] = tmp.y1[200:220] + 1
        tmp['intercept'] = 1
        tmp['x2'] = tmp['x']**2
        tmp['nutch'] = 0
        tmp['nutch'][330:] = -1

        train = tmp[:start_test]
        test = tmp[start_test:]
        # train = train.drop(['y2','y_sum'])
        # test = test.drop(['y2','y_sum'])

        feat = ['intercept','x', 'x2']
        y = train.y1
        x = train[feat]
        
        f = Ridge(alpha=1.0).fit(x,y)  # higher alpha, the higher the penalization
        fitted = f.predict(train[feat])
        y_pred = f.predict(test[feat])
3. Add predictions back to train and compute residual quantiles:
        resid = fitted - y
        fig, ax = plt.subplots()
        ax.plot(resid)
        for q in [0.05,0.5,0.95]:
            print(f'resid q={q}', np.quantile(resid, q=q))
4. Output (predict daily quantiles):
        ForecastId	County	Province_State	Country_Region	Population	Weight	Date	Target	q0.05	q0.5	q0.95
        0	1			Afghanistan	27657145	0.058359	2020-04-27	ConfirmedCases	0.0	1.0	171.7
        1	2			Afghanistan	27657145	0.583587	2020-04-27	Fatalities	0.0	0.0	4.7
        2	3			Afghanistan	27657145	0.058359	2020-04-28	ConfirmedCases	0.0	1.0	171.7
        
        
                    ForecastId_Quantile	TargetValue
        0	1_0.05	0.0
        1	2_0.05	0.0
        2	3_0.05	0.0
        
        
# Ref
* https://coronavirus.jhu.edu/map.html
* https://www.kaggle.com/c/covid19-global-forecasting-week-5/discussion


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.linear_model import SGDRegressor, LinearRegression, Lasso, Ridge, LogisticRegression

from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from datetime import timedelta

import pickle
import statsmodels.api as sm
lowess = sm.nonparametric.lowess

import warnings
# Show each distinct warning only the first time it is raised.
warnings.filterwarnings(action='once')

# Never truncate cell contents when displaying frames. `None` is the supported
# spelling; the old `-1` sentinel is deprecated since pandas 1.0 and rejected
# by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 1000)
plt.rcParams['figure.figsize'] = [8, 4]  # (width, height) in inches; 12, 8 for larger plots


In [None]:
# Load the training set, parse the dates and replace missing County /
# Province_State labels with empty strings (they are used as grouping keys).
train = (
    pd.read_csv('../input/covid19-global-forecasting-week-5/train.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .fillna({'County': "", 'Province_State': ""})
)
print(min(train.Date), max(train.Date))
train.head()

In [None]:
# Sanity check: list the 20 rows with the smallest TargetValue to see whether
# the data contains negative daily counts.
train.sort_values('TargetValue').iloc[:20]

In [None]:
# Baseline: empirical 5% / 50% / 95% quantiles of the historical TargetValue
# for every (County, Province_State, Country_Region, Target) series, clipped
# to the range [0, 10000].
# NOTE: `a` is reassigned by the forecasting cell further down.
group_cols = ['County', 'Province_State', 'Country_Region', 'Target']
target_by_series = train.groupby(group_cols)['TargetValue']

# One groupby, one quantile call per level. `axis` must be passed by keyword:
# positional `axis` in pd.concat is deprecated (pandas 1.3) and an error in 2.0.
a = (
    pd.concat({f'q{q}': target_by_series.quantile(q=q) for q in (0.05, 0.5, 0.95)}, axis=1)
    .clip(0, 10000)
    .reset_index()
)
a.head()

In [None]:
# Load the test set with the same cleaning as the training set: parsed dates
# and empty strings in place of missing County / Province_State labels.
test = (
    pd.read_csv('../input/covid19-global-forecasting-week-5/test.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .fillna({'County': "", 'Province_State': ""})
)
test.head()

In [None]:
# Print the first and last date covered by each data set.
for label, frame in (('train', train), ('test', test)):
    print(label, min(frame.Date), max(frame.Date))

In [None]:
%%time

dt_pred = pd.date_range(pd.to_datetime(max(train.Date)) + timedelta(days=1), max(test.Date))
x_cols = ['Weight','x', 'x2']
y_col = ['TargetValue']
i = 0
a = pd.DataFrame()
models = dict()

for key,grp in tqdm(train.groupby(['County','Province_State','Country_Region','Target'])):
    print(f'key={key}')
    
    grp = grp.sort_values(by=['Date'])

    n_train = grp.shape[0]
    n_test  = dt_pred.shape[0]
    n_all   = n_train + n_test

    df_test = grp.head(len(dt_pred)).copy()
    df_test['Id'] = -1
    df_test['Date'] = dt_pred
    df_test['TargetValue'] = 0.
    df = grp.append(df_test).copy().reset_index(drop=True)
    test_filter = df.Date >= min(dt_pred)
    df_test = df[test_filter]

    # features
    df['q0.05'] = 0.
    df['q0.5'] = 0.
    df['q0.95'] = 0.
    df['x'] = list(range(df.shape[0]))
    df['x2'] = df.x**2


    # train / fit
    df_train, df_test = df[:n_train], df[n_train:]

    # start with first non-zero
    try:
        start_x = min(df_train.query('TargetValue > 0')['x'])
    except:
        start_x = min(df_train.x)
    df_train = df_train.query(f'x >= {start_x}')

    X_train, y_train = df_train[x_cols], df_train[y_col]
    X_test, y_test = df_test[x_cols], df_train[y_col]


    # median / mean
    f = Ridge(alpha=10.0).fit(X_train,y_train)
    models[key] = f
    fitted = f.predict(X_train)
    y_test = f.predict(X_test)   
    fitted_resid = fitted - y_train

    quant = dict()
    for q in [0.05, 0.5, 0.95]:
        quant[q] = np.quantile(fitted_resid, q=q)


    # 0.05
    fitted_05 = fitted + (quant[0.05])
    y_test_05 = y_test + (quant[0.05])

    # 0.95  
    fitted_95 = fitted + (quant[0.95])
    y_test_95 = y_test + (quant[0.95])


#         df_train['q0.05'], df_train['q0.5'], df_train['q0.95'] = fitted_05.clip(0,10000), fitted.clip(0,10000), fitted_95.clip(0,10000)
#         df_test.loc[:,'q0.05'], df_test.loc[:,'q0.5'], df_test.loc[:,'q0.95'] = y_test_05.clip(0,10000), y_test.clip(0,10000), y_test_95.clip(0,10000)
#         df = df_train.append(df_test)
    try:
        start_x_dt = min(df.query('TargetValue > 0')['Date'])
    except:
        start_x_dt = min(df.Date)
    df.loc[df.Date >= start_x_dt, 'q0.05'] = np.concatenate([fitted_05,y_test_05]).clip(0, 10000)
    df.loc[df.Date >= start_x_dt, 'q0.5'] = np.concatenate([fitted,y_test]).clip(0, 10000)
    df.loc[df.Date >= start_x_dt, 'q0.95'] = np.concatenate([fitted_95,y_test_95]).clip(0, 10000)

    a_cols = ['County','Province_State','Country_Region','Target','Date','q0.05','q0.5','q0.95']
    a = a.append(df.query(f'Date>="{min(test.Date)}"')[a_cols])

    country = key[2]
    prov    = key[1]
    # Note: US has a LOT of counties
    if i <= 4 or country in ['Spain','Italy','Monaco','China','UK','Canada','Mexico','Brazil','France', 'Japan', 'Taiwan*'] \
        or prov in ['British Columbia','Hong Kong','New York']:
        fig, ax = plt.subplots()
        ax.plot(X_train.x, y_train, label='data')
        ax.plot(X_train.x, fitted, label='fitted')
        ax.plot(X_test.x, y_test, label='pred')
        ax.plot(X_train.x, fitted_95, label='fitted_95')
        ax.plot(X_test.x, y_test_95, label='pred_95')
        ax.plot(X_train.x, fitted_05, label='fitted_05')
        ax.plot(X_test.x, y_test_05, label='pred_05')
        ax.legend()
        title = f"{key}"
        ax.set_title(title)

#         if i >= 4:
#             break

    i += 1
        
m_fn = 'models.pickle'
with open(m_fn, 'wb') as f:
    pickle.dump(models, f)
print(f'Saved {len(models)} to {m_fn}')

In [None]:
# Display the stacked per-series quantile forecasts collected in `a`.
a

In [None]:
# Attach the forecast quantiles to each test row; the left join keeps every
# test row even when no matching forecast exists.
merge_keys = ['Country_Region', 'County', 'Province_State', 'Target', 'Date']
test2 = pd.merge(test, a, how='left', on=merge_keys)
test2.head()

In [None]:
# test=test.merge(a,on=['Country_Region','County','Province_State','Target'],how='left')
# test.head()

In [None]:
# Reshape to long form: one row per (ForecastId, quantile column).
quantile_cols = ['q0.05', 'q0.5', 'q0.95']
sub = test2[['ForecastId'] + quantile_cols].melt(id_vars=['ForecastId'], value_vars=quantile_cols)
sub

In [None]:
# Build the submission file: one row per "<ForecastId>_<quantile>" with its
# predicted value.
# The long frame is rebuilt from `test2` rather than consuming and overwriting
# `sub`, so this cell can be re-run without a KeyError on 'variable'.
quantile_long = test2.melt(id_vars=['ForecastId'], value_vars=['q0.05', 'q0.5', 'q0.95'])
quantile_label = quantile_long['variable'].str.replace("q", "", regex=False)  # 'q0.05' -> '0.05'

sub = pd.DataFrame({
    'ForecastId_Quantile': quantile_long['ForecastId'].astype(str) + '_' + quantile_label,
    'TargetValue': quantile_long['value'],
})
sub.to_csv("submission.csv", index=False)
sub.head()