In [None]:
from datetime import  datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Load the week-3 training file; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/covid19-global-forecasting-week-3/train.csv")
# Show (rows, columns) as a quick check that the load worked.
df.shape

In [None]:
# Columns that together identify one location.
loc_group = ["Province_State", "Country_Region"]


def preprocess(df):
    """Normalise a raw competition frame in place and return it.

    Steps:
      * cast ``Date`` to a datetime dtype,
      * replace missing values in the ``loc_group`` columns with the string ``"none"``,
      * add a ``loc`` column of the form ``"<Province_State>_<Country_Region>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date``, ``Province_State`` and ``Country_Region``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is mutated, not copied).
    """
    df["Date"] = df["Date"].astype("datetime64[ms]")
    for col in loc_group:
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the in-place call acts on a temporary column object, which pandas has
        # deprecated and which leaves `df` untouched under Copy-on-Write.
        df[col] = df[col].fillna("none")
    df["loc"] = df["Province_State"].str.cat(df["Country_Region"], sep="_")
    return df

# preprocess mutates the frame it is given and returns that same frame.
df = preprocess(df)
# Preview the first rows, including the new "loc" column.
df.head()

In [None]:
class ModifiedLabelEncoder(LabelEncoder):
    """LabelEncoder variant that returns its codes as an (n, 1) column.

    The parent class returns a flat array of integer codes; both transform
    methods here reshape that result to a single column so it can be passed
    on to a one-hot encoder inside a Pipeline. Extra positional and keyword
    arguments are accepted and ignored.

    Reference:
    https://stackoverflow.com/questions/48929124/scikit-learn-how-to-compose-labelencoder-and-onehotencoder-with-a-pipeline
    """

    def fit_transform(self, y, *args, **kwargs):
        """Fit on ``y`` and return its integer codes as a single column."""
        codes = super().fit_transform(y)
        return codes.reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        """Return the integer codes of ``y`` as a single column."""
        codes = super().transform(y)
        return codes.reshape(-1, 1)

In [None]:
# Label encoder: location string -> integer code, fitted on every "loc" value in df.
lb_encoder = ModifiedLabelEncoder()
lb_encoder.fit(df["loc"].values)

# One-hot encoder fitted on the (n, 1) integer codes produced above.
oh_encoder = OneHotEncoder()
oh_encoder.fit(lb_encoder.transform(df["loc"].values))

In [None]:
# Earliest and latest date present in the training frame.
df["Date"].min(), df["Date"].max()

In [None]:
TARGETS = ["ConfirmedCases", "Fatalities"]

# Work on the log1p scale from here on; predictions are mapped back with
# np.expm1 when the submission frames are built.
df[TARGETS] = np.log1p(df[TARGETS])

In [None]:
# Lag-1 features: for each location, the value of each target on the previous row.
# shift(1) follows row order inside each "loc" group, so this presumably relies on
# the frame being ordered by date within a location — TODO confirm.
for col in TARGETS:
    df["prev_{}".format(col)] = df.groupby(["loc"])[col].shift(1)

In [None]:
# Drop the earliest date. Presumably every location starts on that date, so its
# prev_* features have no earlier row to shift from and are NaN — TODO confirm.
df = df[df["Date"] > df["Date"].min()].copy()
# Peek at ten rows to check the lag columns.
df.iloc[50:60]

In [None]:
from datetime import timedelta

# Leaky split: TRAIN_LAST is the newest date in df, so dev_df receives every
# row and test_df comes out empty — nothing is held out for validation.

TRAIN_LAST = df["Date"].max()

dev_df = df.loc[df["Date"] <= TRAIN_LAST].copy()
test_df = df.loc[df["Date"] > TRAIN_LAST].copy()
(dev_df.shape, test_df.shape)

In [None]:
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# One lag feature per target, in the same order as TARGETS.
features = ["prev_{}".format(col) for col in TARGETS]

# Feature engineering + regressor:
#   * 'prev': degree-1, no-bias polynomial expansion of the lag features
#     (i.e. they pass through unchanged),
#   * 'loc' : location string -> integer code -> one-hot indicator,
# followed by a single Ridge regression fitted on both targets at once.
model = Pipeline([('fe', ColumnTransformer(
    [
        ('prev', PolynomialFeatures(degree=1, include_bias=False), features),
        ('loc', Pipeline([('label', lb_encoder), ('onehot', oh_encoder)]), 'loc')
    ])),
    ('linear', Ridge())])

# Fit only on the most recent days of the development data.
TRAIN_WINDOW_DAYS = 4
train_df = dev_df[dev_df.Date > dev_df.Date.max() - timedelta(days=TRAIN_WINDOW_DAYS)]
model.fit(train_df[features + ['loc']], train_df[TARGETS])

# In-sample MSE per target (log1p scale). Predict once and slice per target
# instead of re-running the whole pipeline for every target.
train_pred = model.predict(train_df[features + ['loc']])
[mean_squared_error(train_df[col], train_pred[:, i]) for i, col in enumerate(TARGETS)]

In [None]:
# Inspect the first rows of the window the model was fitted on.
train_df.head()

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def evaluate(df):
    """Average the per-target RMSE over TARGETS, rounded to 5 decimals.

    For every target ``t`` the frame must hold the actual value in column ``t``
    and the prediction in column ``pred_<t>``.
    """
    per_target = [
        rmse(df[col].values, df["pred_{}".format(col)].values)
        for col in TARGETS
    ]
    return np.round(sum(per_target) / len(TARGETS), 5)


def predict(test_df, first_day, num_days, val=False):
    """Roll the fitted global ``model`` forward one day at a time over ``test_df``.

    For each day the model predicts every target from that day's lag features;
    the predictions are stored in ``pred_<target>`` and, except on the last
    forecast day, copied into the next day's lag features so the following
    step can use them.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Needs ``Date``, ``loc`` and the columns listed in ``features``. The lag
        features must already be filled for ``first_day``. The frame is
        modified in place.
    first_day : datetime
        First date to forecast.
    num_days : int
        Number of consecutive days to forecast. ``first_day`` is always
        forecast, even when this is below 1.
    val : bool, default False
        If true, print ``evaluate`` for each forecast day (this requires the
        actual target columns to be present in ``test_df``).

    Returns
    -------
    pandas.DataFrame
        ``test_df`` itself, with the ``pred_<target>`` columns added.

    Notes
    -----
    Predictions are handed to the next day by position, so each day must list
    the same locations in the same order.
    """
    pred_cols = ["pred_{}".format(col) for col in TARGETS]

    # Start from float zeros rather than int 0: the columns are filled with
    # float predictions below, and writing floats into an integer column is an
    # upcast that pandas has deprecated.
    for pred_col in pred_cols:
        test_df[pred_col] = 0.0

    steps = max(num_days, 1)
    for d in range(steps):
        date = first_day + timedelta(days=d)
        today = test_df["Date"] == date
        y_pred = model.predict(test_df.loc[today, features + ['loc']])

        is_last_step = d == steps - 1
        tomorrow = test_df["Date"] == date + timedelta(days=1)
        for i, pred_col in enumerate(pred_cols):
            test_df.loc[today, pred_col] = y_pred[:, i]
            if not is_last_step:
                # Today's prediction becomes tomorrow's lag feature. Nothing is
                # written past the forecast horizon.
                test_df.loc[tomorrow, features[i]] = y_pred[:, i]

        if val:
            print(date, evaluate(test_df[today]))

    return test_df

In [None]:
sub_df = preprocess(pd.read_csv("../input/covid19-global-forecasting-week-3/test.csv"))

# First forecast window: SUB_FIRST..SUB_LAST inclusive.
SUB_FIRST = datetime(2020, 3, 26)
SUB_LAST = datetime(2020, 4, 8)
SUB_DAYS = sub_df.loc[sub_df["Date"] <= SUB_LAST, "Date"].nunique()

# History strictly before the window, followed by the test rows inside it.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
sub_df = pd.concat(
    [dev_df[dev_df["Date"] < SUB_FIRST], sub_df[sub_df["Date"] <= SUB_LAST]],
    sort=False,
)

# Seed the lag features: the first test day of each location picks up the last
# known (log1p-scale) history value; later days are filled in by predict().
for col in TARGETS:
    sub_df["prev_{}".format(col)] = sub_df.groupby(loc_group)[col].shift()

sub_df = sub_df[(sub_df["Date"] >= SUB_FIRST) & (sub_df["Date"] <= SUB_LAST)].copy()
# Cast ForecastId back to an integer dtype; int32 rather than int16 so ids
# above 32767 cannot silently wrap.
sub_df["ForecastId"] = sub_df["ForecastId"].astype(np.int32)
sub_df = predict(sub_df, SUB_FIRST, SUB_DAYS)

# Predictions are on the log1p scale; map them back to counts.
for col in TARGETS:
    sub_df[col] = np.expm1(sub_df["pred_{}".format(col)])

sub_public = sub_df.copy()
sub_public.head()

In [None]:
# Inspect the US rows: back-transformed targets next to the log1p-scale
# predictions and the lag features that produced them.
sub_public[sub_public["Country_Region"] == "US"][
    ["Date"] + TARGETS + ["pred_ConfirmedCases", "pred_Fatalities", "prev_ConfirmedCases", "prev_Fatalities"]
]

In [None]:
# Dates covered by the first forecast window.
sub_public.Date.unique()

In [None]:
sub_df = preprocess(pd.read_csv("../input/covid19-global-forecasting-week-3/test.csv"))

# SUB_FIRST is not used by the forecast below (which starts the day after
# TRAIN_LAST); it is the cut-off applied when the two forecasts are combined.
SUB_FIRST = datetime(2020, 4, 9)
SUB_DAYS = sub_df.loc[sub_df["Date"] > TRAIN_LAST, "Date"].nunique()

# All known history, followed by the test rows after the last training date.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
sub_df = pd.concat(
    [dev_df[dev_df.Date <= TRAIN_LAST], sub_df[sub_df.Date > TRAIN_LAST]],
    sort=False,
)

# Seed the lag features: the first test day of each location picks up the last
# known (log1p-scale) history value; later days are filled in by predict().
for col in TARGETS:
    sub_df["prev_{}".format(col)] = sub_df.groupby(loc_group)[col].shift()

sub_df = sub_df[sub_df["Date"] > TRAIN_LAST].copy()
# Cast ForecastId back to an integer dtype; int32 rather than int16 so ids
# above 32767 cannot silently wrap.
sub_df["ForecastId"] = sub_df["ForecastId"].astype(np.int32)
sub_df = predict(sub_df, TRAIN_LAST + timedelta(days=1), SUB_DAYS)

# Predictions are on the log1p scale; map them back to counts.
for col in TARGETS:
    sub_df[col] = np.expm1(sub_df["pred_{}".format(col)])

sub_private = sub_df.copy()
sub_private.head()

In [None]:
# Dates covered by the second forecast run.
sub_private.Date.unique()

In [None]:
# Combine the two forecasts: every row of the first window, then the second
# run's rows from SUB_FIRST onward. SUB_FIRST here is the value assigned in the
# second forecast cell (2020-04-09), so that cell must have run before this one.
df_sub = pd.concat([sub_public, sub_private[sub_private.Date >= SUB_FIRST]], axis=0)
# List the dates present in the combined frame.
df_sub.Date.unique()

In [None]:
# Write only ForecastId and the target columns; the index is left out.
df_sub.to_csv("submission.csv", index=False, columns=["ForecastId"] + TARGETS)

In [None]:
# Spot-check the combined forecast for one province.
df_sub[df_sub["Province_State"] == "California"][["Date"] + TARGETS]