In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

%matplotlib inline

# preprocess

In [None]:
# train
#
# Load the training set, parse the date column, and derive `DatePassed`
# (whole days since the first training date), the single regression feature.

df_train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-1/train.csv')
# Vectorised parsing; the explicit format keeps the parse strict (malformed dates still raise).
df_train['Date'] = pd.to_datetime(df_train['Date'], format='%Y-%m-%d')
# NaN never compares equal, so a sentinel string is needed for the
# `== province` filters used further down to match rows without a province.
df_train['Province/State'] = df_train['Province/State'].fillna('no data')

min_date = df_train['Date'].min()
df_train['DatePassed'] = (df_train['Date'] - min_date).dt.days  # we use this for regression

df_train.head()

In [None]:
# test
#
# Load the test set with the same preprocessing steps as the training frame.

df_test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-1/test.csv')
# Vectorised parsing; the explicit format keeps the parse strict (malformed dates still raise).
df_test['Date'] = pd.to_datetime(df_test['Date'], format='%Y-%m-%d')
# Sentinel for missing provinces so equality filters on this column can match them.
df_test['Province/State'] = df_test['Province/State'].fillna('no data')

# Offsets are measured from the *training* set's first date (`min_date`),
# so train and test share one time axis.
df_test['DatePassed'] = (df_test['Date'] - min_date).dt.days

df_test.head()

In [None]:
# remove "leak"

# Keep only training rows dated strictly before the first test date, so the
# models below are never fitted on days they are later asked to predict.
min_test_date = df_test['DatePassed'].min()
is_before_test = df_train['DatePassed'].lt(min_test_date)
df_train_wo_leak = df_train.loc[is_before_test]

# linear regression for log-scale ConfirmedCases by Country/Region and Province/State

In [None]:
# Fit one log-linear growth model per (country, province) on the most recent
# days of the leak-free training data.
#
# Result layout:
#   dct_lm_log_7d[country][province] = {
#       'clf':  fitted LinearRegression (DatePassed -> log(ConfirmedCases + 1)),
#       'data': the rows the model was fitted on,
#   }
dct_lm_log_7d = {}
recent_days = 7  # length (in days) of the trailing window used for the fit

for _idx, row in df_train_wo_leak[['Country/Region', 'Province/State']].drop_duplicates().iterrows():
    print(row.values)
    # Label-based access: integer keys on a string-labelled Series rely on a
    # deprecated positional fallback.
    country = row['Country/Region']
    province = row['Province/State']

    # .copy() so the column added below lands on an independent frame instead of
    # a slice of df_train_wo_leak (avoids SettingWithCopyWarning).
    df_country = df_train_wo_leak[
        (df_train_wo_leak['Country/Region'] == country)
        & (df_train_wo_leak['Province/State'] == province)
    ].copy()

    # apply log; log1p(x) == log(x + 1), which keeps zero counts finite
    df_country['LogConfirmedCases'] = np.log1p(df_country['ConfirmedCases'])

    # last `recent_days` days
    max_date_passed = df_country['DatePassed'].max()
    df_country_recent = df_country[df_country['DatePassed'] > max_date_passed - recent_days]

    # fit
    clf = LinearRegression()
    clf.fit(df_country_recent[['DatePassed']], df_country_recent['LogConfirmedCases'])

    dct_lm_log_7d.setdefault(country, {})[province] = {
        'clf': clf,
        'data': df_country_recent,
    }

# linear regression for Fatalities

In [None]:
# Fit, per (country, province), a linear model of daily new fatalities against
# the number of cases confirmed inside a lagged window:
#
#   RecentConfirmedCases(t) = ConfirmedCases(t - (timelag - timelag_width))
#                           - ConfirmedCases(t - (timelag + timelag_width))
#
# shift() moves by rows, so this assumes one row per day in chronological
# order within each group -- TODO confirm against the raw data.
#
# Result layout:
#   dct_lm_fatality[country][province] = {
#       'clf':  fitted LinearRegression (RecentConfirmedCases -> NewFatalities),
#       'data': the training rows plus the in-sample 'PredictedFatality' column,
#   }
dct_lm_fatality = {}
timelag = 14
timelag_width = 7

for _idx, row in df_train_wo_leak[['Country/Region', 'Province/State']].drop_duplicates().iterrows():
    print(row.values)
    # Label-based access: integer keys on a string-labelled Series rely on a
    # deprecated positional fallback.
    country = row['Country/Region']
    province = row['Province/State']

    # .copy() so the derived columns are written to an independent frame rather
    # than a slice of df_train_wo_leak (avoids SettingWithCopyWarning).
    df_country = df_train_wo_leak[
        (df_train_wo_leak['Country/Region'] == country)
        & (df_train_wo_leak['Province/State'] == province)
    ].copy()
    df_country['RecentConfirmedCases'] = (
        df_country['ConfirmedCases'].shift(timelag - timelag_width)
        - df_country['ConfirmedCases'].shift(timelag + timelag_width)
    )
    df_country['NewFatalities'] = df_country['Fatalities'].diff()

    # Drop only the warm-up rows where the lagged window or the diff is
    # undefined; a NaN in an unrelated column must not discard usable rows.
    df_country = df_country.dropna(subset=['RecentConfirmedCases', 'NewFatalities']).copy()

    clf = LinearRegression()
    clf.fit(df_country[['RecentConfirmedCases']], df_country['NewFatalities'])

    # In-sample predictions, kept alongside the data for later inspection.
    df_country['PredictedFatality'] = clf.predict(df_country[['RecentConfirmedCases']])

    dct_lm_fatality.setdefault(country, {})[province] = {
        'clf': clf,
        'data': df_country,
    }

# predict

In [None]:
# Build the submission rows for every (country, province).
#   ConfirmedCases: the log-linear model evaluated on the test dates, mapped
#                   back from log scale.
#   Fatalities:     last cumulative count observed before the first test date,
#                   plus the running sum of predicted daily fatalities.
lst_submission_pre = []

for _idx, row in df_train_wo_leak[['Country/Region', 'Province/State']].drop_duplicates().iterrows():
    print(row.values)
    # Label-based access: integer keys on a string-labelled Series rely on a
    # deprecated positional fallback.
    country = row['Country/Region']
    province = row['Province/State']

    df_train_country = df_train[(df_train['Country/Region'] == country) & (df_train['Province/State'] == province)]
    df_test_country = df_test[(df_test['Country/Region'] == country) & (df_test['Province/State'] == province)]
    if df_test_country.empty:
        # Nothing to forecast for this group; predicting on zero rows would raise.
        continue
    min_test_date_passed = df_test_country['DatePassed'].min()

    # ConfirmedCases: expm1 inverts the log(x + 1) transform used at fit time
    clf_CC = dct_lm_log_7d[country][province]['clf']
    nda_predict_CC = np.expm1(clf_CC.predict(df_test_country[['DatePassed']]))

    # Fatality
    df_predicted_cases = df_test_country[['DatePassed']].copy()
    df_predicted_cases['PredictedConfirmedCases'] = nda_predict_CC
    # The outer merge extends the observed history with forecast-only days.
    # The explicit sort guarantees chronological row order, which the
    # row-based shift() below depends on.
    df_train_country = (
        df_train_country
        .merge(df_predicted_cases, how='outer', on='DatePassed')
        .sort_values('DatePassed')
        .reset_index(drop=True)
    )
    # Observed counts win wherever they exist; predictions fill the remaining
    # days. (A fillna chain, unlike a 0/1-weighted sum, cannot turn an
    # overflowed prediction into NaN via 0 * inf.)
    df_train_country['MergedConfirmedCases'] = (
        df_train_country['ConfirmedCases']
        .fillna(df_train_country['PredictedConfirmedCases'])
        .fillna(0)
    )
    df_train_country['RecentConfirmedCases'] = (
        df_train_country['MergedConfirmedCases'].shift(timelag - timelag_width)
        - df_train_country['MergedConfirmedCases'].shift(timelag + timelag_width)
    )

    clf_F = dct_lm_fatality[country][province]['clf']
    # Leading rows have no complete lag window (NaN); they are fed to the model as 0.
    df_train_country['PredictedDailyFatalities'] = clf_F.predict(df_train_country[['RecentConfirmedCases']].fillna(0))

    df_test_country = df_test_country.merge(
        df_train_country[['DatePassed', 'PredictedDailyFatalities']],
        on='DatePassed',
        how='inner',
    )
    # Cumulative fatalities on the last day before this group's first test date.
    last_known_fatalities = df_train_country.loc[
        df_train_country['DatePassed'] < min_test_date_passed, 'Fatalities'
    ].values[-1]
    df_test_country['Fatalities'] = df_test_country['PredictedDailyFatalities'].cumsum() + last_known_fatalities

    # df_test_country keeps the row order of the test set, so the positional
    # nda_predict_CC array lines up with its ForecastId / Fatalities columns.
    df_submission_pre = pd.DataFrame(
        {
            'ForecastId': df_test_country['ForecastId'],
            'ConfirmedCases': nda_predict_CC,
            'Fatalities': df_test_country['Fatalities'],
        }
    ).fillna(0)

    lst_submission_pre.append(df_submission_pre)

In [None]:
# Stack the per-region frames into one submission table ordered by ForecastId.
df_submission_pre = pd.concat(lst_submission_pre, axis=0)

df_submission = df_submission_pre[['ForecastId', 'ConfirmedCases', 'Fatalities']].sort_values(by='ForecastId')

# Counts must be non-negative integers: floor negatives at 0, round the rest up.
# Vectorised clip + ceil replaces the elementwise DataFrame.applymap, which is
# deprecated in recent pandas releases.
value_cols = ['ConfirmedCases', 'Fatalities']
df_submission[value_cols] = np.ceil(df_submission[value_cols].clip(lower=0)).astype(int)

df_submission.head(5)

In [None]:
# Persist the final predictions as CSV, leaving the row index out of the file.
submission_path = '/kaggle/working/submission.csv'
df_submission.to_csv(submission_path, index=False)

# visualize

## ConfirmedCases

In [None]:
# One figure per (country, province): observed log(ConfirmedCases + 1) over the
# whole training history against the fitted line evaluated on the test dates.
for _idx, row in df_train_wo_leak[['Country/Region', 'Province/State']].drop_duplicates().iterrows():
    print(row.values)
    # Label-based access: integer keys on a string-labelled Series rely on a
    # deprecated positional fallback.
    country = row['Country/Region']
    province = row['Province/State']

    df_train_here = df_train[(df_train['Country/Region'] == country) & (df_train['Province/State'] == province)]
    df_test_here = df_test[(df_test['Country/Region'] == country) & (df_test['Province/State'] == province)]

    clf = dct_lm_log_7d[country][province]['clf']

    fig, ax = plt.subplots()
    ax.plot(df_train_here['DatePassed'], np.log1p(df_train_here['ConfirmedCases']), label='observed')
    if not df_test_here.empty:
        # predict() raises on zero rows, so only draw the forecast when there are test dates.
        ax.plot(df_test_here['DatePassed'], clf.predict(df_test_here[['DatePassed']]), label='predicted')
    ax.set_title('{} / {}'.format(country, province))
    ax.set_xlabel('DatePassed')
    ax.set_ylabel('log(ConfirmedCases + 1)')
    ax.legend()
    plt.show()
    plt.close(fig)  # release the figure; this loop creates one per region

## Fatalities

In [None]:
# One figure per (country, province): observed cumulative Fatalities against the
# submitted predictions on the days where the training and test ranges overlap.
for _idx, row in df_train_wo_leak[['Country/Region', 'Province/State']].drop_duplicates().iterrows():
    print(row.values)
    # Label-based access: integer keys on a string-labelled Series rely on a
    # deprecated positional fallback.
    country = row['Country/Region']
    province = row['Province/State']

    df_train_here = df_train[(df_train['Country/Region'] == country) & (df_train['Province/State'] == province)]
    df_test_here = df_test[(df_test['Country/Region'] == country) & (df_test['Province/State'] == province)]

    # Look up the submitted fatalities for this region's test rows. Only the
    # needed columns are carried along, so the merges create no _x/_y duplicates.
    df_pred_here = (
        df_test_here[['ForecastId', 'DatePassed']]
        .merge(df_submission[['ForecastId', 'Fatalities']], on='ForecastId', how='left')
        .rename(columns={'Fatalities': 'PredictedFatalities'})
    )
    # Both frames are already restricted to this region, so the day offset alone
    # identifies a row. The left join keeps the training timeline.
    df_train_here = df_train_here.merge(
        df_pred_here[['DatePassed', 'PredictedFatalities']],
        on='DatePassed',
        how='left',
    )

    fig, ax = plt.subplots()
    ax.plot(df_train_here['DatePassed'], df_train_here['Fatalities'], label='observed')
    ax.plot(df_train_here['DatePassed'], df_train_here['PredictedFatalities'], label='predicted')
    ax.set_title('{} / {}'.format(country, province))
    ax.set_xlabel('DatePassed')
    ax.set_ylabel('Fatalities')
    ax.legend()
    plt.show()
    plt.close(fig)  # release the figure; this loop creates one per region