In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from statsmodels.tsa.statespace.exponential_smoothing import ExponentialSmoothing
from sklearn.decomposition import TruncatedSVD

from warnings import catch_warnings
from warnings import filterwarnings
filterwarnings('ignore')

import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = (20, 6)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
def auto_ets(df, trend=True, damped_trend=True, seasonal=None):
    """Fit an exponential smoothing model to a daily time series.

    Parameters
    ----------
    df : pandas object with a datetime-like index
        Observations to model. It is first conformed to calendar-day
        frequency with ``asfreq('D')``.
    trend, damped_trend, seasonal
        Passed through unchanged to ``ExponentialSmoothing``.

    Returns
    -------
    The object returned by ``ExponentialSmoothing(...).fit(maxiter=200)``.
    """
    daily = df.asfreq('D')
    model = ExponentialSmoothing(
        daily,
        trend=trend,
        damped_trend=damped_trend,
        seasonal=seasonal,
    )
    return model.fit(maxiter=200)

def fit_simulate(data, forecast_period=45, repetitions=1000):
    """Fit the ETS model to ``data`` and draw simulated future paths.

    Parameters
    ----------
    data
        Series handed to ``auto_ets``.
    forecast_period : int
        Number of steps to simulate.
    repetitions : int
        Number of simulated paths to draw.

    Returns
    -------
    tuple
        ``(fitted_model, simulations)``, where ``simulations`` is whatever
        ``fitted_model.simulate`` returns when anchored at the end of the sample.
    """
    fitted = auto_ets(data)
    draws = fitted.simulate(forecast_period, repetitions=repetitions, anchor='end')
    return fitted, draws

def simulate_all(dat):
    """Simulate every row of ``dat`` and regroup the draws by simulation.

    Each row of ``dat`` (one factor) is fitted and simulated independently with
    ``fit_simulate``. The per-factor results (simulations as columns) are then
    turned inside out into one frame per simulation (factors as columns).

    Parameters
    ----------
    dat : pandas.DataFrame
        One row per factor; each row is passed to ``fit_simulate`` as a series.

    Returns
    -------
    list of pandas.DataFrame
        One frame per simulated path, indexed like the first factor's
        simulation output and with ``dat.index`` as columns. Returns an empty
        list when ``dat`` has no rows.
    """
    factor_sims = [fit_simulate(dat.loc[factor, :])[1] for factor in dat.index]
    if not factor_sims:
        # Nothing to simulate; avoid indexing into an empty list below.
        return []

    horizon_index = factor_sims[0].index
    n_sims = len(factor_sims[0].columns)

    sim_list = []
    for sim in range(n_sims):
        # Gather simulation `sim` from every factor in one concat. This keeps the
        # numeric dtype, unlike filling an empty (object-dtype) frame column by column.
        frame = pd.concat(
            [draws.iloc[:, sim] for draws in factor_sims],
            axis=1,
            ignore_index=True,
        )
        frame = frame.reindex(horizon_index)
        frame.columns = dat.index
        sim_list.append(frame)
    return sim_list

In [None]:
# Load the competition files. `Date` is parsed to datetime in train and test;
# the submission template is read as-is.
train = pd.read_csv('../input/covid19-global-forecasting-week-5/train.csv', parse_dates=['Date'])
test = pd.read_csv('../input/covid19-global-forecasting-week-5/test.csv', parse_dates=['Date'])
submission = pd.read_csv('../input/covid19-global-forecasting-week-5/submission.csv')

In [None]:
# Inspect the training data as loaded.
train

In [None]:
# Give every location a single string identifier made of country, province and
# county, separated by spaces. Missing parts are stringified by astype('str')
# rather than dropped.
for frame in (train, test):
    frame['key'] = (
        frame['Country_Region'].astype('str')
        + " " + frame['Province_State'].astype('str')
        + " " + frame['County'].astype('str')
    )

In [None]:
# Inspect the training data again, now with the `key` column.
train

In [None]:
# Inspect the test data (including the `key` column).
test

In [None]:
# Number of distinct location keys in train and in test.
len(set(train.key)), len(set(test.key))

In [None]:
# Inspect the submission template.
submission

In [None]:
# Split the long-format training data into its two targets and put them side by
# side: one row per (key, Date) with `Fatalities` and `ConfirmedCases` columns.
cases = train[train.Target == 'ConfirmedCases'].reset_index(drop=True)
train = train[train.Target == 'Fatalities'].reset_index(drop=True)
# The two halves are combined purely by row position, so fail loudly if they do
# not describe the same (key, Date) rows in the same order. This also catches an
# accidental re-run of the cell, after which `train` no longer holds both targets.
assert cases[['key', 'Date']].equals(train[['key', 'Date']]), (
    'ConfirmedCases and Fatalities rows are not aligned on (key, Date)'
)
# Build a new frame instead of assigning into a filtered slice in place.
train = train.assign(ConfirmedCases=cases.TargetValue).rename(columns={'TargetValue': 'Fatalities'})
train

In [None]:
# Spot-check the rows whose Country_Region is 'US'.
train[train.Country_Region=='US']

In [None]:
# Reshape to wide form: one row per location key, one column per date.
# pivot() is called with keyword arguments because positional index/columns/values
# are deprecated and are keyword-only in pandas 2.x.
cases = train.pivot(index='key', columns='Date', values='ConfirmedCases')
fatalities = train.pivot(index='key', columns='Date', values='Fatalities')
# Tag each row with its target so both blocks can be stacked into one matrix
# and told apart again later.
cases.index += ' cases'
fatalities.index += ' fatal'
# Fatalities are multiplied by 10 before stacking; the same factor has to be
# divided out again wherever the stacked forecasts are split back apart.
combined = pd.concat([cases, 10 * fatalities])
# Optional: restrict the history to the most recent 45 dates.
#combined = combined.iloc[:, -45:]
combined

In [None]:
# Sum each date over all keys, accumulate over time, and plot both targets on a
# shared log-scale axis.
cases.sum().cumsum().plot(label='Confirmed cases', legend=True)
fatalities.sum().cumsum().plot(label='Fatalities', legend=True, title='COVID19 Global Confirmed Cases and Fatalities', logy=True);

In [None]:
# Sanity check: fit and simulate the per-date total of confirmed cases.
m, f = fit_simulate(cases.sum())
# First 30 simulated steps of every path, floored at zero, drawn faintly.
f[:30].clip(0).plot(title='Aggregate check on new cases', legend=False, alpha=0.1)
# Last 45 observed totals for comparison.
cases.sum()[-45:].plot()
m.summary()

In [None]:
# Sanity check: fit and simulate the per-date total of fatalities.
m, f = fit_simulate(fatalities.sum())
# First 30 simulated steps of every path, floored at zero, drawn faintly.
f[:30].clip(0).plot(title='Aggregate check on new deaths', legend=False, alpha=0.1)
# Last 45 observed totals for comparison.
fatalities.sum()[-45:].plot()
m.summary()

In [None]:
# Compress the (dates x series) matrix into 100 latent factors.
# TruncatedSVD uses a randomized solver by default, so random_state is pinned to
# make the factors (and everything derived from them) repeatable between runs.
svd = TruncatedSVD(n_components=100, random_state=0)
# Negative values are floored at zero before the fit. The result is transposed so
# that each row is one factor and each column one date.
svd_factors = pd.DataFrame(svd.fit_transform(combined.clip(0).T).T, columns=combined.columns)
svd_factors

In [None]:
# Explained-variance share of the first five components, and the total over all components.
svd.explained_variance_ratio_[:5].round(3), sum(svd.explained_variance_ratio_).round(5)

In [None]:
# Plot the first five factor rows over time.
svd_factors.T.iloc[:, :5].plot(title='Top five SVD components');

In [None]:
# Fit and simulate factor 0 on its own.
m, f = fit_simulate(svd_factors.loc[0, :])
# First 30 simulated steps of every path, drawn faintly.
f[:30].plot(title='Projected component 0', legend=False, alpha=0.05)
# Last 45 observed values of the factor for comparison.
svd_factors.iloc[0, -45:].plot()
m.summary()

In [None]:
# Fit and simulate factor 1 on its own.
m, f = fit_simulate(svd_factors.loc[1, :])
# First 30 simulated steps of every path, drawn faintly.
f[:30].plot(title='Projected component 1', legend=False, alpha=0.05)
# Last 45 observed values of the factor for comparison.
svd_factors.iloc[1, -45:].plot()
m.summary()

In [None]:
%%time
# Fit and simulate every SVD factor; the result is one frame per simulated path.
sim_list = simulate_all(svd_factors)
sim_list[0].head()

In [None]:
# Map every simulated factor path back to the original series space. Rows are the
# stacked series keys, columns are the simulated dates, and values are floored at zero.
proj_list = [
    pd.DataFrame(
        svd.inverse_transform(sim).T,
        index=combined.index,
        columns=sim.index,
    ).clip(0)
    for sim in sim_list
]
proj_list[0].head()

In [None]:
# Stack all projected simulations into a (simulation, series, date) array and
# take the 5%, 50% and 95% quantiles across simulations.
template = proj_list[0]
proj_array = np.stack([frame.values for frame in proj_list])
proj_quantiles = np.quantile(proj_array, [0.05, 0.50, 0.95], axis=0)
# One labelled frame per quantile, in the same order as the quantile list above.
forecast_combined_05, forecast_combined_50, forecast_combined_95 = (
    pd.DataFrame(layer, index=template.index, columns=template.columns)
    for layer in proj_quantiles
)
forecast_combined_50

In [None]:
# The stacked forecast holds the case rows first and the fatality rows after them,
# so split by position at len(cases).
n_case_rows = len(cases)
quantile_frames = (forecast_combined_05, forecast_combined_50, forecast_combined_95)
forecast_cases_05, forecast_cases_50, forecast_cases_95 = (
    frame.iloc[:n_case_rows, :] for frame in quantile_frames
)
# Fatality rows are divided by 10 on the way out.
forecast_fatalities_05, forecast_fatalities_50, forecast_fatalities_95 = (
    frame.iloc[n_case_rows:, :] / 10 for frame in quantile_frames
)
forecast_fatalities_50

In [None]:
# Last 45 observed per-date totals, followed by the first 30 forecast dates of each
# quantile frame summed over all keys.
cases.sum()[-45:].plot()
forecast_cases_05.sum()[:30].plot(title='Daily confirmed cases (including double-counted aggregates)');
forecast_cases_50.sum()[:30].plot();
forecast_cases_95.sum()[:30].plot();

In [None]:
# Last 45 observed per-date totals, followed by the first 30 forecast dates of each
# quantile frame summed over all keys.
fatalities.sum()[-45:].plot()
forecast_fatalities_05.sum()[:30].plot(title='Daily confirmed fatalities (including double-counted aggregates)');
forecast_fatalities_50.sum()[:30].plot();
forecast_fatalities_95.sum()[:30].plot();

In [None]:
# Summed fatalities divided by summed cases per date: last 45 observed dates, then
# the first 30 forecast dates for each quantile frame.
# NOTE(review): each forecast line divides one quantile frame by another, which is a
# ratio of quantiles rather than a quantile of the simulated ratio.
(fatalities.sum() / cases.sum())[-45:].plot();
(forecast_fatalities_05.sum() / forecast_cases_05.sum())[:30].plot(title='Aggregated daily fatalities as proportion of confirmed cases')
(forecast_fatalities_50.sum() / forecast_cases_50.sum())[:30].plot()
(forecast_fatalities_95.sum() / forecast_cases_95.sum())[:30].plot()

In [None]:
def _melt_forecast(wide, value_name):
    """Turn a wide (key x date) forecast frame into long (key, Date, value) rows.

    Parameters
    ----------
    wide : pandas.DataFrame
        Forecast frame whose index is named 'key' and whose columns are dates.
    value_name : str
        Name of the value column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'key', 'Date' and ``value_name``.
    """
    long = wide.reset_index().melt('key', var_name='Date', value_name=value_name)
    # Drop the six-character target tag (' cases' / ' fatal') from the end of each
    # key so it matches the plain location keys again.
    long['key'] = long['key'].str[:-6]
    # After reset_index() the column labels are a mixed object index, so the melted
    # 'Date' column may come back as object dtype. Coerce it so a later merge
    # against a datetime64 'Date' column works.
    long['Date'] = pd.to_datetime(long['Date'])
    return long


cases_melt_05 = _melt_forecast(forecast_cases_05, 'ConfirmedCases')
fatalities_melt_05 = _melt_forecast(forecast_fatalities_05, 'Fatalities')

cases_melt_50 = _melt_forecast(forecast_cases_50, 'ConfirmedCases')
fatalities_melt_50 = _melt_forecast(forecast_fatalities_50, 'Fatalities')

cases_melt_95 = _melt_forecast(forecast_cases_95, 'ConfirmedCases')
fatalities_melt_95 = _melt_forecast(forecast_fatalities_95, 'Fatalities')

fatalities_melt_95

In [None]:
# Split the test rows by target so each half can be matched to its own forecasts.
test_cases = test[test.Target == 'ConfirmedCases']
test_fatalities = test[test.Target == 'Fatalities']
test_fatalities

In [None]:
def _attach_forecast(test_part, melted, value_col):
    """Left-join long-format forecasts onto test rows as a 'ForecastValue' column.

    Rows are matched on ('key', 'Date'). Test rows without a matching forecast
    keep a missing 'ForecastValue'.
    """
    joined = test_part.merge(melted, how='left', on=['key', 'Date'])
    return joined.rename(columns={value_col: 'ForecastValue'})


test_cases_05 = _attach_forecast(test_cases, cases_melt_05, 'ConfirmedCases')
test_fatalities_05 = _attach_forecast(test_fatalities, fatalities_melt_05, 'Fatalities')

test_cases_50 = _attach_forecast(test_cases, cases_melt_50, 'ConfirmedCases')
test_fatalities_50 = _attach_forecast(test_fatalities, fatalities_melt_50, 'Fatalities')

test_cases_95 = _attach_forecast(test_cases, cases_melt_95, 'ConfirmedCases')
test_fatalities_95 = _attach_forecast(test_fatalities, fatalities_melt_95, 'Fatalities')

test_fatalities_95

In [None]:
# Recombine the case and fatality rows for each quantile.
test_05 = pd.concat([test_cases_05, test_fatalities_05])
test_50 = pd.concat([test_cases_50, test_fatalities_50])
test_95 = pd.concat([test_cases_95, test_fatalities_95])
test_50

In [None]:
# Append the quantile label to every ForecastId so the ids can be matched against
# the submission's ForecastId_Quantile column.
for frame, suffix in ((test_05, '_0.05'), (test_50, '_0.5'), (test_95, '_0.95')):
    frame['ForecastId'] = [str(fid) + suffix for fid in frame['ForecastId']]
# Stack the three quantiles and rename the id column accordingly.
test_all = pd.concat([test_05, test_50, test_95]).rename(
    columns={'ForecastId': 'ForecastId_Quantile'}
)
test_all

In [None]:
# Replace the template's TargetValue with the forecast that matches each
# ForecastId_Quantile.
forecast_lookup = test_all.loc[:, ['ForecastId_Quantile', 'ForecastValue']]
submission = (
    submission
    .merge(forecast_lookup, how='left', on='ForecastId_Quantile')
    .drop(columns='TargetValue')
    .rename(columns={'ForecastValue': 'TargetValue'})
)
# Rows without a matching forecast are submitted as 0.
submission['TargetValue'] = submission['TargetValue'].fillna(0)
submission

In [None]:
# Write the final submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Show the last ten submission rows.
submission.tail(10)