In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load data

In [None]:
def load_data():
    """Read the week-4 competition train and test CSV files.

    Returns:
        A ``(train, test)`` tuple of DataFrames. ``train`` is read with ``Id``
        as its index and is then re-indexed by the parsed ``Date`` column;
        ``test`` is indexed by ``ForecastId`` and keeps ``Date`` as a parsed
        datetime column.
    """
    train_by_id = pd.read_csv(
        '../input/covid19-global-forecasting-week-4/train.csv',
        index_col='Id',
        parse_dates=['Date'],
    )
    test_frame = pd.read_csv(
        '../input/covid19-global-forecasting-week-4/test.csv',
        index_col='ForecastId',
        parse_dates=['Date'],
    )
    return train_by_id.set_index('Date'), test_frame

# Load the competition frames, then the sample submission keyed by ForecastId.
train, test = load_data()
submission = pd.read_csv(
    '../input/covid19-global-forecasting-week-4/submission.csv'
).set_index('ForecastId')

In [None]:
# First date of the forecast window; training rows are the ones strictly before it.
TEST_START_DATE = pd.Timestamp('2020-04-02')
# Latest date present in the training frame's Date index.
GROUND_TRUTH_END_DATE = pd.Timestamp(train.index.max())
# Used further down as the exclusive upper bound of the validation slice.
PRIVATE_START_DATE = pd.Timestamp('2020-04-16')
# Last date of the forecast window (inclusive).
TEST_END_DATE = pd.Timestamp('2020-05-14')

# Number of calendar days in the forecast window, both end dates included.
predict_length = (TEST_END_DATE - TEST_START_DATE).days + 1

In [None]:
# Re-read train/test from disk, replacing the frames loaded earlier in the notebook.
train, test = load_data()

# Preprocess

1. Replace NaNs in `Province_State` with the string `NA`

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object and is not
# guaranteed to update the parent DataFrame (it does not under pandas
# Copy-on-Write), which would leave NaN provinces unmatched by later filters.
train['Province_State'] = train['Province_State'].fillna('NA')
test['Province_State'] = test['Province_State'].fillna('NA')

For how many country/province pairs must we predict?

In [None]:
# Keep the first test row of every distinct (Country_Region, Province_State) pair;
# the number of rows in the result is the number of locations to forecast.
test.drop_duplicates(subset=['Country_Region', 'Province_State'])

We must predict for **313** country/province pairs.

In [None]:
# Utility Function
# Simply plot feature
def plot_feature(country_region, province_state='NA', column='ConfirmedCases', ax=None, figwidth=12):
    """Plot one column of the global ``train`` frame over time for one location.

    Args:
        country_region: Value to match in ``train['Country_Region']``.
        province_state: Value to match in ``train['Province_State']``.
        column: Name of the ``train`` column to plot.
        ax: Axes to draw on. A new figure and axes are created when omitted.
        figwidth: Width given to the figure created here; ignored when ``ax``
            is supplied by the caller.

    Returns:
        The axes that were drawn on.

    Raises:
        Exception: If no row of ``train`` matches the requested pair.
    """
    if ax is None:
        fig, ax = plt.subplots()
        fig.set_figwidth(figwidth)
    location_mask = (train['Country_Region'] == country_region) & (train['Province_State'] == province_state)
    plot_data = train.loc[location_mask, column]
    if len(plot_data) == 0:
        raise Exception(f'country - province pair ({country_region}, {province_state}) you provided was not found.')
    # plot_data is already the selected column (a Series), so plot its values
    # directly; indexing it by the column name again would raise a KeyError.
    ax.plot(plot_data.index, plot_data.values, label=f'{column} - {country_region} / {province_state}')
    # Show the label that was attached to the line above.
    ax.legend()
    ax.grid(True)
    return ax

# Strip preceding and trailing NaNs
def strip_nan(data):
    """Trim leading and trailing NaN entries from a Series.

    The bounds are the index labels of the first and last non-NaN entries (by
    position); every row whose label lies between them, inclusive, is kept, so
    NaNs in the interior are preserved.

    Args:
        data: Series to trim.

    Returns:
        The trimmed Series. An empty slice of ``data`` is returned when the
        input is empty or contains only NaN values.
    """
    notnan_index = data.index[~(data.isna())]
    if len(notnan_index) == 0:
        # Nothing to anchor the bounds on; return an empty slice instead of
        # failing with an IndexError on notnan_index[0].
        return data.iloc[0:0]
    min_index = notnan_index[0]
    max_index = notnan_index[-1]

    return data[(min_index <= data.index) & (data.index <= max_index)]

# Strip preceding zeros
def strip_preceding_zeros(data):
    """Drop the rows that come before the first non-zero entry.

    The cut-off is the smallest index label whose value differs from zero;
    every row whose label is at or after that cut-off is returned.
    """
    first_nonzero_label = data.index[data.ne(0)].min()
    keep = data.index >= first_nonzero_label
    return data[keep]


def strip_preceding_values(data, value=0):
    """Drop the rows that come before the first entry different from ``value``.

    Args:
        data: Series to trim.
        value: Filler value to strip from the start of the series.

    Returns:
        The rows of ``data`` whose index label is at or after the smallest
        label holding something other than ``value``.
    """
    differs = data != value
    first_label = data.index[differs].min()
    return data[data.index >= first_label]

# Feature Engineering

In [None]:
# Distinct (Country_Region, Province_State) pairs found in train, as a 2-D array
# with one pair per row; the loops below iterate over it.
country_province_index = train[['Country_Region', 'Province_State']].drop_duplicates().values

Create difference features: the day-over-day increment of each cumulative count within a location, offset by +1.

In [None]:
# Day-over-day increments of the cumulative counts, computed inside each
# (Country_Region, Province_State) group and shifted by +1. The first row of
# every group has no previous day and therefore stays NaN.
#
# The columns are produced as floats in one step, rather than being created as
# integer zeros and then overwritten with NaN-bearing floats (a silent dtype
# upcast that recent pandas versions deprecate).
#
# groupby(...).diff() returns its rows in the original order, so the values are
# written back positionally; the Date index is not unique across locations and
# must not be used for alignment here.
# NOTE: groupby skips rows whose key is NaN, so Province_State is expected to
# have been filled before this cell runs.
increments = (
    train.groupby(['Country_Region', 'Province_State'], sort=False)[['ConfirmedCases', 'Fatalities']]
    .diff(1)
    + 1
)
train['ConfirmedCasesIncrement'] = increments['ConfirmedCases'].values
train['FatalitiesIncrement'] = increments['Fatalities'].values

Try Exponential Smoothing

In [None]:
import itertools
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from statsmodels.tsa import ar_model, stattools

In [None]:
import ipywidgets as widgets


# Progress bar with one step per (country, province) pair; the forecasting loop
# advances it by assigning to `progressbar.value`.
progressbar = widgets.IntProgress(
    value=0,
    min=0,
    max=len(country_province_index),
    step=1,
    description='Predicting:',
    bar_style='success', 
    orientation='horizontal'
)

# Text counter shown next to the bar, in the form "<done> / <total>".
progresstext = widgets.Label(value='0 / {}'.format(len(country_province_index)))

In [None]:
# Lay the bar and its text counter out side by side; as the cell's last
# expression the box is rendered as the cell output.
widgets.HBox([progressbar, progresstext])

In [None]:
import warnings
# Suppress every warning category from here on.
warnings.simplefilter('ignore')

# NOTE(review): declared but never consulted; the ADF p-values below are only
# ranked against each other, not compared with this level.
adf_significant_level = 0.1

# Dates covered by the forecast, shared by every location and feature.
forecast_index = pd.date_range(TEST_START_DATE, TEST_END_DATE)

for i, (country_region, province_state) in enumerate(country_province_index):
    # Row selectors for this location, computed once and reused for both features.
    location_mask = (train['Country_Region'] == country_region) & (train['Province_State'] == province_state)
    test_rows = test.loc[
        (test['Country_Region'] == country_region) & (test['Province_State'] == province_state)
    ]

    for feature in ['ConfirmedCases', 'Fatalities']:
        # Increment series (+1 offset); leading entries equal to 1 are days without change.
        ground_truth = train.loc[location_mask, feature + 'Increment'].dropna()
        ground_truth = strip_preceding_values(ground_truth, value=1)

        # Only the days before the forecast window are used for fitting.
        train_data = ground_truth[ground_truth.index < TEST_START_DATE]

        # Too little history to fit anything: the submission keeps its existing values.
        if len(train_data) < 10:
            continue

        # Grid-search the Holt smoothing parameters and keep the fit with the lowest AIC.
        # NOTE(review): newer statsmodels releases name this keyword
        # `smoothing_trend`; confirm against the installed version.
        alpha_grid = np.arange(0.5, 1.0, 0.05)
        beta_grid = np.arange(0.5, 1.0, 0.05)
        holt_fits = []
        for alpha, beta in itertools.product(alpha_grid, beta_grid):
            fitted = Holt(train_data).fit(smoothing_level=alpha, smoothing_slope=beta)
            holt_fits.append((fitted.aic, fitted))
        # min() returns the first minimal entry, i.e. the same pick as a stable
        # sort, without fitting the selected model a second time.
        holt_result = min(holt_fits, key=lambda item: item[0])[1]
        holt_fcast = holt_result.forecast(predict_length)  # treated as the trend component

        # The increments carry a +1 offset, so 1 is the floor ("no new cases").
        holt_fcast[holt_fcast < 1] = 1

        # Multiplicative residual: observed increment divided by the Holt fitted value.
        noises = pd.Series(train_data.values.ravel() / holt_result.fittedvalues.values, index=train_data.index)

        # Rank the deterministic-term options by their ADF p-value. The
        # no-constant option is spelled 'nc' in older statsmodels and 'n' in
        # newer ones, so both spellings are tried; options the installed
        # version (or the series) rejects are skipped.
        trends_cdt = []
        for regression in ('ct', 'c', 'nc', 'n'):
            try:
                p_value = stattools.adfuller(noises, regression=regression)[1]
            except Exception:
                continue
            trends_cdt.append((regression, p_value))

        if len(trends_cdt) == 0:
            trend = 'urp'
        else:
            trend = sorted(trends_cdt, key=lambda x: x[1])[0][0]

        # Fallback: carry the last observed residual forward unchanged.
        # .iloc makes the positional access explicit on the date-indexed Series.
        flat_noise = pd.Series(
            data=np.ones((predict_length,)) * noises.iloc[-1],
            index=forecast_index
        )

        # No ADF regression could be evaluated: use the flat fallback.
        if trend == 'urp':
            noise_prediction = flat_noise
        else:
            # AutoReg spells the no-constant trend 'n'; passing the ADF label
            # 'nc' through would make every fit below fail.
            ar_trend = 'n' if trend in ('nc', 'n') else trend

            # Search the lag order by AIC. Each candidate keeps its own lag so
            # that a failed fit cannot shift the mapping between list position
            # and lag order.
            candidates = []
            for lags in range(min(len(train_data), 30)):
                try:
                    candidate = ar_model.AutoReg(noises.values.ravel(), lags=lags, trend=ar_trend).fit()
                    candidate_aic = candidate.aic
                except Exception:
                    continue
                if not np.isnan(candidate_aic):
                    candidates.append((candidate_aic, lags, candidate))

            if len(candidates) == 0:
                noise_prediction = flat_noise
            else:
                _, lags, noise_result = min(candidates, key=lambda item: item[0])
                noise_prediction = pd.Series(
                    data=noise_result.predict(len(noises), len(noises) + predict_length - 1, dynamic=True),
                    index=forecast_index
                )
            noise_prediction[noise_prediction < 0] = 0

        # Rebuild the cumulative series for the forecast window: start from the
        # last observed cumulative value and add the predicted increments with
        # the +1 offset removed. Summing the offset training increments instead
        # would add one extra unit per training day and drop the starting level.
        observed_total = train.loc[location_mask, feature]
        last_observed = observed_total[observed_total.index < TEST_START_DATE].iloc[-1]
        predicted_confirmed_cases = last_observed + (noise_prediction * (holt_fcast - 1)).cumsum()

        # Write the forecast into the submission rows of this location.
        submission.loc[test_rows.index, feature] = predicted_confirmed_cases[test_rows['Date']].values

    # Advance the progress display once per location, including skipped ones.
    progressbar.value = i + 1
    progresstext.value = '{} / {}'.format(i + 1, len(country_province_index))

In [None]:
# Grid of per-location ConfirmedCases forecasts: one small panel per
# (country, province) pair, filled row by row.
ncols = 6
# Derive the row count from the number of locations (ceiling division) so the
# grid always has enough panels, instead of hard-coding it.
nrows = max(1, -(-len(country_province_index) // ncols))
figwidth = 18
# squeeze=False keeps `axes` two-dimensional even when one row is enough.
fig, axes = plt.subplots(nrows, ncols, figsize=(figwidth, figwidth * nrows / ncols), squeeze=False)
for i, (country_region, province_state) in enumerate(country_province_index):
    # The boolean mask is built on `test`, which shares the ForecastId index with `submission`.
    predicted = submission.loc[(test['Country_Region'] == country_region) & (test['Province_State'] == province_state), 'ConfirmedCases']
    row, col = divmod(i, ncols)
    axes[row][col].plot(predicted)
    axes[row][col].set_title(f'{country_region} - {province_state}')

In [None]:
# Grid of per-location Fatalities forecasts: one small panel per
# (country, province) pair, filled row by row.
ncols = 6
# Derive the row count from the number of locations (ceiling division) so the
# grid always has enough panels, instead of hard-coding it.
nrows = max(1, -(-len(country_province_index) // ncols))
figwidth = 18
# squeeze=False keeps `axes` two-dimensional even when one row is enough.
fig, axes = plt.subplots(nrows, ncols, figsize=(figwidth, figwidth * nrows / ncols), squeeze=False)
for i, (country_region, province_state) in enumerate(country_province_index):
    # The boolean mask is built on `test`, which shares the ForecastId index with `submission`.
    predicted = submission.loc[(test['Country_Region'] == country_region) & (test['Province_State'] == province_state), 'Fatalities']
    row, col = divmod(i, ncols)
    axes[row][col].plot(predicted)
    axes[row][col].set_title(f'{country_region} - {province_state}')

In [None]:
# Write the submission frame to submission.csv in the working directory; the
# ForecastId index is written as the first column.
submission.to_csv('submission.csv')