In [1]:
import json
import numpy as np
import pandas as pd

from baseline_models import CovidData_Split, ModelsRegression

import warnings
warnings.filterwarnings('ignore')

In [2]:
# MAPE metric
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent) of a forecast.

    MAPE = mean(|y_true - y_pred| / |y_true|) * 100.

    Points whose true value is exactly 0 have an undefined percentage error
    (division by zero), so they are left out of the average instead of turning
    the whole score into ``inf``/``nan``.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed values.
    y_pred : array-like of numbers
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    float
        MAPE in percent, or ``nan`` when there is no point with a non-zero
        true value (including empty input).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have incompatible shapes.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Align shapes explicitly so that the boolean mask below fits both arrays.
    y_true, y_pred = np.broadcast_arrays(y_true, y_pred)

    # Percentage error is only defined where the true value is non-zero.
    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan

    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

In [3]:
# Experiment configuration: every setting a reader may want to tune lives here.
configs = {
    # --- input / output -------------------------------------------------
    'path_to_data': '../data/prepare_covid_rus_dataframe.json',
    'path_to_save': '../results/forecast_results.json',

    # --- feature / target columns and evaluation setup ------------------
    'x_name': 'Confirmed_100k',
    'y_name': 'Confirmed_100k',
    'history_size': 14,
    'n_splits': 5,
    'metric': 'mape',

    # --- region: one of df['Region'].unique() ---------------------------
    'region_name': 'Россия',

    # --- prediction depth in days: 1 or 7 or 14 or 28 --------------------
    'prediction_depth': 14,

    # --- how training and testing parts are formed ----------------------
    #   True  -> with accumulation of training data
    #   False -> with a sliding window
    'forward_chaining': True,

    # --- model: "dummy_model" or "AutoARIMA" or "ExpSmoothing" or
    #     "LinearSVR" or "LR" or "Ridge" or "Lasso" or "RandomForest" ------
    'model_name': 'ExpSmoothing',
}

In [4]:
# Read and preprocess data

# Load the prepared dataset from the JSON file named in the configs;
# the 'Date' column is parsed into datetimes while reading.
df = pd.read_json(
    path_or_buf=configs['path_to_data'],
    convert_dates=['Date'],
)
df.head()

Unnamed: 0,Date,Region,Confirmed,Recovered,Deaths,Confirmed_100k,Recovered_100k,Deaths_100k
0,2020-03-17,Хакасия,1,0,0,0.187876,0.0,0.0
1,2020-03-18,Хакасия,1,0,0,0.187876,0.0,0.0
2,2020-03-19,Хакасия,1,0,0,0.187876,0.0,0.0
3,2020-03-20,Хакасия,1,0,0,0.187876,0.0,0.0
4,2020-03-21,Хакасия,1,0,0,0.187876,0.0,0.0


In [5]:
# Recover the raw target column name. After the first run of this cell
# configs['y_name'] already carries a "_delta_<depth>" suffix (it is rewritten
# at the end of this cell); stripping that suffix keeps the cell idempotent, so
# it can be re-run without re-running the config cell first.
# NOTE: assumes no raw column name itself contains "_delta_".
raw_y_name = configs['y_name'].split('_delta_')[0]

# filtering data for a specific region
region_mask = df['Region'] == configs['region_name']
if not region_mask.any():
    raise ValueError(f"Region {configs['region_name']!r} not found in df['Region']")

# keep only that region, index the rows by date and sort them chronologically
df_region = (
    df[region_mask]
    .drop(columns=['Region'])
    .set_index('Date')
    .sort_index(ascending=True)
)

# filtering days with a target parameter value less than or equal to 0
df_region = df_region[df_region[raw_y_name] > 0]
if df_region.empty:
    raise ValueError(
        f"Region {configs['region_name']!r} has no rows with {raw_y_name!r} > 0"
    )

# interpolate values in missing or filtered days:
# re-index onto a gap-free daily calendar, then fill the created NaNs
full_range = pd.date_range(df_region.index.min(), df_region.index.max(), freq='D')
df_region = df_region.reindex(full_range).interpolate()

# prepare target Time Series (total increase in the number of confirmed cases
# over a certain period of time ("prediction_depth" days) )
new_y_name = f"{raw_y_name}_delta_{configs['prediction_depth']}"
df_region[new_y_name] = df_region[raw_y_name].diff(configs['prediction_depth'])

# downstream cells read the feature/target column names from the configs
configs['y_name'] = new_y_name
configs['x_name'] = new_y_name

# remove the first "prediction_depth" rows (their delta is NaN) and move the
# date index back into a regular 'Date' column
df_region = (
    df_region
    .iloc[configs['prediction_depth']:]
    .rename_axis('Date')
    .reset_index()
)

In [6]:
# Evaluate the configured model fold by fold and save everything needed to
# reproduce the score table and the true-vs-predicted figure.
results = []      # one {'n_fold': ..., <metric name>: score} dict per fold
pred_curve = []   # predictions of all folds, concatenated in fold order
true_curve = df_region[configs['y_name']].tolist()

covid_split = CovidData_Split(n_splits=configs['n_splits'], forward_chaining=configs['forward_chaining'])
fold_indices = covid_split.split_by_sliding_window(df_region, y_name=configs['y_name'])

for n_fold, (train_index, test_index) in enumerate(fold_indices):
    # forecast (a fresh model object is created for every fold)
    true_y, pred_y = ModelsRegression(configs=configs).fit_predict(df_region, train_index, test_index)
    # computing accuracy
    score = mean_absolute_percentage_error(true_y, pred_y)

    # for figure: before the first predictions are stored, pad the curve with
    # NaNs, one per training index of that fold
    if not pred_curve:
        pred_curve.extend([np.nan] * len(train_index))
    pred_curve.extend(pred_y)

    results.append({'n_fold': n_fold+1, configs['metric']: score})

# Write to the location declared in the configs instead of a second,
# hard-coded path, so changing configs['path_to_save'] actually takes effect.
with open(configs['path_to_save'], 'w') as f:
    json.dump({'configs': configs,
               'true_curve': true_curve,
               'pred_curve': pred_curve,
               'results': results,
              }, f)

In [7]:
# Report: one header line with the experiment setup, then one line per fold.
metric_key = configs["metric"]
header = f'Region: "{configs["region_name"]}". Metric: "{metric_key}". Model: {configs["model_name"]}'
print(header)

for fold_result in results:
    fold_number = fold_result["n_fold"]
    fold_score = fold_result[metric_key]
    print(f'\t part #{fold_number} -- {fold_score}')

print('\n Completed successfully!')

Region: "Россия". Metric: "mape". Model: ExpSmoothing
	 part #1 -- 7.719494543063432
	 part #2 -- 8.080903184558327
	 part #3 -- 4.511299712763777
	 part #4 -- 7.138571521375906

 Completed successfully!
