In [None]:
import pandas as pd

## Covid-19 dataset

The government-response data (`Gov_Responses2Covid19_15jul.xlsx`) was downloaded from https://github.com/simonporcher/COVID-19-Governments-Responses

In [None]:
# Columns kept from the government-responses sheet: the country key (`iso`),
# the date (`d`), the case count, and the intervention indicators.
interv_variables = [
    'iso', 'd', 'cases',
    'school', 'domestic', 'travel', 'travel_dom', 'curf', 'mass',
    'sport', 'rest', 'testing', 'masks', 'surveillance', 'state',
]

# Read the 'Dataset' sheet, restrict it to the columns above, and drop every
# row that has a missing value in any of them.
data = (
    pd.read_excel('Gov_Responses2Covid19_15jul.xlsx', sheet_name='Dataset')
    .loc[:, interv_variables]
    .dropna()
)

data

In [None]:
# Additional country-level fields can be added from the IMF WEO workbook.
#
# '--' entries are treated as missing. They are mapped to NaN while parsing
# (na_values) rather than with DataFrame.replace('--', None): pandas releases
# before 1.4 ignored the explicit None and forward-filled instead, which would
# silently give a country the previous row's value.
weo = pd.read_excel('WEOApr2023all.xlsx', sheet_name='WEOApr2023all', na_values=['--'])

# Keep only the 2020 value of the 'Population' series for each ISO code.
population_rows = weo.loc[
    weo['Subject Descriptor'] == 'Population',
    ['ISO', 'Subject Descriptor', 2020],
]

# One row per ISO code; rename so the key matches the `iso` column of `data`
# (used as the merge key) and the value column is called `pop`.
data_tmp = (
    population_rows
    .pivot(index='ISO', columns='Subject Descriptor', values=2020)
    .reset_index()
    .rename(columns={'ISO': 'iso', 'Population': 'pop'})
)
data_tmp['pop'] = data_tmp['pop'].astype(float)
data_tmp

In [None]:
# Combine the two sources on the country key; the default inner join keeps only
# countries present in both frames.
merged = data.merge(data_tmp, on='iso')
merged['d'] = pd.to_datetime(merged['d'])

# Weekly aggregation spec: case counts are summed, every other non-key column
# is averaged over the week.
# NOTE(review): summing assumes `cases` holds daily new counts; if the source
# column is cumulative, a weekly sum over-counts — confirm against the codebook.
averaged_cols = [col for col in merged.columns if col not in ['cases', 'iso', 'd']]
weekly_agg = {'cases': 'sum'}
weekly_agg.update({col: 'mean' for col in averaged_cols})

# Group by country and calendar week (pd.Grouper with freq='W'), aggregate,
# and discard rows that still contain missing values.
df = (
    merged
    .groupby(['iso', pd.Grouper(key='d', freq='W')])
    .agg(weekly_agg)
    .reset_index()
    .dropna()
)
df

In [None]:
# Add a week counter that captures time since the start of the data (cases may
# be expected to increase/decrease over time).
#
# `.dt.isocalendar().week` is the week-of-year: it restarts at 1 every ISO year
# and comes back as a nullable UInt32 column. Instead, anchor on the ISO week
# of the earliest observation and count elapsed weeks from there. Within a
# single ISO year this gives exactly the ISO week numbers; across a year
# boundary it keeps increasing (53, 54, ...) instead of wrapping. The result
# is a plain int64 column.
week_start = df['d'].dt.normalize() - pd.to_timedelta(df['d'].dt.weekday, unit='D')  # Monday of each date's ISO week
first_week_start = week_start.min()
weeks_elapsed = (week_start - first_week_start).dt.days // 7
df['week'] = first_week_start.isocalendar()[1] + weeks_elapsed

# order values by country and week number
df = df.sort_values(by=['iso', 'week'])

# map each country code to an integer task index (in order of first appearance,
# i.e. sorted country order after the sort above)
task_map = {iso: idx for idx, iso in enumerate(df['iso'].unique())}
df['task'] = df['iso'].map(task_map)

# drop columns not needed anymore
df = df.drop(columns=['d', 'iso'])

# rename the label column to 'Y' and prefix every remaining feature column
# (everything except 'Y' and 'task') with 'X_'
df = df.rename(columns={'cases': 'Y'})
df = df.rename(columns={feature: 'X_{}'.format(feature) for feature in df.keys() if feature not in ['Y', 'task']})

df

In [None]:
import sys
sys.path.insert(1, '../')  # make the parent directory importable so `utils` resolves
from utils import get_train_val_test_data

NUM_DATASETS = 6
DATASET_NAME = 'covid'
# Intervention feature columns, named as they appear in `df` after the 'X_'
# prefix has been applied. ('X_surveillance' was previously misspelled as
# 'X_surveilliance', which matches no column.)
INTERVENTIONS = ['X_school', 'X_domestic', 'X_travel', 'X_travel_dom', 'X_curf', 'X_mass', 'X_sport', 'X_rest', 'X_testing', 'X_masks', 'X_surveillance', 'X_state']

# Fail fast if any listed intervention is not actually a column of `df`.
missing_interventions = [col for col in INTERVENTIONS if col not in df.columns]
assert not missing_interventions, f'intervention columns missing from df: {missing_interventions}'

# run utility function to create train/val/test splits; both results are
# indexed by dataset number below
full_datasets, full_interv_masks = get_train_val_test_data(df, NUM_DATASETS, INTERVENTIONS)

# write each dataset and its intervention mask to CSV, without the index column
for dataset in range(NUM_DATASETS):
    full_datasets[dataset].to_csv(f'{DATASET_NAME}_dataset{dataset}.csv', index=False)
    full_interv_masks[dataset].to_csv(f'{DATASET_NAME}_dataset{dataset}_mask.csv', index=False)