In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import copy
import time

In [2]:
# Folder that holds one sub-folder per forecasting model
models_path = './covid19-forecast-hub/data-processed/'

# Keep only the directory entries: every sub-folder of models_path is treated
# as a model, while plain files sitting next to them are ignored
possible_models = os.listdir(models_path)
models = [entry for entry in possible_models if os.path.isdir(models_path + entry)]

In [3]:
# Evaluation windows: consecutive 7-day weeks, each described by its first
# day (start date) and its last day (end date = start date + 6 days).
# Both lists are derived from the same range so they cannot drift out of
# step with each other; change the two constants below to move or resize
# the evaluation period.
FIRST_WEEK_START = '2020-07-05'
N_WEEKS = 42

week_starts = pd.date_range(FIRST_WEEK_START, periods=N_WEEKS, freq='7D')

# Set the starting and ending dates (lists of 'YYYY-MM-DD' strings)
start_dates = week_starts.strftime('%Y-%m-%d').tolist()
end_dates = (week_starts + pd.Timedelta(days=6)).strftime('%Y-%m-%d').tolist()

# Map every week's end date to its position in the per-week arrays
dates_to_index = {end_date: i for i, end_date in enumerate(end_dates)}

# One float array per model with one slot per evaluation week; every slot
# starts out at the placeholder value -1
model_predictions = {
    model: -np.ones(len(dates_to_index))
    for model in models
}

In [4]:
# Build one independent (deep-copied) set of per-model prediction arrays for
# each forecast horizon, keyed '1 weeks ahead' up to '<max_weeks> weeks ahead'
max_weeks = 4
model_predictions_dict = {
    str(horizon) + ' weeks ahead': copy.deepcopy(model_predictions)
    for horizon in range(1, max_weeks + 1)
}

In [5]:
subtype = 'case'


def _us_point_forecasts(data_frame, subtype, max_weeks):
    """Yield ``(weeks_ahead, target_end_date, value)`` for US point forecasts.

    For every horizon from 1 to ``max_weeks`` the forecast is taken from the
    first row whose location is 'US', whose type is 'point' and whose target
    is '<N> wk ahead inc <subtype>'.

    If the file reports no '<N> wk ahead inc <subtype>' target at all (for
    any location or type) and N > 1, the incident value is derived instead as
    the difference between the '<N> wk ahead cum <subtype>' and the
    '<N-1> wk ahead cum <subtype>' US point forecasts.

    Horizons for which no value can be determined are skipped, so a missing
    forecast never produces a placeholder entry.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        One submission file; must have the columns 'location', 'type',
        'target', 'target_end_date' and 'value'.
    subtype : str
        Quantity being forecast, used as the last word of the target name.
    max_weeks : int
        Largest horizon (in weeks) to extract.
    """
    # Computed once per file instead of once per horizon
    reported_targets = set(data_frame['target'].unique())
    us_point_rows = data_frame.loc[
        (data_frame['location'] == 'US') & (data_frame['type'] == 'point')
    ]
    us_point_targets = us_point_rows['target']

    for weeks_ahead in range(1, max_weeks + 1):
        inc_target = str(weeks_ahead) + ' wk ahead inc ' + subtype

        # Check if they report '<N> wk ahead inc <subtype>'
        if inc_target in reported_targets:
            rows = us_point_rows.loc[us_point_targets == inc_target]
            if len(rows) > 0:
                yield (weeks_ahead,
                       rows['target_end_date'].values[0],
                       rows['value'].values[0])

        # If not, derive it from '<N> wk ahead cum' minus '<N-1> wk ahead cum'
        elif weeks_ahead > 1:
            cum_now = us_point_rows.loc[
                us_point_targets == str(weeks_ahead) + ' wk ahead cum ' + subtype
            ]
            cum_prev = us_point_rows.loc[
                us_point_targets == str(weeks_ahead - 1) + ' wk ahead cum ' + subtype
            ]
            # Both cumulative forecasts are required; subtracting only the
            # first rows avoids a shape-mismatch error when the two
            # selections contain a different number of rows
            if len(cum_now) > 0 and len(cum_prev) > 0:
                yield (weeks_ahead,
                       cum_now['target_end_date'].values[0],
                       cum_now['value'].values[0] - cum_prev['value'].values[0])


start_time = time.time()
# For every model, get all the csv files that they have submitted
for model_folder in models:
    model_dir = os.path.join(models_path, model_folder)
    # os.listdir returns entries in arbitrary order. Sorting makes the result
    # reproducible when several files forecast the same target date: the file
    # that sorts last wins (presumably the most recent submission, assuming
    # file names start with the forecast date -- TODO confirm).
    for file_name in sorted(os.listdir(model_dir)):
        # Verify that this is not the metadata
        if not (file_name.startswith('202') and file_name.endswith('csv')):
            continue

        # Load the csv file.
        # NOTE(review): 'location' is forced to text because the column
        # appears to mix 'US' with numeric codes, which is presumably what
        # triggered the pandas DtypeWarning seen on earlier runs -- confirm.
        data_frame = pd.read_csv(os.path.join(model_dir, file_name),
                                 dtype={'location': str})

        for weeks_ahead, date, value in _us_point_forecasts(data_frame, subtype, max_weeks):
            # Check which entry of the predictions to modify; forecasts for
            # weeks outside the evaluation period are ignored
            if date in dates_to_index:
                key_name = str(weeks_ahead) + ' weeks ahead'
                index = dates_to_index[date]
                model_predictions_dict[key_name][model_folder][index] = value
# Elapsed wall-clock time in seconds
print(time.time()-start_time)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


916.723494052887


In [8]:
# Load the real data
ground_truth_path = './covid19-forecast-hub/data-truth/truth-Incident Cases.csv'
# NOTE(review): 'location' is forced to text because the column appears to
# mix 'US' with numeric codes, which is presumably what triggered the pandas
# DtypeWarning seen on earlier runs -- confirm against the file.
ground_truth_dataframe = pd.read_csv(ground_truth_path, dtype={'location': str})
# Get the us data only
us_flag = ground_truth_dataframe['location_name'] == 'US'
gt_us_dataframe = ground_truth_dataframe[us_flag]
# Renumber the remaining rows 0..n-1 so that index labels equal row positions
# (the previous index is kept as an 'index' column)
gt_us_dataframe = gt_us_dataframe.reset_index()

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Get the ground truth data: the total of the daily values inside each
# evaluation week (start date through end date, both inclusive)
ground_truth = -1*np.ones(len(dates_to_index))
gt_dates = gt_us_dataframe['date']
gt_values = gt_us_dataframe['value'].values

for i, (start_date, end_date) in enumerate(zip(start_dates, end_dates)):
    # 'YYYY-MM-DD' strings sort chronologically, so a plain string range
    # check selects the week. Selecting by date (rather than by "the 7 rows
    # before the end date") stays correct even if the rows are not sorted.
    in_week = gt_dates.between(start_date, end_date).values
    n_days = int(in_week.sum())

    # Fail loudly instead of summing a partial week or the wrong rows
    if n_days != 7:
        raise ValueError(
            'Expected 7 daily ground-truth rows between ' + start_date +
            ' and ' + end_date + ', found ' + str(n_days)
        )

    ground_truth[i] = np.sum(gt_values[in_week])

In [10]:
# Write the weekly ground truth to a csv file: a header line followed by one
# '<start date>,<end date>,<weekly total>' row per evaluation week.
# NOTE(review): the header separates its names with ', ' (comma + space)
# while the data rows use a bare ','; readers of this file may see column
# names with a leading space -- confirm before changing, consumers may rely on it.
with open('Ground_truth_Cases.csv', 'w') as out_file:
    out_file.write('Start_date, End_date, Prediction\n')
    out_file.writelines(
        ','.join((start_d, end_d, str(gt))) + '\n'
        for start_d, end_d, gt in zip(start_dates, end_dates, ground_truth)
    )

In [11]:
# Save the CDC predictions into a pickle file.
# The with-block closes the file handle as soon as the dump finishes, so the
# data is flushed to disk even if the kernel keeps running afterwards.
with open('CDC_predictions_Cases_Neurips.pkl', 'wb') as pickle_file:
    pickle.dump(model_predictions_dict, pickle_file)