In [916]:
import os
import pandas as pd
pd.options.display.max_rows = 10
import matplotlib.pyplot as plt
import numpy as np
import datetime
from datetime import timedelta
# Canton abbreviations; one merged CSV per key is expected under data/merged/.
cantonKeys = ['AG','AI','AR', 'BE', 'BL', 'BS', 'FR', 'GE', 'GL', 'GR', 'JU', 'LU', 'NE', 'NW', 'OW', 'SG', 'SH', 'SO', 'SZ', 'TG', 'TI', 'UR', 'VD', 'VS', 'ZG','ZH']

# NOTE(review): `dict` shadows the builtin of the same name. The name is kept
# because the following cells refer to it.
dict = {}

# we discard the first days of 2020 and future data
for cantonId in cantonKeys: #use '2021-04-05' for testing and yesterday for production
    # The label slice runs on the string-valued date index; the explicit copy
    # detaches the window from the full frame so later writes via .loc do not
    # target a slice of another DataFrame.
    cantonDf = pd.read_csv("data/merged/"+cantonId+".csv").set_index('date')['2020-02-15':'2021-04-05'].copy()
    cantonDf.index = pd.to_datetime(cantonDf.index)
    dict[cantonId] = cantonDf

# interpolation method passed to DataFrame.interpolate in the filling cell
interpolMet = 'linear'

# Snapshot of the unfilled data. A plain dict.copy() would be shallow: both
# dicts would share the same DataFrame objects, so the in-place filling done
# later would silently alter the "original" data as well.
originalDict = {key: frame.copy() for key, frame in dict.items()}

In [917]:

yesterday = str(datetime.date.today() - timedelta(days=1))

# First day of the analysed window; series that are anchored there start at 0.
firstDay = '2020-02-15'

# weekly FOPH data, one column per measure / kind / age group or sex
fophCols = [c1 + " " + c2 + " " + c3
            for c1 in ['Cases','Death','Hosp']
            for c2 in ['entries','inz_entries','inzsumTotal']
            for c3 in ['0 - 9','10 - 19','20 - 29','30 - 39','40 - 49','50 - 59','60 - 69','70 - 79','80+',
                       'male','female']]

vaccine = ['VaccDosesAdministered sumTotal','VaccDosesAdministered per100PersonsTotal',
           'FullyVaccPersons sumTotal', 'FullyVaccPersons per100PersonsTotal']

hospitalCols = ['ICU_Capacity','ICU_FreeCapacity','Total_Capacity','Total_FreeCapacity']

googleMobilityCols = ['retail_and_recreation_percent_change_from_baseline','grocery_and_pharmacy_percent_change_from_baseline',
                      'parks_percent_change_from_baseline','transit_stations_percent_change_from_baseline',
                      'workplaces_percent_change_from_baseline','residential_percent_change_from_baseline']

rvalues = ['median_R_mean','median_R_highHPD','median_R_lowHPD']

variants = ['lower_ci_day','upper_ci_day','anteil_pos']

# daily incoming data; this list was previously only written inline in the
# .loc assignment while the interpolation referred to an undefined `entries`
entries = ['case_entries','hosp_entries','death_entries','case_inz_entries','hosp_inz_entries','death_inz_entries',
           'case_inzsumTotal','hosp_inzsumTotal','death_inzsumTotal']

# Rows without a sub-region are used as the fallback for missing cantonal
# mobility values. The file does not depend on the canton, so it is read once.
googleMobCH = pd.read_csv("data/GoogleMobility/2020_CH_Region_Mobility_Report.csv")
googleMobCH = googleMobCH.loc[googleMobCH["sub_region_1"].isna()].set_index('date')[googleMobilityCols]
googleMobCH.index = pd.to_datetime(googleMobCH.index)

os.makedirs('data/filled', exist_ok=True)

for cantonId in cantonKeys:
    # Same object as dict[cantonId]; every assignment below updates it in place.
    # Results are always assigned back instead of calling
    # `frame[col].fillna(..., inplace=True)`, which is chained assignment and
    # has no effect on the frame when pandas runs with Copy-on-Write.
    cantonDf = dict[cantonId]

    # interpolate weekly FOPH data: first row is set to 0, the rest is interpolated
    for col in fophCols:
        cantonDf.loc[firstDay, col] = 0
    cantonDf[fophCols] = cantonDf[fophCols].interpolate(method=interpolMet)

    # fill missing vaccine data: start at 0 and carry the last known value forward
    cantonDf.loc[firstDay, vaccine] = 0
    cantonDf[vaccine] = cantonDf[vaccine].ffill()

    # fill missing total hospital capacities; bfill covers the rows before the first report
    cantonDf[hospitalCols] = cantonDf[hospitalCols].interpolate(method=interpolMet).bfill()

    # <prefix>_Covid19Patients + <prefix>_NonCovid19Patients = <prefix>_AllPatients
    for prefix in ['ICU', 'Total']:
        covidCol = prefix + '_Covid19Patients'
        nonCovidCol = prefix + '_NonCovid19Patients'
        allCol = prefix + '_AllPatients'
        cantonDf.loc[firstDay, covidCol] = 0
        patientCols = [covidCol, nonCovidCol, allCol]
        cantonDf[patientCols] = cantonDf[patientCols].interpolate(method=interpolMet)
        cantonDf[nonCovidCol] = cantonDf[nonCovidCol].bfill()
        # whatever is still missing in the total is rebuilt from its two parts
        cantonDf[allCol] = cantonDf[allCol].fillna(cantonDf[[covidCol, nonCovidCol]].sum(axis=1))

    # fill in missing Google mobility data: fallback rows first, then interpolation
    for col in googleMobilityCols:
        cantonDf[col] = cantonDf[col].fillna(googleMobCH[col])
    cantonDf[googleMobilityCols] = cantonDf[googleMobilityCols].interpolate(method=interpolMet).ffill()

    # fill in missing Intervista mobility data
    cantonDf['intervistaMob'] = cantonDf['intervistaMob'].interpolate(method=interpolMet).ffill()

    # fill in missing neighbor incidence
    neighborCols = ['meanNeighborIncidence','maxNeighborIncidence']
    cantonDf.loc[firstDay, 'meanNeighborIncidence'] = 0
    cantonDf.loc[firstDay, 'maxNeighborIncidence'] = 0
    cantonDf[neighborCols] = cantonDf[neighborCols].interpolate(method=interpolMet)

    # fill in missing r values
    cantonDf[rvalues] = cantonDf[rvalues].interpolate(method=interpolMet).ffill().bfill()

    # fill in missing variants
    # first detected case for variants of concern in Switzerland is 2020-10-14
    cantonDf.loc['2020-10-13',['lower_ci_day','anteil_pos']] = 0
    cantonDf.loc['2020-10-13',['upper_ci_day']] = 100
    cantonDf[variants] = cantonDf[variants].interpolate(method=interpolMet).ffill().bfill()

    # fill in daily incoming missing data
    cantonDf.loc[firstDay, entries] = 0
    cantonDf[entries] = cantonDf[entries].interpolate(method=interpolMet)

    cantonDf.loc[firstDay,'test_inzsumTotal'] = 0
    cantonDf['test_inzsumTotal'] = cantonDf['test_inzsumTotal'].interpolate(method=interpolMet)

    cantonDf.to_csv('data/filled/'+cantonId+'.csv')

In [918]:
# List every column that still contains NaN values, per canton.
for cantonId in cantonKeys:
    cantonDf = dict[cantonId]
    for col in cantonDf.columns:
        nMissing = cantonDf[col].isna().sum()
        if nMissing == 0:
            continue
        nPresent = cantonDf[col].notna().sum()
        print(f"{cantonId} {col} (#NaN/total): ({nMissing}/{nPresent})")
        # to inspect a reported column visually:
        #cantonDf[col].plot(kind='line',y=[col], figsize=(20,10))
        #plt.show()

In [919]:
# Disabled debugging snippet: the plotting code below is wrapped in a string
# literal, so nothing in it is executed. The cell only evaluates to that string.
# The snippet would show one column of the AG data before filling (scatter,
# from originalDict) and after filling (line, from dict).
# NOTE(review): the scatter call uses x='date', while the loading cell makes
# the date the index of these frames; confirm it works before re-enabling.
'''
col = 'Cases entries male'
display(np.array(originalDict['AG'][col]))
originalDict['AG'].plot(kind='scatter', x='date', y=[col], figsize=(20,10))
dict['AG'].plot(kind='line',y=[col], figsize=(20,10))
plt.show()
'''

"\ncol = 'Cases entries male'\ndisplay(np.array(originalDict['AG'][col]))\noriginalDict['AG'].plot(kind='scatter', x='date', y=[col], figsize=(20,10))\ndict['AG'].plot(kind='line',y=[col], figsize=(20,10))\nplt.show()\n"

In [920]:

# plotting original data
# `originalData` is not defined in any visible cell, so this cell failed with a
# NameError on a fresh kernel. It is derived here from the AG snapshot, with
# the "original_" column prefix that the comparison below expects.
# NOTE(review): originalDict is assumed to hold the frames as they were before
# filling; confirm that the snapshot is not shared with `dict`.
originalData = originalDict['AG'].add_prefix('original_')

for col in dict['AG'].columns:
    if "original_"+col not in originalData.columns:
        # column only exists in the filled frame; nothing to compare against
        continue
    # filled series next to the original one, aligned on the date index
    comparingDf = pd.concat([dict['AG'][[col]],originalData[["original_"+col]]], axis=1)
    #comparingDf[['original_'+col]].reset_index().plot(kind='scatter', x=['date'], y=['original_'+col], figsize=(20,10))
    #comparingDf['2020-02-15':'2021-04-05'].plot(kind='line',y=[col], figsize=(20,10))


In [921]:


#display([col for col in pd.read_csv("data/filled/AG.csv").columns])

In [922]:
#display([col for col in features.columns])
#display(len([col for col in features.columns]))

In [923]:
# feature engineering
# Reads the filled per-canton CSVs, derives the daily feature table and writes
# it to data/dailyFeatures/<canton>.csv. After the loop `dailyFeatures` holds
# the table of the last canton in cantonKeys.
filledDict = {}

# window size (only relevant for the rolling means, which are currently disabled)
w = 7

maskMandatories = [ 'Mask mandatory in publicly accessible establishments/ spaces (shops etc.)',
                   'Mask mandatory in public transport','Masks mandatory in schools','Masks mandatory at work']

mobilityCols = ['retail_and_recreation_percent_change_from_baseline',
                'grocery_and_pharmacy_percent_change_from_baseline',
                'parks_percent_change_from_baseline',
                'transit_stations_percent_change_from_baseline',
                'workplaces_percent_change_from_baseline',
                'residential_percent_change_from_baseline']

# vaccine columns that stay in the feature table (no rolling mean applied at the moment)
vaccine = ['VaccDosesAdministered per100PersonsTotal',
           'FullyVaccPersons per100PersonsTotal']

# absolute values which are already represented by the incidence rates
absVal = ['Cases entries 0 - 9','Cases entries 10 - 19','Cases entries 20 - 29','Cases entries 30 - 39',
          'Cases entries 40 - 49','Cases entries 50 - 59','Cases entries 60 - 69','Cases entries 70 - 79',
          'Cases entries 80+','Death entries 0 - 9','Death entries 10 - 19','Death entries 20 - 29',
          'Death entries 30 - 39','Death entries 40 - 49','Death entries 50 - 59','Death entries 60 - 69',
          'Death entries 70 - 79','Death entries 80+', 'Hosp entries 0 - 9','Hosp entries 10 - 19',
          'Hosp entries 20 - 29','Hosp entries 30 - 39','Hosp entries 40 - 49','Hosp entries 50 - 59',
          'Hosp entries 60 - 69','Hosp entries 70 - 79','Hosp entries 80+', 'Cases entries female',
          'Cases entries male','Death entries female','Death entries male', 'Hosp entries female',
          'Hosp entries male','case_entries','hosp_entries','death_entries','test_entries']

# hospital capacities
hospCap = [ 'ICU_AllPatients',
 'ICU_Covid19Patients',
 'ICU_Capacity',
 'Total_AllPatients',
 'Total_Covid19Patients',
 'Total_Capacity',
 'ICU_NonCovid19Patients',
 'ICU_FreeCapacity',
 'Total_NonCovid19Patients',
 'Total_FreeCapacity']

# Static per-canton data; after the transpose the cantons are the row labels.
# The sheet is the same for every canton, so it is read once.
staticCantonal = pd.read_excel("static_data/staticCantonalData.xlsx").set_index('canton').transpose()

os.makedirs('data/dailyFeatures', exist_ok=True)

for cantonId in cantonKeys:
    filledDict[cantonId] = pd.read_csv("data/filled/"+cantonId+".csv").set_index('date')

    dailyFeatures = filledDict[cantonId].copy()

    # New columns are assigned with a plain string key. Assigning a Series to a
    # one-element list key (frame[['name']] = series) is rejected by current
    # pandas with "Columns must be same length as key".

    # summarize mask mandatories
    dailyFeatures['maskMandatories'] = dailyFeatures[maskMandatories].sum(axis=1)
    dailyFeatures = dailyFeatures.drop(columns=maskMandatories)

    # mean over the six Google mobility categories (the single columns are kept)
    dailyFeatures['googleMobility'] = dailyFeatures[mobilityCols].mean(axis=1)

    # r value accuracy: width of the HPD interval
    dailyFeatures['R_error'] = dailyFeatures['median_R_highHPD']-dailyFeatures['median_R_lowHPD']
    dailyFeatures = dailyFeatures.drop(columns=['median_R_highHPD','median_R_lowHPD'])

    # variants accuracy: width of the confidence interval
    dailyFeatures['variant_error'] = dailyFeatures['upper_ci_day']-dailyFeatures['lower_ci_day']
    dailyFeatures = dailyFeatures.drop(columns=['upper_ci_day','lower_ci_day'])

    # vaccine: only the per-100-persons columns are kept
    dailyFeatures = dailyFeatures.drop(columns=['VaccDosesAdministered sumTotal','FullyVaccPersons sumTotal'])

    # test positivity rate
    # The misspelled column name is kept on purpose: it is written to the CSV.
    # NOTE(review): rows with test_entries == 0 produce inf or NaN here.
    dailyFeatures['testPositvity'] = dailyFeatures['case_entries']/dailyFeatures['test_entries']

    # remove absolute values which are included in the incidence rates
    dailyFeatures = dailyFeatures.drop(columns=absVal)

    # hospital figures per 100000 residents
    # Label-based lookup of the scalar; the former `...['residents'][0]` was a
    # positional lookup on a label-indexed Series, which is deprecated.
    residents = staticCantonal.loc[cantonId, 'residents']
    hospPer100k = 100000*(dailyFeatures[hospCap]/residents)
    for col in hospCap:
        dailyFeatures[col + "_inz"] = hospPer100k[col]
    dailyFeatures = dailyFeatures.drop(columns=hospCap)

    dailyFeatures.to_csv('data/dailyFeatures/'+cantonId+'.csv')

In [924]:

# Show the feature names of the most recently built table, without the date index.
featureColumns = dailyFeatures.reset_index().drop(columns=['date']).columns
display(featureColumns)

Index(['Cases inz_entries 0 - 9', 'Cases inz_entries 10 - 19',
       'Cases inz_entries 20 - 29', 'Cases inz_entries 30 - 39',
       'Cases inz_entries 40 - 49', 'Cases inz_entries 50 - 59',
       'Cases inz_entries 60 - 69', 'Cases inz_entries 70 - 79',
       'Cases inz_entries 80+', 'Cases inzsumTotal 0 - 9',
       ...
       'ICU_AllPatients_inz', 'ICU_Covid19Patients_inz', 'ICU_Capacity_inz',
       'Total_AllPatients_inz', 'Total_Covid19Patients_inz',
       'Total_Capacity_inz', 'ICU_NonCovid19Patients_inz',
       'ICU_FreeCapacity_inz', 'Total_NonCovid19Patients_inz',
       'Total_FreeCapacity_inz'],
      dtype='object', length=126)

In [None]:
# Todo: add future weather as a feature