In [1]:
# THIS FILE PREPARES THE COLLECTED DATA FOR THE MACHINE LEARNING ALGORITHM, IT ALSO DOES THE TRAINING,
# VALIDATION AND TEST SET CATEGORIZATION

import os
import datetime
from datetime import date, timedelta, timezone
from sklearn.impute import KNNImputer
import time
import pandas as pd
import numpy as np
pd.options.display.max_rows = 10
import matplotlib.pyplot as plt

cantonKeys = ['AG','AI','AR', 'BE', 'BL', 'BS', 'FR', 'GE', 'GL', 'GR', 'JU', 'LU', 'NE', 'NW', 'OW', 'SG', 'SH', 'SO', 'SZ', 'TG', 'TI', 'UR', 'VD', 'VS', 'ZG','ZH']
googleMobDict = dict(zip(cantonKeys,["Aargau","Appenzell Innerrhoden","Appenzell Ausserrhoden","Canton of Bern","Basel-Landschaft","Basel City",
                                                    "Fribourg","Geneva","Glarus","Grisons","Jura","Lucerne","Neuchâtel","Nidwalden","Obwalden","St. Gallen",
                                                    "Schaffhausen","Solothurn","Schwyz","Thurgau","Ticino","Uri","Vaud","Valais","Canton of Zug","Zurich"]))
def getDays(year, offset):
   d = date(year, 1, 1)                    
   d += timedelta(days = offset - d.weekday())  
   while d.year == year:
      yield d
      d += timedelta(days = 7)

listOfMondays = []
for year in [2020,2021]:
    for day in getDays(year, 7):
       listOfMondays.append(day)
    
def addZero(x):
    if len(x)==1:
        return "0"+x
    else:
        return x

yearWeek = [str(x.isocalendar()[0])+addZero(str(x.isocalendar()[1])) for x in listOfMondays]



mondaysByWeekNr = dict(zip(yearWeek,listOfMondays))


# we discard the first days of 2020 and future data
yesterday = datetime.date.today()-timedelta(days = 1)
yesterdayStr = str(yesterday)
start = '2020-02-15'
end = '2021-06-25' #use '2021-05-11' for testing and yesterdayStr for production

In [None]:
# MERGE THE COLLECTED DATA TO ONE FILE FOR EACH CANTON
variantList = []


for cantonId in cantonKeys:
    df = pd.DataFrame(index=pd.date_range(start=datetime.datetime(2020, 1, 1), end=datetime.datetime(2021, 12, 31)))
    
    # weekly age classified FOPH data
    for category in ['Cases','Death','Hosp']: 
        age = pd.read_csv("data/FOPH/data/COVID19"+category+"_geoRegion_AKL10_w.csv")
        age[['datum']] = age[['datum']].applymap(lambda x: mondaysByWeekNr[str(x)])
        age = age.loc[age['geoRegion']==cantonId][['datum','altersklasse_covid19','inz_entries','inzsumTotal']]
        age = age.pivot(index="datum", columns="altersklasse_covid19")
        age.columns = [category+" "+' '.join(col) for col in age.columns.values]
        age = age.drop([category+' inz_entries Unbekannt', category+' inzsumTotal Unbekannt'], axis=1)
        df = df.join(age)
    
    # weekly gender classified FOPH data
    for category in ['Cases','Death','Hosp']: 
        gender = pd.read_csv("data/FOPH/data/COVID19"+category+"_geoRegion_sex_w.csv")
        gender[['datum']] = gender[['datum']].applymap(lambda x: mondaysByWeekNr[str(x)])
        gender = gender.loc[gender['geoRegion']==cantonId][['datum','sex','inz_entries','inzsumTotal']]
        gender = gender.pivot(index="datum", columns="sex")
        gender.columns = [category+" "+' '.join(col) for col in gender.columns.values]
        gender = gender.drop([category+' inz_entries unknown', category+' inzsumTotal unknown'],axis=1)
        df = df.join(gender)
      
    # COVID19VaccPersons.csv
    vacc = pd.read_csv("data/FOPH/data/COVID19VaccPersons.csv")
    temp1Dosis = vacc[(vacc['geoRegion']==cantonId) & (vacc['type']=="COVID19AtLeastOneDosePersons")]
    temp2Dosis = vacc[(vacc['geoRegion']==cantonId) & (vacc['type']=="COVID19FullyVaccPersons")]
    temp1Dosis = temp1Dosis[['date','per100PersonsTotal']]
    temp2Dosis = temp2Dosis[['date','per100PersonsTotal']]
    temp1Dosis = temp1Dosis.set_index('date')
    temp2Dosis = temp2Dosis.set_index('date')
    temp1Dosis.rename(columns = {"per100PersonsTotal":'AtLeastOneDosePersons per100PersonsTotal'}, inplace = True)
    temp2Dosis.rename(columns = {"per100PersonsTotal":'COVID19FullyVaccPersons per100PersonsTotal'}, inplace = True)
    temp1Dosis.index = pd.to_datetime(temp1Dosis.index) 
    temp2Dosis.index = pd.to_datetime(temp2Dosis.index)
    df = df.join(temp1Dosis)
    df = df.join(temp2Dosis)
    
    
    # daily hospital capacity FOPH data
    capacity = pd.DataFrame()
    if (cantonId=='AI'):
        capacity = pd.read_csv("static_data/historicHospitalCapacities/hospCapacitiesForAI.csv")
    else:
        capacity = pd.read_csv("data/FOPH/data/COVID19HospCapacity_geoRegion.csv")
    
    capacity = capacity.loc[capacity['geoRegion']==cantonId]
    capacity = capacity.set_index('date').sort_index()
    capacity = capacity[['ICU_AllPatients','ICU_Covid19Patients','ICU_Capacity','Total_AllPatients',
                         'Total_Covid19Patients','Total_Capacity','ICU_NonCovid19Patients','ICU_FreeCapacity',
                         'Total_NonCovid19Patients','Total_FreeCapacity','type_variant']]
    capacity = capacity.drop_duplicates()
    capacity = capacity.loc[capacity['type_variant']=='fp7d']
    capacity = capacity.drop(['type_variant'], axis=1)
    capacity.index = pd.to_datetime(capacity.index) 
    df = df.join(capacity)
    
    
    # percentage of virus variants (swiss based)
    variants = pd.read_csv("data/FOPH/data/COVID19Variants_wgs.csv", low_memory=False)
    variantList = variants[(variants["data_source"]=="wgs") & (variants["variant_type"]!="other_lineages") & (variants["variant_type"]!="all_sequenced")][['variant_type']].drop_duplicates().values
    variantList = variantList.ravel().tolist()
    for v in variantList:
        #variants[variants['']]
        temp = variants[(variants['data_source']=="wgs") & (variants['variant_type']==v)]
        temp = temp.set_index('date').sort_index()[["prct"]]
        temp.rename(columns = {"prct":'prct_'+v}, inplace = True)
        temp.index = pd.to_datetime(temp.index)
        df = df.join(temp)
           
    
    # daily basis data
    # attach daily positive cases
    caseDf = pd.read_csv("data/FOPH/data/COVID19Cases_geoRegion.csv")
    caseDf = caseDf.loc[caseDf["geoRegion"]==cantonId]
    caseDf = caseDf.set_index('datum')
    interestedCols = ['entries','inz_entries','inzsumTotal']
    caseDf = caseDf[interestedCols]
    caseDf.columns = ["case_"+e for e in interestedCols]
    caseDf.index = pd.to_datetime(caseDf.index) 
    df = df.join(caseDf)
    
    # attach daily hospital cases
    hospDf = pd.read_csv("data/FOPH/data/COVID19Hosp_geoRegion.csv")
    hospDf = hospDf.loc[hospDf["geoRegion"]==cantonId]
    hospDf = hospDf.set_index('datum')
    interestedCols = ['inz_entries','inzsumTotal']
    hospDf = hospDf[interestedCols]
    hospDf.columns = ["hosp_"+e for e in interestedCols]
    hospDf.index = pd.to_datetime(hospDf.index) 
    df = df.join(hospDf)
    
    # attach daily death cases
    deathDf = pd.read_csv("data/FOPH/data/COVID19Death_geoRegion.csv")
    deathDf = deathDf.loc[deathDf["geoRegion"]==cantonId]
    deathDf = deathDf.set_index('datum')
    interestedCols = ['inz_entries','inzsumTotal']
    deathDf = deathDf[interestedCols]
    deathDf.columns = ["death_"+e for e in interestedCols]
    deathDf.index = pd.to_datetime(deathDf.index)
    df = df.join(deathDf)
    
    
    # attach daily test
    testDf = pd.read_csv("data/FOPH/data/COVID19Test_geoRegion_all.csv")
    testDf = testDf.loc[testDf["geoRegion"]==cantonId]
    testDf = testDf.set_index('datum')
    pop = testDf['pop'][0]
    interestedCols = ['entries','inz_entries','inzsumTotal']
    testDf = testDf[interestedCols]
    testDf.columns = ["test_"+e for e in interestedCols]
    testDf.index = pd.to_datetime(testDf.index)
    df = df.join(testDf)
    # compute rest of test entries
    '''
    totalTestsInSwitzerland = pd.DataFrame(index=pd.date_range(start=datetime.datetime(2020, 2, 15), end=datetime.datetime(2020, 5, 22)))
    temp = pd.read_csv("data/FOPH/data/COVID19Test_geoRegion_all.csv")
    temp = temp[temp['geoRegion']=='CHFL']
    temp = temp.set_index('datum')
    temp.index = pd.to_datetime(temp.index)
    totalTestsInSwitzerland = totalTestsInSwitzerland.join(temp)  
    totalTestsInSwitzerland = totalTestsInSwitzerland[['entries']]
    totalTestsInSwitzerland.fillna(method='bfill', inplace=True)
    testsByCanton = pd.read_csv("data/FOPH/data/COVID19Test_geoRegion_all.csv")
    testsByCanton = testsByCanton.set_index('datum')
    testsByCanton.index = pd.to_datetime(testsByCanton.index)
    sumSwitzerland = testsByCanton.loc[testsByCanton["geoRegion"]=='CHFL'][['entries']]
    sumCanton = testsByCanton.loc[testsByCanton["geoRegion"]==cantonId][['entries']]
    cantonalTestFraction = sumCanton['2020-05-23':'2020-06-05'].sum(axis=0).values[0]/sumSwitzerland['2020-05-23':'2020-06-05'].sum(axis=0).values[0]
    #multiply this with cantonal test quotient
    computedMissingEntries = totalTestsInSwitzerland['2020-02-15':'2020-05-22']*cantonalTestFraction 
    computedMissingEntries.rename(columns = {"entries":'test_entries'}, inplace = True)
    df[['test_entries']] = df[['test_entries']].fillna(computedMissingEntries[['test_entries']])
    # compute rest of test incidence
    missingTestIncidents = 100000*(df[['test_entries']]/pop)
    missingTestIncidents.rename(columns = {"test_entries":'test_inz_entries'}, inplace = True)
    df[['test_inz_entries']] = df[['test_inz_entries']].fillna(missingTestIncidents)   
    '''
    
    # attach daily R-values
    rvalueDf = pd.read_csv("data/FOPH/data/COVID19Re_geoRegion.csv")
    rvalueDf = rvalueDf.loc[rvalueDf["geoRegion"]==cantonId]
    rvalueDf = rvalueDf.set_index('date')
    interestedCols = ['median_R_mean','median_R_highHPD','median_R_lowHPD']
    rvalueDf = rvalueDf[interestedCols]
    rvalueDf.index = pd.to_datetime(rvalueDf.index)
    df = df.join(rvalueDf)
    
    # attach google mobility data
    mobDf2020 = pd.read_csv("data/GoogleMobility/2020_CH_Region_Mobility_Report.csv")
    mobDf2021 = pd.read_csv("data/GoogleMobility/2021_CH_Region_Mobility_Report.csv")
    mobDf2020 = mobDf2020.loc[mobDf2020["sub_region_1"]==googleMobDict[cantonId]].set_index('date')
    mobDf2021 = mobDf2021.loc[mobDf2021["sub_region_1"]==googleMobDict[cantonId]].set_index('date')
    interestedCols = ['retail_and_recreation_percent_change_from_baseline',
                  'grocery_and_pharmacy_percent_change_from_baseline',
                  'parks_percent_change_from_baseline',
                  'transit_stations_percent_change_from_baseline',
                  'workplaces_percent_change_from_baseline',
                  'residential_percent_change_from_baseline'
                 ]
    mobDf2020 = mobDf2020[interestedCols]
    mobDf2021 = mobDf2021[interestedCols]
    mobDf2020.index = pd.to_datetime(mobDf2020.index)
    mobDf2021.index = pd.to_datetime(mobDf2021.index)
    mobDf = mobDf2020.append(mobDf2021)
    df = df.join(mobDf)
    
    # attach KOF strigency index
    kofDf = pd.read_csv("data/KOF/KOFStrigencyIndex.csv")
    kofDf = kofDf.set_index('date')
    kofDf = kofDf[["ch.kof.stringency."+cantonId.lower()+".stringency_plus"]]
    kofDf.rename(columns = {"ch.kof.stringency."+cantonId.lower()+".stringency_plus":'kofStrigency'}, inplace = True)
    kofDf.index = pd.to_datetime(kofDf.index)
    df = df.join(kofDf)
    
    # attach all measures
    measuresDf = pd.read_csv("data/measures/"+cantonId+".csv")
    measuresDf = measuresDf.set_index('Time')
    measuresDf.index = pd.to_datetime(measuresDf.index)
    df = df.join(measuresDf)
    
    
    # attach holidays & vacations 
    holy = pd.read_csv("data/HolidayVacation/HolidayVacation.csv").set_index("date")[[cantonId]]
    holy.rename(columns = {cantonId:'isHoliday'}, inplace = True)
    holy.index = pd.to_datetime(holy.index)
    df = df.join(holy)
   
    
    # attach intervista mobility data
    averageAndMedian = pd.read_csv("data/IntervistaMobility/Mittelwerte_und_Median_pro_Tag.csv", encoding="mac_roman")
    averageAndMedian = averageAndMedian.loc[(averageAndMedian["Beschreibung"] == "Distanz") & (averageAndMedian["Typ"] == "Median")]
    averageAndMedian = averageAndMedian.set_index("Datum")
    averageAndMedian = averageAndMedian[['D-CH','F-CH', 'I-CH']]
    averageAndMedian.index = pd.to_datetime(averageAndMedian.index)
    D_CH = ['AG','AI','AR', 'BE','BL', 'BS','LU','GR','NW', 'OW', 'SG', 'SH', 'SO', 'SZ', 'TG','GL','UR','ZG','ZH']
    F_CH = ['FR', 'GE', 'JU', 'VD', 'VS', 'NE']
    #'TI'
    if cantonId in D_CH:
        df = df.join(averageAndMedian[['D-CH']])
        df.rename(columns = {'D-CH':'intervistaMob'}, inplace = True) 
    elif cantonId in F_CH:
        df = df.join(averageAndMedian[['F-CH']])
        df.rename(columns = {'F-CH':'intervistaMob'}, inplace = True) 
    else:
        # cantonId = TI
        df = df.join(averageAndMedian[['I-CH']])
        df.rename(columns = {'I-CH':'intervistaMob'}, inplace = True)
        
   
    # attach neighbor incidents (WEEKLY)
    neigbors = {
      'AG': ['BL','SO','BE','LU','ZH','Baden-Wurttemberg','ZG'],
      'AI': ['AR','SG'],
      'AR': ['AI','SG'],
      'BE': ['AG','SO','JU','NE','FR','VD','VS','UR','NW','OW','LU'], 
      'BL': ['AG','BS','SO','Baden-Wurttemberg','Grand Est'], 
      'BS': ['BL','Baden-Wurttemberg','Grand Est'], 
      'FR': ['BE','VD','NE'], 
      'GE': ['VD','Auvergne Rhone Alpes'], 
      'GL': ['SG','SZ','UR','GR'], 
      'GR': ['SG','GL','UR','TI','Vorarlberg','Lombardia','Liechtenstein'], 
      'JU': ['BL','SO','BE','NE','Grand Est','Bourgogne Franche Comte'], 
      'LU': ['AG','BE','NW', 'OW','ZG','SZ'], 
      'NE': ['JU','BE','VD','FR','Bourgogne Franche Comte'], 
      'NW': ['OW','BE','LU','SZ','UR'], 
      'OW': ['NW','LU','BE','UR'], 
      'SG': ['AI','AR','TG','ZH','SZ','GL','GR','Vorarlberg','Liechtenstein'], 
      'SH': ['TG','ZH','Baden-Wurttemberg'], 
      'SO': ['BE','JU','BL','AG','Grand Est'],
      'SZ': ['ZG','ZH','SG','GL','LU','NW','UR'], 
      'TG': ['SH','ZH','SG','Baden-Wurttemberg'], 
      'TI': ['UR','GR','Piemonte','Lombardia'], 
      'UR': ['TI','VS','GR','BE','NW','OW','SZ','GL'], 
      'VD': ['NE','GE','FR','VS','BE','Auvergne Rhone Alpes','Bourgogne Franche Comte'], 
      'VS': ['VD','BE','UR','Piemonte','Auvergne Rhone Alpes'], 
      'ZG': ['ZH','AG','LU','SZ'],
      'ZH': ['SH','AG','ZG','SZ','SG','TG','Baden-Wurttemberg']
    }
    foph = pd.read_csv("data/FOPH/data/COVID19Cases_geoRegion.csv", parse_dates=True)
    # we have to convert each date string to datetime to merge the dataframes later
    foph = foph.set_index('datum')
    foph.index = pd.to_datetime(foph.index)
    foph['rate_14_day_per_100k'] = (100000*foph['sum14d']) / foph['pop']
    foph = foph[['geoRegion','rate_14_day_per_100k']]
    foph.index = pd.to_datetime(foph.index)
    ecdc = pd.read_csv("data/ECDC/ECDCsubnationalcaseweekly.csv")
    ecdc[['year_week']] = ecdc[['year_week']].applymap(lambda x: mondaysByWeekNr[x[0:4]+x[5:7]]) # mondaysByWeekNr[x[0:4]+x[6:8]]
    ecdc = ecdc.set_index("year_week")
    ecdc = ecdc[['region_name','rate_14_day_per_100k']]
    ecdc.index = pd.to_datetime(ecdc.index)
    temp = pd.DataFrame(index=pd.date_range(start=start, end=end))
    for n in neigbors[cantonId]:
        if len(n) != 2:
            internationalRegion = ecdc.loc[ecdc['region_name']==n][['rate_14_day_per_100k']]
            temp[[n]] = internationalRegion
            temp[[n]] = temp[[n]].interpolate(method='linear')
            temp.loc[temp.index[0],[n]] = 0
        else:
            nationalRegion = foph.loc[foph['geoRegion']==n][['rate_14_day_per_100k']]
            temp[[n]] = nationalRegion
            temp.loc[temp.index[0],[n]] = 0
            temp[[n]] = temp[[n]].interpolate(method='linear')
    imputer = KNNImputer(n_neighbors=10, weights="distance")
    temp = pd.DataFrame(imputer.fit_transform(temp.values), index=temp.index, columns=temp.columns)
    temp.columns = ['incidence_'+col for col in temp.columns]
    temp['meanNeighborIncidence'] = temp.mean(axis=1)
    temp['maxNeighborIncidence'] = temp.max(axis=1)
    df = df.join(temp[['meanNeighborIncidence','maxNeighborIncidence']]) 
    
    
    # compute statistic weather for missing values
    '''
    statWeathDf = pd.read_csv("static_data/statistical_historicweather/statistical_"+cantonId+".csv")
    statWeathDf['date'] = statWeathDf.apply(lambda row: datetime.datetime(2020,int(row["month"]),int(row["day"])), axis=1)
    statWeathDf = statWeathDf.set_index('date')
    statWeathDf = statWeathDf[['temp.average_min','temp.average_max','clouds.mean','precipitation.mean']]
    statWeathDf.columns = ['temp_min', 'temp_max', 'clouds', 'precipitation']
    weather = statWeathDf
    '''
    
    # compute historic weather from stored data
    storedWeathDf = pd.read_csv("static_data/historicweather_from_19_03_2020_to_17_03_2021/"+cantonId+".csv")
    storedWeathDf = storedWeathDf.set_index('dt')
    storedWeathDf = storedWeathDf[['main.temp_min','main.temp_max','clouds.all', 'rain.1h','snow.1h']]
    storedWeathDf = storedWeathDf.fillna(0)
    storedWeathDf['precipitation'] = storedWeathDf[['rain.1h','snow.1h']].sum(axis=1)
    storedWeathDf = storedWeathDf[['main.temp_min','main.temp_max','clouds.all','precipitation']] 

    startDate = datetime.datetime(2020, 3, 19)
    endDate = datetime.datetime(2021, 2, 3)
    temp = pd.DataFrame(index=pd.date_range(start=startDate, end=endDate), columns=storedWeathDf.columns)
    for day in pd.date_range(start=startDate, end=endDate):   
        oneDay = storedWeathDf.filter(like=day.strftime('%Y-%m-%d'), axis=0)
        temp.loc[day] = {'main.temp_min': oneDay['main.temp_min'].min(), 
                         'main.temp_max': oneDay['main.temp_max'].max(), 
                         'clouds.all': oneDay['clouds.all'].mean(),
                         'precipitation': oneDay['precipitation'].sum()}
    storedWeathDf = temp
    storedWeathDf.columns = ['temp_min', 'temp_max', 'clouds', 'precipitation']
    #weather = weather.append(storedWeathDf)
    weather = storedWeathDf

    # compute historic weather from recently loaded weather update
    updateWeathDf = pd.read_csv("data/historicweatherupdate/"+cantonId+".csv")
    updateWeathDf = updateWeathDf.set_index('dt')
    if 'snow.1h' in updateWeathDf.columns:
        updateWeathDf = updateWeathDf[['main.temp_min','main.temp_max','clouds.all', 'rain.1h','snow.1h']]
    else:
        updateWeathDf['snow.1h'] = 0
        updateWeathDf = updateWeathDf[['main.temp_min','main.temp_max','clouds.all', 'rain.1h','snow.1h']]
    updateWeathDf = updateWeathDf.fillna(0)
    updateWeathDf['precipitation'] = updateWeathDf[['rain.1h','snow.1h']].sum(axis=1)
    updateWeathDf = updateWeathDf[['main.temp_min','main.temp_max','clouds.all','precipitation']] 

    endDate = datetime.datetime.strptime(updateWeathDf.index[-1], '%Y-%m-%d %H:%M:%S')
    endDate = endDate.replace(hour=0, minute=0)
    startDate = datetime.datetime(2021, 2, 4)
    temp = pd.DataFrame(index=pd.date_range(start=startDate, end=endDate), columns=updateWeathDf.columns)
    for day in pd.date_range(start=startDate, end=endDate):   
        oneDay = updateWeathDf.filter(like=day.strftime('%Y-%m-%d'), axis=0)
        temp.loc[day] = {'main.temp_min': oneDay['main.temp_min'].min(), 
                         'main.temp_max': oneDay['main.temp_max'].max(), 
                         'clouds.all': oneDay['clouds.all'].mean(),
                         'precipitation': oneDay['precipitation'].sum()}
    updateWeathDf = temp
    updateWeathDf.columns = ['temp_min', 'temp_max', 'clouds', 'precipitation']
    weather = weather.append(updateWeathDf)  
    df = df.join(weather)
    
     
    df.index.names = ["date"]
    if not os.path.exists('data/merged'):
        os.makedirs('data/merged')
    df[start:end].to_csv('data/merged/'+cantonId+'.csv')
    

In [None]:
# fill the missing data

mergedDict = {}
for cantonId in cantonKeys: 
    mergedDict[cantonId] = pd.read_csv("data/merged/"+cantonId+".csv").set_index('date')[start:end]
    mergedDict[cantonId].index = pd.to_datetime(mergedDict[cantonId].index)
    
interpolMet = 'linear'
originalDict = mergedDict.copy()

for cantonId in cantonKeys:
    filled = mergedDict[cantonId].copy()
    
    # compute statistic weather for missing values
    statWeathDf = pd.read_csv("static_data/statistical_historicweather/statistical_"+cantonId+".csv")
    statWeathDf['date'] = statWeathDf.apply(lambda row: datetime.datetime(2020,int(row["month"]),int(row["day"])), axis=1)
    statWeathDf = statWeathDf.set_index('date')
    statWeathDf = statWeathDf[['temp.average_min','temp.average_max','clouds.mean','precipitation.mean']]
    statWeathDf.columns = ['temp_min', 'temp_max', 'clouds', 'precipitation']
    filled[['temp_min', 'temp_max', 'clouds', 'precipitation']] = filled[['temp_min', 'temp_max', 'clouds', 'precipitation']].fillna(statWeathDf)
    
    # fill missing test entries
    totalTestsInSwitzerland = pd.DataFrame(index=pd.date_range(start=datetime.datetime(2020, 2, 15), end=datetime.datetime(2020, 5, 22)))
    temp = pd.read_csv("data/FOPH/data/COVID19Test_geoRegion_all.csv")
    temp = temp[temp['geoRegion']=='CHFL']
    temp = temp.set_index('datum')
    temp.index = pd.to_datetime(temp.index)
    totalTestsInSwitzerland = totalTestsInSwitzerland.join(temp)  
    totalTestsInSwitzerland = totalTestsInSwitzerland[['entries']]
    totalTestsInSwitzerland.fillna(method='bfill', inplace=True)
    testsByCanton = pd.read_csv("data/FOPH/data/COVID19Test_geoRegion_all.csv")
    testsByCanton = testsByCanton.set_index('datum')
    testsByCanton.index = pd.to_datetime(testsByCanton.index)
    sumSwitzerland = testsByCanton.loc[testsByCanton["geoRegion"]=='CHFL'][['entries']]
    sumCanton = testsByCanton.loc[testsByCanton["geoRegion"]==cantonId][['entries']]
    cantonalTestFraction = sumCanton['2020-05-23':'2020-06-05'].sum(axis=0).values[0]/sumSwitzerland['2020-05-23':'2020-06-05'].sum(axis=0).values[0]
    #multiply this with cantonal test quotient
    computedMissingEntries = totalTestsInSwitzerland['2020-02-15':'2020-05-22']*cantonalTestFraction 
    computedMissingEntries.rename(columns = {"entries":'test_entries'}, inplace = True)
    filled[['test_entries']] = filled[['test_entries']].fillna(computedMissingEntries[['test_entries']])
    # compute rest of test incidence
    missingTestIncidents = 100000*(filled[['test_entries']]/pop)
    missingTestIncidents.rename(columns = {"test_entries":'test_inz_entries'}, inplace = True)
    filled[['test_inz_entries']] = filled[['test_inz_entries']].fillna(missingTestIncidents)  
    
    
    # fill missing vaccine data
    vaccine = ['AtLeastOneDosePersons per100PersonsTotal','COVID19FullyVaccPersons per100PersonsTotal']
    filled.loc['2020-12-18',vaccine] = 0 #on 19. december 2020 swissmedic approved the first vaccine
    filled[vaccine] = filled[vaccine].interpolate(method=interpolMet)
    filled[vaccine] = filled[vaccine].fillna(method='ffill')
    filled[vaccine] = filled[vaccine].fillna(0)
     
    # fill missing total hospital capacities
    hospitalCols = ['ICU_Capacity','ICU_FreeCapacity','Total_Capacity','Total_FreeCapacity']
    filled[hospitalCols] = filled[hospitalCols].interpolate(method=interpolMet)
    filled[hospitalCols] = filled[hospitalCols].fillna(method='bfill')
    # because of Appenzell Innerrhoden
    #filled[hospitalCols] = filled[hospitalCols].fillna(0)

    # fill in missing hospital capacity datanearest
    # we make the assumption that at the beginning there were no covid patients
    filled.loc['2020-02-15',['Total_Covid19Patients','ICU_Covid19Patients']] = 0
    patientCols = ['Total_Covid19Patients','Total_NonCovid19Patients','Total_AllPatients','ICU_Covid19Patients',
                   'ICU_NonCovid19Patients','ICU_AllPatients']
    filled[patientCols] = filled[patientCols].interpolate(method=interpolMet)
    filled[['Total_NonCovid19Patients','ICU_NonCovid19Patients']] = filled[['Total_NonCovid19Patients','ICU_NonCovid19Patients']].fillna(method='bfill')
    # Covid19Patients + NonCovid19Patients = AllPatients
    filled['Total_AllPatients'].fillna(filled[['Total_Covid19Patients','Total_NonCovid19Patients']].sum(axis=1), inplace=True)
    filled['ICU_AllPatients'].fillna(filled[['ICU_Covid19Patients','ICU_NonCovid19Patients']].sum(axis=1), inplace=True)
    # because of Appenzell Innerrhoden
    #filled[patientCols] = filled[patientCols].fillna(0)
    
    # fill in missing Google mobility data
    googleMobilityCols = ['retail_and_recreation_percent_change_from_baseline','grocery_and_pharmacy_percent_change_from_baseline',
    'parks_percent_change_from_baseline','transit_stations_percent_change_from_baseline','workplaces_percent_change_from_baseline'
    ,'residential_percent_change_from_baseline']
    
    # THIS IS ONLY FOR A PLOT
    '''
    missingDf = filled.copy()
    missingDf = missingDf.reset_index()
    plt.figure(figsize=(9,4))
    missingDf = missingDf[['date','workplaces_percent_change_from_baseline']]
    plt.scatter(missingDf['date'],
            missingDf['workplaces_percent_change_from_baseline'],
           marker='o',
           color='blue',
           label='workplace mobility AI',
           alpha=1)
    plt.xlabel('date')
    plt.ylabel('percent change from baseline')
    plt.legend()
    plt.show()
    missingDf = missingDf.set_index('date')
    missingDf['swiss workplace average'] = float('nan')
    '''
    
    # use the swiss average when possible
    for year in [2020,2021]:
        googleMobCH = pd.read_csv("data/GoogleMobility/"+str(year)+"_CH_Region_Mobility_Report.csv")
        googleMobCH = googleMobCH.loc[googleMobCH["sub_region_1"].isna()].set_index('date')[googleMobilityCols]
        googleMobCH.index = pd.to_datetime(googleMobCH.index)
        for col in googleMobilityCols:
            
            filled[col] = filled[col].fillna(googleMobCH[col])
            
            # THIS IS ONLY FOR A PLOT
            '''
            if col == 'workplaces_percent_change_from_baseline':
                display(googleMobCH[col])
                missingDf['swiss workplace average'] = missingDf['swiss workplace average'].fillna(googleMobCH[col])
            '''
              
    # for the rest interpolate
    filled[googleMobilityCols] = filled[googleMobilityCols].interpolate(method=interpolMet)
    
    # THIS IS ONLY FOR A PLOT
    '''
    missingDf['complement'] = missingDf[missingDf['workplaces_percent_change_from_baseline'].isnull()]['swiss workplace average']
    missingDf = missingDf.reset_index()
    plt.figure(figsize=(9,4))
    plt.scatter(missingDf['date'],
            missingDf['workplaces_percent_change_from_baseline'],
           marker='o',
           color='blue',
           label='workplace mobility AI',
           alpha=1)
    plt.scatter(missingDf['date'],
            missingDf['complement'],
           marker='o',
           color='red',
           label='workplace mobility Swiss average',
           alpha=1)
    plt.xlabel('date')
    plt.ylabel('percent change from baseline')
    plt.legend()
    plt.show()
    '''
    
    # fill in missing Intervista mobility data 
    filled[['intervistaMob']] = filled[['intervistaMob']].interpolate(method=interpolMet)
    filled['intervistaMob'].fillna(method='ffill', inplace=True)
        
    # fill in missing r values
    rvalues = ['median_R_mean','median_R_highHPD','median_R_lowHPD']
    filled[rvalues] = filled[rvalues].interpolate(method=interpolMet)
    # move them 14 days forward to match r erstimation value with publication date
    filled[rvalues] = filled[rvalues].shift(periods=14)
    filled[rvalues] = filled[rvalues].fillna(method='ffill')
    filled[rvalues] = filled[rvalues].fillna(method='bfill')
  
    # fill in missing variants
    # first detected case for variants of concerne in Switzerland is 2020-10-14
    #variants = ['lower_ci_day','upper_ci_day','anteil_pos']
    #filled.loc['2020-10-13',['lower_ci_day','anteil_pos']] = 0
    #filled.loc['2020-10-13',['upper_ci_day']] = 100
    variants = ['prct_'+v for v in variantList]
    filled[variants] = filled[variants].interpolate(method=interpolMet)
    filled[variants] = filled[variants].fillna(method='ffill')
    filled[variants] = filled[variants].fillna(method='bfill')

    # fill in daily incoming missing data
    zeroAndInterpolate = ['case_entries','case_inz_entries','hosp_inz_entries',
                          'death_inz_entries','case_inzsumTotal','hosp_inzsumTotal','death_inzsumTotal',
                          'test_inzsumTotal','Cases inz_entries 0 - 9',
                          'Cases inz_entries 10 - 19','Cases inz_entries 20 - 29','Cases inz_entries 30 - 39',
                          'Cases inz_entries 40 - 49','Cases inz_entries 50 - 59','Cases inz_entries 60 - 69',
                          'Cases inz_entries 70 - 79','Cases inz_entries 80+','Cases inz_entries male',
                          'Cases inz_entries female','Cases inzsumTotal 0 - 9','Cases inzsumTotal 10 - 19',
                          'Cases inzsumTotal 20 - 29','Cases inzsumTotal 30 - 39','Cases inzsumTotal 40 - 49',
                          'Cases inzsumTotal 50 - 59','Cases inzsumTotal 60 - 69','Cases inzsumTotal 70 - 79',
                          'Cases inzsumTotal 80+','Cases inzsumTotal male','Cases inzsumTotal female',
                          'Death inz_entries 0 - 9',
                          'Death inz_entries 10 - 19','Death inz_entries 20 - 29','Death inz_entries 30 - 39',
                          'Death inz_entries 40 - 49','Death inz_entries 50 - 59','Death inz_entries 60 - 69',
                          'Death inz_entries 70 - 79','Death inz_entries 80+','Death inz_entries male',
                          'Death inz_entries female','Death inzsumTotal 0 - 9','Death inzsumTotal 10 - 19',
                          'Death inzsumTotal 20 - 29','Death inzsumTotal 30 - 39','Death inzsumTotal 40 - 49',
                          'Death inzsumTotal 50 - 59','Death inzsumTotal 60 - 69','Death inzsumTotal 70 - 79',
                          'Death inzsumTotal 80+','Death inzsumTotal male','Death inzsumTotal female',
                          'Hosp inz_entries 0 - 9','Hosp inz_entries 10 - 19',
                          'Hosp inz_entries 20 - 29','Hosp inz_entries 30 - 39','Hosp inz_entries 40 - 49',
                          'Hosp inz_entries 50 - 59','Hosp inz_entries 60 - 69','Hosp inz_entries 70 - 79',
                          'Hosp inz_entries 80+','Hosp inz_entries male','Hosp inz_entries female',
                          'Hosp inzsumTotal 0 - 9','Hosp inzsumTotal 10 - 19','Hosp inzsumTotal 20 - 29',
                          'Hosp inzsumTotal 30 - 39','Hosp inzsumTotal 40 - 49','Hosp inzsumTotal 50 - 59',
                          'Hosp inzsumTotal 60 - 69','Hosp inzsumTotal 70 - 79','Hosp inzsumTotal 80+',
                          'Hosp inzsumTotal male','Hosp inzsumTotal female']
    filled.loc['2020-02-15',zeroAndInterpolate] = 0
    filled[zeroAndInterpolate] = filled[zeroAndInterpolate].interpolate(method=interpolMet)
    
    tests = ['test_entries','test_inz_entries']
    filled[tests] = filled[tests].fillna(method='ffill')
    
    
    if not os.path.exists('data/filled'):
        os.makedirs('data/filled')
    filled.to_csv('data/filled/'+cantonId+'.csv')
    

In [None]:
# check complete data if there are any NaNs left
for cantonId in cantonKeys:
    filled = pd.read_csv('data/filled/'+cantonId+'.csv')
    for col in filled.columns:
        if filled[col].isna().sum() != 0:
            print(cantonId+" "+col+" (#NaN/#NotNaN): (" + str(filled[col].isna().sum())+"/"+str(filled[col].notna().sum())+")")
            #display(filled[[col]])
            #dict[cantonId][col].plot(kind='line',y=[col], figsize=(20,10))
            #plt.show()

In [None]:

# plotting original data vs filled data
#for col in dict['AG'].columns:
#    comparingDf = pd.concat([dict['AG'][[col]],originalData[["original_"+col]]], axis=1)
    #comparingDf[['original_'+col]].reset_index().plot(kind='scatter', x=['date'], y=['original_'+col], figsize=(20,10))
    #comparingDf['2020-02-15':'2021-04-05'].plot(kind='line',y=[col], figsize=(20,10))
#for col in pd.read_csv("data/filled/"+'AG'+".csv").set_index('date').columns:
#    print(col)

In [None]:
# FEATURE ENGINEERING
filledDict = {}

for cantonId in cantonKeys:
    filledDict[cantonId] = pd.read_csv("data/filled/"+cantonId+".csv").set_index('date')
    
    dailyFeatures = filledDict[cantonId].copy()
    dailyFeatures.index = pd.to_datetime(dailyFeatures.index) 

    # summarize mask mandatories
    maskMandatories = [ 'Mask mandatory in publicly accessible establishments/ spaces (shops etc.)',
                       'Mask mandatory in public transport','Masks mandatory in schools','Masks mandatory at work']
    dailyFeatures[['maskMandatories']] = dailyFeatures[maskMandatories].sum(axis=1)
    dailyFeatures.drop(maskMandatories, axis=1, inplace=True)
    
    # r value accuracy
    dailyFeatures[['R_error']] = dailyFeatures['median_R_highHPD']-dailyFeatures['median_R_lowHPD']
    dailyFeatures.drop(['median_R_highHPD','median_R_lowHPD'],axis=1, inplace=True)
    
    # test positivity rate
    dailyFeatures[['testPositvity_7dayAverage']] = (dailyFeatures['case_entries'].rolling(window=7, min_periods=1).mean()/dailyFeatures['test_entries'].rolling(window=7, min_periods=1).mean()).rolling(window=7, min_periods=1).mean()


    # hospital capacities
    hospCap = [ 'ICU_AllPatients',
     'ICU_Covid19Patients',
     'ICU_Capacity',
     'Total_AllPatients',
     'Total_Covid19Patients',
     'Total_Capacity',
     'ICU_NonCovid19Patients',
     'ICU_FreeCapacity',
     'Total_NonCovid19Patients',
     'Total_FreeCapacity']
    staticCantonal = pd.read_excel("static_data/staticCantonalData.xlsx").set_index('canton').transpose()
    dailyFeatures[[col + "_inz" for col in hospCap]] = 100000*(dailyFeatures[hospCap]/staticCantonal.loc[[cantonId]]['residents'][0])
    dailyFeatures.drop(hospCap,axis=1, inplace=True)

    # summing up age groups
    for c1 in ['Cases','Death','Hosp']:
        for c2 in ['inz_entries','inzsumTotal']:
            dailyFeatures[[c1+" "+c2+" "+"0 - 19"]] = dailyFeatures[[c1+" "+c2+" "+"0 - 9",c1+" "+c2+" "+"10 - 19"]].sum(axis=1)
            dailyFeatures[[c1+" "+c2+" "+"20 - 39"]] = dailyFeatures[[c1+" "+c2+" "+"20 - 29",c1+" "+c2+" "+"30 - 39"]].sum(axis=1)
            dailyFeatures[[c1+" "+c2+" "+"40 - 59"]] = dailyFeatures[[c1+" "+c2+" "+"40 - 49",c1+" "+c2+" "+"50 - 59"]].sum(axis=1)
            dailyFeatures[[c1+" "+c2+" "+"60+"]] = dailyFeatures[[c1+" "+c2+" "+"60 - 69",c1+" "+c2+" "+"70 - 79",c1+" "+c2+" "+"80+"]].sum(axis=1)
            for agegroup in ["0 - 9","10 - 19","20 - 29","30 - 39","40 - 49","50 - 59","60 - 69","70 - 79","80+"]:
                dailyFeatures.drop([c1+" "+c2+" "+agegroup], axis=1, inplace=True)
        
        
    toBeSmoothed = ['case_inz_entries',
                    'hosp_inz_entries',
                    'death_inz_entries',
                    'test_entries',
                    'test_inz_entries',
                    'retail_and_recreation_percent_change_from_baseline',
                     'grocery_and_pharmacy_percent_change_from_baseline',
                     'parks_percent_change_from_baseline',
                     'transit_stations_percent_change_from_baseline',
                     'workplaces_percent_change_from_baseline',
                     'residential_percent_change_from_baseline',
                     'intervistaMob']
    for f in toBeSmoothed:
        dailyFeatures[[f+"_7dayAverage"]] = dailyFeatures[[f]].rolling(window=7, min_periods=1).mean()

  
    #display([c for c in dailyFeatures.columns])
    if not os.path.exists('data/dailyFeatures'):
        os.makedirs('data/dailyFeatures')
    dailyFeatures.to_csv('data/dailyFeatures/'+cantonId+'.csv')

In [None]:
for cantonId in cantonKeys:
    data = pd.read_csv('data/dailyFeatures/'+cantonId+'.csv')
    for col in data.columns:
        if data[col].isna().sum() != 0:
            display(cantonId)
            print(col+" (#NaN/#NotNaN): (" + str(data[col].isna().sum())+"/"+str(data[col].notna().sum())+")")

In [None]:
# CONSTRUCTING INPUT/OUTPUT INTERVALS

weeksIn = 3
weeksOut = 4

listOfInputIntervals = []
listOfOutputIntervals = []
for e in pd.date_range(start=pd.Timestamp(start),end=pd.Timestamp(end), freq='D'):
    if (e+timedelta(days = (weeksIn*7)+(weeksOut*7)-1) <= pd.Timestamp(end).date()):
        tempInputList = []
        for week in range(0,weeksIn):
            tempInputList.append(((e+timedelta(days = week*7)).date(),(e+timedelta(days = ((week+1)*7)-1)).date()))
        listOfInputIntervals.append(tempInputList)
        
        tempOutputList = []
        for week in range(0,weeksOut):
            tempOutputList.append(((e+timedelta(days = weeksIn*7+week*7)).date(),(e+timedelta(days = weeksIn*7+((week+1)*7)-1)).date()))
        listOfOutputIntervals.append(tempOutputList)                      
    

display(listOfInputIntervals)





In [None]:
startTimer = time.time()
from sklearn.linear_model import LinearRegression

dailyFeaturesDict = {} 

# average features
# features which will be averaged over the whole input interval

    
# direct features
# features which will be direct input for every day of the input interval
# attention: this can potentially increase the number of input features significantly
# added features are len(directFea)*daysIn
# only add features for which have a large variance from one day to another day

# direct feautres = output features

outputFeatures = ['case_inz_entries_7dayAverage',
                  'hosp_inz_entries_7dayAverage',
                  'death_inz_entries_7dayAverage',
                  'testPositvity_7dayAverage',
                  'transit_stations_percent_change_from_baseline_7dayAverage',
                  'workplaces_percent_change_from_baseline_7dayAverage'                        
]

# CONSTRUCTING ACTUAL INPUTS
for cantonId in cantonKeys:
    display(cantonId)
    dailyFeaturesDict[cantonId] = pd.read_csv("data/dailyFeatures/"+cantonId+".csv").set_index('date')
    dailyFeaturesDict[cantonId].index = pd.to_datetime(dailyFeaturesDict[cantonId].index)
    averageFeatures = dailyFeaturesDict[cantonId].columns
    
    # we remove the 7 day average features from averagefeatures list because we do not need their mean again
    averageFeatures = [e for e in averageFeatures if e not in outputFeatures]

    
    features = pd.DataFrame()
    
    # construction of input features
    listCounter = 0
    
    for inputList in listOfInputIntervals: # for every input week set
        # we create a feature row which we append at the end to the features dataframe of the canton
        featureRow = pd.DataFrame()
        
        weekNumber = 0
        
        for inputTuple in inputList: # for all input weeks
            timeFrame = dailyFeaturesDict[cantonId][inputTuple[0]:inputTuple[1]]

            # add the mean of the week for all features
            temp = timeFrame[averageFeatures].mean().to_frame().transpose()
            temp.columns = [f + "_weekMean_" + str(weekNumber) for f in averageFeatures]
            
            
            for f in outputFeatures: # for all output features
                y = timeFrame[[f]].values
             
                # add last value of the input week
                temp[[f +"_last_"+str(weekNumber)]] = y[6][0]
                
                
                # short term trend
                temp[[f +"_shortTrend_"+str(weekNumber)]] = (y[6][0]-y[0][0])/7
                
                
                s = np.sort(y, axis=None)
                # add range of the input week
                #temp[[f +"_quartilerange_"+str(weekNumber)]] = (s[4]+s[5]/2)-(s[1]+s[2]/2)
                temp[[f +"_range_"+str(weekNumber)]] = s[6]-s[0]
                # add median of the input week
                temp[[f +"_median_"+str(weekNumber)]] = s[3]
                
            
            featureRow = pd.concat([featureRow, temp], axis = 1)
            weekNumber = weekNumber + 1
        
        # add the sum of the KOF strigency index from start of the pandemic until the last day of the input
        featureRow[['kofStrigency sumtotal']] = dailyFeaturesDict[cantonId][start:inputList[-1][1]][['kofStrigency']].sum()[0]
        
        # longterm trend
        wholeTimeFrame = dailyFeaturesDict[cantonId][inputList[0][0]:inputList[-1][1]]
        for f in outputFeatures: # for every output feature
            y = wholeTimeFrame[[f]].values
            featureRow[[f +"_longtermTrend"]] = (y[20][0]-y[0][0])/21 
        
        # limited future features (only one future week, but only week mean)
        limitedFutureFeatures = ['temp_min','temp_max','clouds','precipitation', 'kofStrigency',
                         'Borders','Events','Gatherings/private events',
                         'Demonstrations','Primary (includes kindergarten) and lower secondary school',
                         'Upper secondary school, vocational schools and higher education',
                         'Universities and other educational establishments',
                         'Mountain railways',
                         'Homeworking',
                         'Restaurants',
                         'Discos/Nightclubs',
                         'Shops/Markets',
                         'Penalties',
                         'Cultural, entertainment and recreational facilities',
                         'Sport/Wellness facilities',
                         'Sport activities',
                         'Religious services',
                         'Singing allowed']
        ot = listOfOutputIntervals[listCounter][0]  #select only the first future week
        timeFrameOutput = dailyFeaturesDict[cantonId][ot[0]:ot[1]]
        temp = timeFrameOutput[limitedFutureFeatures].mean().to_frame().transpose()
        temp.columns = [f + "_futureWeekMean_0" for f in limitedFutureFeatures]
        featureRow = pd.concat([featureRow, temp], axis = 1)
        
        # limited future features direct (only one future week, but all 7 entires)
        limitedFutureFeaturesDirect = ['temp_min','temp_max','clouds','precipitation']
        ot = listOfOutputIntervals[listCounter][0]  #select only the first future week
        timeFrameOutput = dailyFeaturesDict[cantonId][ot[0]:ot[1]]
        temp = pd.DataFrame(timeFrameOutput[limitedFutureFeaturesDirect].values.flatten(order='F')).transpose()
        temp.columns =  [f + '_future_day_'+str(d) for f in limitedFutureFeaturesDirect for d in range(0,7)]
        featureRow = pd.concat([featureRow, temp], axis = 1)
        
        # unlimited future features (as much weeks as selected output weeks, but only week mean)
        unlimitedFutureFeatures = ['isHoliday']
        futureWeekNumber = 0
        for outPutTuple in listOfOutputIntervals[listCounter]:
            timeFrameOutput = dailyFeaturesDict[cantonId][outPutTuple[0]:outPutTuple[1]]
            temp = timeFrameOutput[unlimitedFutureFeatures].mean().to_frame().transpose()
            temp.columns = [f + "_futureWeekMean_" + str(futureWeekNumber) for f in unlimitedFutureFeatures]
            featureRow = pd.concat([featureRow, temp], axis = 1)
            futureWeekNumber = futureWeekNumber + 1
        
        # increase the list counter to get the next corresponding output
        listCounter = listCounter + 1
        
        # add the last input day date
        lastInputDay = inputList[weeksIn-1][1]
        featureRow['lastInputDay'] = lastInputDay
        
        
        features = features.append(featureRow, ignore_index=True)
          
    staticCantonal = pd.read_excel("static_data/staticCantonalData.xlsx").set_index('canton').transpose()
    
    # households
    for h in ['2PersonHouseholds','3PersonHouseholds', '4PersonHouseholds', '5PersonHouseholds','6+PersonHouseholds']:
        features[[h+"_perc"]] = staticCantonal.loc[[cantonId]][h][0]/staticCantonal.loc[[cantonId]]['totalHousholds'][0]
    features[['averageHousehold']] = staticCantonal.loc[[cantonId]]['residents'][0]/staticCantonal.loc[[cantonId]]['totalHousholds'][0]
    # add static features
    staticFeatures = ['percentage 65 years or over','urbanPopulationPercent','homeownershipPercent', 
                      'livingSpaceInm2','carsPer1000inhabitants', 'publicTransportationPercent',
                      'privateMotorisedTransportPercent','DoctorsPer100Kinhabitants','residentsPerKm2']
    for f in staticFeatures:
        features[[f]] = staticCantonal.loc[[cantonId]][f][0]
    # construct settlement area feature
    residents = staticCantonal.loc[[cantonId]]['residents'][0]
    settlementArea = staticCantonal.loc[[cantonId]]['areaInKm2'][0]*(staticCantonal.loc[[cantonId]]['settlementAreaPercent'][0]/100)
    features[['residentsPerKm2SettlementArea']] = residents/settlementArea
    
    
    residentsInSwitzerland = staticCantonal[['residents']].sum()[0]
    residentsInCanton = staticCantonal.loc[[cantonId]]['residents'][0]
    features[['population_perc']] = residentsInCanton/residentsInSwitzerland
    
    temp = pd.DataFrame([i for i in range(0,features.shape[0])],columns =['daysSincePandemicStart'])
    features = pd.concat([features, temp], axis = 1) 
    
    #display([col for col in features])
    #display(features)
    if not os.path.exists('data/features'):
        os.makedirs('data/features')
    features.to_csv('data/features/'+cantonId+'.csv', index=False)

display("----------End of evaluating (%s)----------" % (time.time() - startTimer))




In [None]:
# outputs 
outputCols = outputFeatures
# CONSTRUCTING CORRECT OUTPUTS
for cantonId in cantonKeys:
    outputs = pd.DataFrame()
    for outputList in listOfOutputIntervals:
        outputRow = pd.DataFrame()
        weekNumber = 0
        for outputTuple in outputList:
            temp = dailyFeaturesDict[cantonId][outputTuple[0]:outputTuple[1]][outputCols] 
            temp = temp.iloc[-1].to_frame().transpose() #this uses the last value of the output week
            #temp = temp.mean().to_frame().transpose() #this uses the mean of the output week
            temp = temp.reset_index(drop=True)
            temp.columns = ["output_"+f + "_" + str(weekNumber) for f in outputCols]
            weekNumber = weekNumber + 1
            outputRow = pd.concat([outputRow, temp], axis = 1)
        outputs = outputs.append(outputRow, ignore_index=True)
    
    if not os.path.exists('data/outputs'):
        os.makedirs('data/outputs')
    outputs.to_csv('data/outputs/'+cantonId+'.csv', index=False)

In [23]:
# categorizes data into test sets, validation sets and training set
import random
intervalSize = 14 #two week intervals for test/validation set 2
# test set 1 are the newest 28 days
# validation set 1 are the second newest 28 days

dfList = []


#for cantonId in cantonKeys:
for cantonId in cantonKeys:
    inputs = pd.read_csv('data/features/'+cantonId+'.csv')
    outputs = pd.read_csv('data/outputs/'+cantonId+'.csv')
    length = inputs.shape[0]
    
    df = pd.concat([inputs,outputs], axis=1)
    
    df[['cantonId']] = cantonId
    # mark all as train set (default)
    df[['category']] = 'train'
    
    # mark test set 1
    df.loc[(length-28):,['category']] = 'test 1'
    
    # mark validation set 1
    df.loc[(length-56):(length-28),['category']] = 'validation 1'
    
    dfList.append((cantonId,df))

    
def hasConflict(list1, list2):
    for i in range(0, len(list1)):
        if list1[i] == list2[i]:
            return True
    return False

def sortListBasedOnSpecificOrder(inputList, orderingList):
    newList = []
    for e in orderingList:
        # search in input list
        for i in inputList:
            if i[0] == e:
                #found
                newList.append(i)
    return newList

def printFirstsInTuple(inputList):
    for e in inputList:
        print(e[0])
    print(" ")
    

cantons = cantonKeys.copy()


random.shuffle(cantons) 
testPerm1 = cantons.copy()
display(testPerm1)


testPerm2 = cantons.copy()
while (hasConflict(testPerm1,testPerm2)):
    random.shuffle(cantons)
    testPerm2 = cantons.copy()
display(testPerm2)


validPerm1 = cantons.copy()
while (hasConflict(validPerm1,testPerm1) or hasConflict(validPerm1,testPerm2)):
    random.shuffle(cantons)
    validPerm1 = cantons.copy()
display(validPerm1)

validPerm2 = cantons.copy()
while (hasConflict(validPerm2,testPerm1) or hasConflict(validPerm2,testPerm2) or hasConflict(validPerm2,validPerm1)):
    random.shuffle(cantons)
    validPerm2 = cantons.copy()
display(validPerm2)


#printFirstsInTuple(dfList)

dfList = sortListBasedOnSpecificOrder(dfList, testPerm1)
#printFirstsInTuple(dfList)


length = dfList[0][1].shape[0]
counter = 0
cantonCounter = 0
while (counter+intervalSize-1)  < length-28:
    dfList[cantonCounter % 26][1].loc[counter:counter+intervalSize-1,['category']] = 'test 2'
    counter = counter + intervalSize
    cantonCounter = cantonCounter + 1
dfList[cantonCounter % 26][1].loc[counter:length-29,['category']] = 'test 2'
# remove first and add at the end
temp = dfList.pop(0)
dfList.append(temp)
    
 
dfList = sortListBasedOnSpecificOrder(dfList, testPerm2)

counter = 0
cantonCounter = 0
while (counter+intervalSize-1)  < length-28:
    dfList[cantonCounter % 26][1].loc[counter:counter+intervalSize-1,['category']] = 'test 2'
    counter = counter + intervalSize
    cantonCounter = cantonCounter + 1
dfList[cantonCounter % 26][1].loc[counter:length-29,['category']] = 'test 2'
# remove first and add at the end
temp = dfList.pop(0)
dfList.append(temp)


dfList = sortListBasedOnSpecificOrder(dfList, validPerm1)

# we go through the whole time intervall 2
counter = 0 
cantonCounter = 0
while (counter+intervalSize-1)  < length-56:
    dfList[cantonCounter % 26][1].loc[counter:counter+intervalSize-1,['category']] = 'validation 2'
    counter = counter + intervalSize
    cantonCounter = cantonCounter + 1  
dfList[cantonCounter % 26][1].loc[counter:length-57,['category']] = 'validation 2'
# remove first and add at the end
temp = dfList.pop(0)
dfList.append(temp)
    
dfList = sortListBasedOnSpecificOrder(dfList, validPerm2)   
    
counter = 0
cantonCounter = 0
while (counter+intervalSize-1)  < length-56:
    dfList[cantonCounter % 26][1].loc[counter:counter+intervalSize-1,['category']] = 'validation 2'
    counter = counter + intervalSize
    cantonCounter = cantonCounter + 1  
dfList[cantonCounter % 26][1].loc[counter:length-57,['category']] = 'validation 2'
# remove first and add at the end
temp = dfList.pop(0)
dfList.append(temp)

dfList = [y for (x,y) in dfList]

data = pd.concat(dfList, axis=0, ignore_index=True)
completeNumber = data.shape[0]
trainNumber = data[data['category']=='train'].shape[0]
test1Number = data[data['category']=='test 1'].shape[0]
test2Number = data[data['category']=='test 2'].shape[0]
valid1Number = data[data['category']=='validation 1'].shape[0]
valid2Number = data[data['category']=='validation 2'].shape[0]
display("Complete: "+ str(completeNumber) )
display("Training set: "+ str(trainNumber)+ "("+str(trainNumber/completeNumber)+")")
display("Test set 1: "+ str(test1Number)+ "("+str(test1Number/completeNumber)+")")
display("Test set 2: "+ str(test2Number)+ "("+str(test2Number/completeNumber)+")")
display("Validation set 1: "+ str(valid1Number)+ "("+str(valid1Number/completeNumber)+")")
display("Validation set 2: "+ str(valid2Number)+ "("+str(valid2Number/completeNumber)+")")

data.to_csv('completedata.csv', index=False)

['BE',
 'GL',
 'SH',
 'GE',
 'TI',
 'LU',
 'FR',
 'JU',
 'TG',
 'SO',
 'AR',
 'GR',
 'NE',
 'AG',
 'OW',
 'BL',
 'BS',
 'NW',
 'ZG',
 'UR',
 'AI',
 'VD',
 'ZH',
 'VS',
 'SG',
 'SZ']

['ZG',
 'SO',
 'NE',
 'GR',
 'GL',
 'AG',
 'VD',
 'TG',
 'BE',
 'GE',
 'VS',
 'AI',
 'FR',
 'SG',
 'TI',
 'SZ',
 'NW',
 'AR',
 'LU',
 'BS',
 'JU',
 'SH',
 'BL',
 'UR',
 'ZH',
 'OW']

['BL',
 'AR',
 'GL',
 'JU',
 'GR',
 'VD',
 'UR',
 'VS',
 'SO',
 'OW',
 'SZ',
 'BS',
 'TG',
 'ZH',
 'GE',
 'NE',
 'ZG',
 'SH',
 'NW',
 'BE',
 'TI',
 'FR',
 'SG',
 'AI',
 'AG',
 'LU']

['BS',
 'BL',
 'SZ',
 'TI',
 'NW',
 'ZH',
 'SG',
 'VD',
 'OW',
 'NE',
 'TG',
 'FR',
 'BE',
 'JU',
 'SO',
 'AG',
 'LU',
 'AI',
 'AR',
 'ZG',
 'GE',
 'UR',
 'VS',
 'GL',
 'SH',
 'GR']

'Complete: 11674'

'Training set: 8646(0.7406201816001371)'

'Test set 1: 702(0.060133630289532294)'

'Test set 2: 842(0.0721260921706356)'

'Validation set 1: 698(0.05979098852150077)'

'Validation set 2: 786(0.06732910741819428)'