In [124]:
import os
import pandas as pd
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
import numpy as np
import datetime
import time
from datetime import timedelta
cantonKeys = ['AG','AI','AR', 'BE', 'BL', 'BS', 'FR', 'GE', 'GL', 'GR', 'JU', 'LU', 'NE', 'NW', 'OW', 'SG', 'SH', 'SO', 'SZ', 'TG', 'TI', 'UR', 'VD', 'VS', 'ZG','ZH']


dict = {}
for cantonId in cantonKeys:
    dict[cantonId] = pd.read_csv("data/dailyFeatures/"+cantonId+".csv").set_index('date')
    dict[cantonId].index = pd.to_datetime(dict[cantonId].index)




In [115]:
# generate date ranges
dataStart = pd.Timestamp('2020-02-15')
dataEnd = pd.Timestamp('2021-04-05')

daysIn = 7 # last n days of input
daysOut = 7 # next n days of output

listOfInputIntervals = []
listOfOutputIntervals = []
for e in pd.date_range(start=dataStart,end=dataEnd, freq='D'):
    if (e+timedelta(days = daysIn+daysOut-1) <= dataEnd.date()):
        listOfInputIntervals.append((e.date(),(e+timedelta(days = (daysIn-1))).date()))
        listOfOutputIntervals.append(((e+timedelta(days = daysIn)).date(),(e+timedelta(days = daysIn+daysOut-1)).date()))
    
#display(listOfInputIntervals)
#display(listOfOutputIntervals)

In [117]:
######### constructing the inputs #########
for cantonId in cantonKeys:
    display(cantonId)
    features = pd.DataFrame()
    
    # construction of input features
    for t in listOfInputIntervals:
        # features which will be averaged over the whole input interval
        averageFeatures = ['Cases inz_entries 0 - 9','Cases inz_entries 10 - 19','Cases inz_entries 20 - 29',
                  'Cases inz_entries 30 - 39','Cases inz_entries 40 - 49','Cases inz_entries 50 - 59',
                  'Cases inz_entries 60 - 69','Cases inz_entries 70 - 79','Cases inz_entries 80+',
                  'Cases inzsumTotal 0 - 9','Cases inzsumTotal 10 - 19','Cases inzsumTotal 20 - 29',
                  'Cases inzsumTotal 30 - 39','Cases inzsumTotal 40 - 49','Cases inzsumTotal 50 - 59',
                  'Cases inzsumTotal 60 - 69','Cases inzsumTotal 70 - 79','Cases inzsumTotal 80+',
                  'Death inz_entries 0 - 9','Death inz_entries 10 - 19','Death inz_entries 20 - 29',
                  'Death inz_entries 30 - 39','Death inz_entries 40 - 49','Death inz_entries 50 - 59',
                  'Death inz_entries 60 - 69','Death inz_entries 70 - 79','Death inz_entries 80+',
                  'Death inzsumTotal 0 - 9','Death inzsumTotal 10 - 19','Death inzsumTotal 20 - 29',
                  'Death inzsumTotal 30 - 39','Death inzsumTotal 40 - 49','Death inzsumTotal 50 - 59',
                  'Death inzsumTotal 60 - 69','Death inzsumTotal 70 - 79','Death inzsumTotal 80+',
                  'Hosp inz_entries 0 - 9','Hosp inz_entries 10 - 19','Hosp inz_entries 20 - 29',
                  'Hosp inz_entries 30 - 39','Hosp inz_entries 40 - 49','Hosp inz_entries 50 - 59',
                  'Hosp inz_entries 60 - 69','Hosp inz_entries 70 - 79','Hosp inz_entries 80+',
                  'Hosp inzsumTotal 0 - 9','Hosp inzsumTotal 10 - 19','Hosp inzsumTotal 20 - 29',
                  'Hosp inzsumTotal 30 - 39','Hosp inzsumTotal 40 - 49','Hosp inzsumTotal 50 - 59',
                  'Hosp inzsumTotal 60 - 69','Hosp inzsumTotal 70 - 79','Hosp inzsumTotal 80+',
                  'Cases inz_entries female','Cases inz_entries male','Cases inzsumTotal female',
                  'Cases inzsumTotal male','Death inz_entries female','Death inz_entries male',
                  'Death inzsumTotal female','Death inzsumTotal male','Hosp inz_entries female',
                  'Hosp inz_entries male','Hosp inzsumTotal female','Hosp inzsumTotal male', 
                  'VaccDosesAdministered per100PersonsTotal',
                  'FullyVaccPersons per100PersonsTotal',
                  'anteil_pos',
                  'variant_error',
                  'case_inzsumTotal','hosp_inzsumTotal','death_inzsumTotal','test_inzsumTotal','case_inz_entries',
                  'hosp_inz_entries','death_inz_entries','test_inz_entries','testPositvity',
                  'median_R_mean','R_error',
                  'meanNeighborIncidence','maxNeighborIncidence',
                  'kofStrigency',
                  'Borders','Events','Gatherings/private events','Demonstrations',
                  'Primary (includes kindergarten) and lower secondary school','Upper secondary school, vocational schools and higher education',
                  'universities and other educational establishments\xa0','Mountain railways','Homeworking','Restaurants',
                  'Discos/Nightclubs','Shops/Markets','Penalties','Cultural, entertainment and recreational facilities',
                  'Sport/Wellness facilities','Sport activities','Religious services','Singing allowed','maskMandatories',
                  'ICU_AllPatients_inz','ICU_Covid19Patients_inz','ICU_Capacity_inz','Total_AllPatients_inz',
                  'Total_Covid19Patients_inz','Total_Capacity_inz','ICU_NonCovid19Patients_inz','ICU_FreeCapacity_inz',
                  'Total_NonCovid19Patients_inz','Total_FreeCapacity_inz'
                 ]

        # features which will be direct input for every day of the input interval
        # attention: this can potentially increase the number of input features significantly
        # added features are len(directFea)*daysIn
        # only add features for which have a large variance from one day to another day
        directFeatures = ['retail_and_recreation_percent_change_from_baseline',
                     'grocery_and_pharmacy_percent_change_from_baseline',
                     'parks_percent_change_from_baseline',
                     'transit_stations_percent_change_from_baseline',
                     'workplaces_percent_change_from_baseline',
                     'residential_percent_change_from_baseline',
                     'intervistaMob',
                     'isHolyday',
                     'temp_min','temp_max','clouds','precipitation']

        # average features
        featureRow = dict[cantonId][t[0]:t[1]][averageFeatures].mean().to_frame().transpose()

        # direct features
        for f in directFeatures:
            directFe = dict[cantonId][t[0]:t[1]][[f]].transpose()
            directFe.columns = [f+'_day_'+str(d) for d in range(0,daysIn)]
            directFe = directFe.reset_index().drop(['index'], axis=1)
            featureRow = pd.concat([featureRow,directFe], axis=1)
        
        
        # future features
        futureFeatures = ['temp_min','temp_max','clouds','precipitation']
        indexOfInputTuple = listOfInputIntervals.index(t)
        ot = listOfOutputIntervals[indexOfInputTuple]
        for ff in futureFeatures:
            futureFe = dict[cantonId][ot[0]:ot[1]][[ff]].transpose()
            futureFe.columns = [ff+'_future_day_'+str(d) for d in range(0,daysOut)]
            futureFe = futureFe.reset_index().drop(['index'], axis=1)
            featureRow = pd.concat([featureRow,futureFe], axis=1)
        
        features = features.append(featureRow, ignore_index=True)

    # static cantonal features
    staticCantonal = pd.read_excel("static_data/staticCantonalData.xlsx").set_index('canton').transpose()
    # households
    households = ['1PersonHouseholds', '2PersonHouseholds','3PersonHouseholds', '4PersonHouseholds', 
                  '5PersonHouseholds','6+PersonHouseholds']
    for h in households:
        features[[h+"_perc"]] = staticCantonal.loc[[cantonId]][h][0]/staticCantonal.loc[[cantonId]]['totalHousholds'][0]
    features[['averageHousehold']] = staticCantonal.loc[[cantonId]]['residents'][0]/staticCantonal.loc[[cantonId]]['totalHousholds'][0]
    # add static features
    staticFeatures = ['percentage 65 years or over','urbanPopulationPercent','homeownershipPercent', 
                      'livingSpaceInm2','carsPer1000inhabitants', 'publicTransportationPercent',
                      'privateMotorisedTransportPercent','DoctorsPer100Kinhabitants','residentsPerKm2']
    for f in staticFeatures:
        features[[f]] = staticCantonal.loc[[cantonId]][f][0]
    # construct settlement area feature
    residents = staticCantonal.loc[[cantonId]]['residents'][0]
    settlementArea = staticCantonal.loc[[cantonId]]['areaInKm2'][0]*(staticCantonal.loc[[cantonId]]['settlementAreaPercent'][0]/100)
    features[['residentsPerKm2SettlementArea']] = residents/settlementArea    
    
    if not os.path.exists('data/features'):
        os.makedirs('data/features')
    features.to_csv('data/features/'+cantonId+'.csv', index=False)
    #display(features)

'AG'

'AI'

'AR'

'BE'

'BL'

'BS'

'FR'

'GE'

'GL'

'GR'

'JU'

'LU'

'NE'

'NW'

'OW'

'SG'

'SH'

'SO'

'SZ'

'TG'

'TI'

'UR'

'VD'

'VS'

'ZG'

'ZH'

In [118]:
######### constructing the ouputs #########
for cantonId in cantonKeys:
    outputs = pd.DataFrame()
    for t in listOfOutputIntervals:
        outputCols = ['hosp_inz_entries','death_inz_entries','testPositvity','googleMobility']

        outputRow = dict[cantonId][t[0]:t[1]][outputCols].mean().to_frame().transpose()

        '''
        outputRow = pd.DataFrame()
        # direct features
        for o in outputCols:
            directOu = dict[cantonId][t[0]:t[1]][[o]].transpose()
            directOu.columns = [o+'_day_'+str(d) for d in range(0,daysOut)]
            directOu = directOu.reset_index().drop(['index'], axis=1)
            outputRow = pd.concat([outputRow,directOu], axis=1)
        '''    
        outputs = outputs.append(outputRow, ignore_index=True)
    
    if not os.path.exists('data/outputs'):
        os.makedirs('data/outputs')
    outputs.to_csv('data/outputs/'+cantonId+'.csv', index=False)
#display(outputs)




In [119]:
# test cantons are (large, medium, small) (de, fr, de) SG,NE,NW

trainCantons = [canton for canton in cantonKeys if canton not in ['SG','NE','NW']]
# construct train set
train_features = pd.DataFrame()
train_labels = pd.DataFrame()
for cantonId in trainCantons:
    train_features = train_features.append(pd.read_csv("data/features/"+cantonId+".csv"))
    train_labels = train_labels.append(pd.read_csv("data/outputs/"+cantonId+".csv"))

test_features = pd.DataFrame()
test_labels = pd.DataFrame()
# construct test set
for cantonId in ['SG','NE','NW']:
    test_features = test_features.append(pd.read_csv("data/features/"+cantonId+".csv"))
    test_labels = test_labels.append(pd.read_csv("data/outputs/"+cantonId+".csv"))

train_features.to_csv('train_features.csv', index=False)
train_labels.to_csv('train_labels.csv', index=False)
test_features.to_csv('test_features.csv', index=False)
test_labels.to_csv('test_labels.csv', index=False)

In [120]:
# output log file
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor
import lightgbm as lightgbm
import xgboost as xgb



start = time.time()
train_features = pd.read_csv('train_features.csv')
train_labels = pd.read_csv('train_labels.csv')
test_features = pd.read_csv('test_features.csv')
test_labels = pd.read_csv('test_labels.csv')



#pip = Pipeline([('std_scaler', StandardScaler())])
pip = Pipeline([('minmax_scaler', MinMaxScaler())])
featureNames = train_features.columns
train_features = pip.fit_transform(train_features)
cv = 5

display(train_labels)

Unnamed: 0,hosp_inz_entries,death_inz_entries,testPositvity,googleMobility
0,0.021429,0.000000,0.014211,5.690476
1,0.062857,0.000000,0.032002,3.857143
2,0.062857,0.000000,0.044182,3.714286
3,0.104286,0.000000,0.048949,1.619048
4,0.104286,0.000000,0.037903,2.142857
...,...,...,...,...
9264,0.768571,0.090000,0.046355,-22.166667
9265,0.667143,0.098571,0.046011,-22.166667
9266,0.611429,0.071429,0.045847,-22.166667
9267,0.564286,0.080000,0.044938,-22.166667


In [121]:
for label in ['hosp_inz_entries', 'testPositvity','googleMobility','death_inz_entries']:
    tl = pip.fit_transform(train_labels[[label]]).ravel()

    # SVR
    display("---SVR---"+label+"---")
    start = time.time()
    parameters = {'C':[0.1, 1]}
    reg = GridSearchCV(svm.SVR(), parameters, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    reg.fit(train_features, tl)
    display(reg.best_estimator_)
    scores = pd.DataFrame(reg.cv_results_).loc[[reg.best_index_]][['split'+str(i)+'_test_score' for i in range(0,cv)]].values
    scores = np.sqrt(-scores)
    display("Scores:"+str(scores))
    display("Mean:"+str(scores.mean()))
    display("Standard deviation:"+str(scores.std()))
    display("----------End of evaluating (%s)----------" % (time.time() - start))

'---SVR---hosp_inz_entries---'

SVR(C=1)

'Scores:[[0.05179367 0.06186199 0.0535296  0.08717979 0.0392781 ]]'

'Mean:0.05872863042397854'

'Standard deviation:0.015954520152582564'

'----------End of evaluating (3.0388715267181396)----------'

'---SVR---testPositvity---'

SVR(C=0.1)

'Scores:[[0.06172361 0.07271704 0.06764352 0.07351628 0.0693869 ]]'

'Mean:0.06899747058292817'

'Standard deviation:0.004222529714403477'

'----------End of evaluating (4.474646329879761)----------'

'---SVR---googleMobility---'

SVR(C=1)

'Scores:[[0.05236475 0.10318075 0.07274532 0.0697653  0.0580417 ]]'

'Mean:0.07121956348151956'

'Standard deviation:0.017635873657311948'

'----------End of evaluating (3.1147682666778564)----------'

'---SVR---death_inz_entries---'

SVR(C=1)

'Scores:[[0.05769632 0.06602492 0.04642345 0.04519649 0.04946118]]'

'Mean:0.0529604727255202'

'Standard deviation:0.007853217045760596'

'----------End of evaluating (2.3682315349578857)----------'

In [95]:
for label in ['hosp_inz_entries', 'testPositvity','googleMobility','death_inz_entries']:
    tl = pip.fit_transform(train_labels[[label]]).ravel()

    # Ridge regression
    display("---Ridge---"+label+"---")
    start = time.time()
    parameters = {'alpha':[0.1, 1, 10]}
    reg = GridSearchCV(linear_model.Ridge(), parameters, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    reg.fit(train_features, tl)
    display(reg.best_estimator_)
    scores = pd.DataFrame(reg.cv_results_).loc[[reg.best_index_]][['split'+str(i)+'_test_score' for i in range(0,cv)]].values
    scores = np.sqrt(-scores)
    display("Scores:"+str(scores))
    display("Mean:"+str(scores.mean()))
    display("Standard deviation:"+str(scores.std()))
    #display(pd.DataFrame(reg.best_estimator_.coef_.transpose(), index=featureNames, columns=['importance']).sort_values(['importance'], ascending=False))
    display("----------End of evaluating (%s)----------" % (time.time() - start))

'---Ridge---hosp_inz_entries---'

Ridge(alpha=0.1)

'Scores:[[0.04224647 0.05224684 0.03448905 0.0446484  0.03428489]]'

'Mean:0.04158312922288176'

'Standard deviation:0.006740042140557605'

'----------End of evaluating (0.5962944030761719)----------'

'---Ridge---testPositvity---'

Ridge(alpha=10)

'Scores:[[0.05230119 0.0649693  0.05324717 0.06585795 0.05457163]]'

'Mean:0.05818944824426027'

'Standard deviation:0.005949093215678608'

'----------End of evaluating (0.5520322322845459)----------'

'---Ridge---googleMobility---'

Ridge(alpha=10)

'Scores:[[0.05108178 0.05555305 0.06222482 0.058764   0.05877961]]'

'Mean:0.05728065174999277'

'Standard deviation:0.003749737748668433'

'----------End of evaluating (0.5126698017120361)----------'

'---Ridge---death_inz_entries---'

Ridge(alpha=1)

'Scores:[[0.03543829 0.03733242 0.03328197 0.03058123 0.03537479]]'

'Mean:0.0344017411249536'

'Standard deviation:0.0023004492708929864'

'----------End of evaluating (0.608684778213501)----------'

In [122]:
for label in ['hosp_inz_entries', 'testPositvity','googleMobility','death_inz_entries']:
    tl = pip.fit_transform(train_labels[[label]]).ravel()

    # Lasso regression
    display("---Lasso---"+label+"---")
    start = time.time()
    parameters = {'alpha':[0.00001, 0.0001]}
    reg = GridSearchCV(linear_model.Lasso(), parameters, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    reg.fit(train_features, tl)
    #display(pd.DataFrame(reg.cv_results_))
    display(reg.best_estimator_)
    scores = pd.DataFrame(reg.cv_results_).loc[[reg.best_index_]][['split'+str(i)+'_test_score' for i in range(0,cv)]].values
    scores = np.sqrt(-scores)
    display("Scores:"+str(scores))
    display("Mean:"+str(scores.mean()))
    display("Standard deviation:"+str(scores.std()))
    display(pd.DataFrame(reg.best_estimator_.coef_.transpose(), index=featureNames, columns=['coefficients']).sort_values(['coefficients'], ascending=False))
    display("----------End of evaluating (%s)----------" % (time.time() - start))

'---Lasso---hosp_inz_entries---'

Lasso(alpha=0.0001)

'Scores:[[0.02664086 0.02515486 0.02356835 0.02719636 0.01745393]]'

'Mean:0.024002871285291914'

'Standard deviation:0.0035088985543820232'

Unnamed: 0,coefficients
Hosp inz_entries male,0.878691
Hosp inz_entries female,0.467740
Cases inz_entries male,0.033102
testPositvity,0.011553
Hosp inz_entries 80+,0.009223
...,...
Total_Covid19Patients_inz,-0.004772
Sport activities,-0.005283
Death inz_entries male,-0.013268
Gatherings/private events,-0.013417


'----------End of evaluating (5.5496039390563965)----------'

'---Lasso---testPositvity---'

Lasso(alpha=0.0001)

'Scores:[[0.04800042 0.06145534 0.04946228 0.06223305 0.04990914]]'

'Mean:0.054212043898231234'

'Standard deviation:0.006268354338365659'

Unnamed: 0,coefficients
testPositvity,0.433718
Cases inz_entries 20 - 29,0.227445
Cases inz_entries 50 - 59,0.215382
median_R_mean,0.204591
Cases inz_entries 70 - 79,0.138658
...,...
publicTransportationPercent,-0.036699
hosp_inz_entries,-0.054756
Gatherings/private events,-0.061703
R_error,-0.134632


'----------End of evaluating (6.265832901000977)----------'

'---Lasso---googleMobility---'

Lasso(alpha=0.0001)

'Scores:[[0.04289682 0.05235808 0.06483659 0.05368306 0.05115391]]'

'Mean:0.05298569298605969'

'Standard deviation:0.007020892463805663'

Unnamed: 0,coefficients
Death inzsumTotal 20 - 29,0.161655
parks_percent_change_from_baseline_day_6,0.145246
transit_stations_percent_change_from_baseline_day_6,0.125744
retail_and_recreation_percent_change_from_baseline_day_6,0.109794
Sport activities,0.098504
...,...
universities and other educational establishments,-0.092004
temp_max_day_6,-0.102249
Events,-0.103541
ICU_Covid19Patients_inz,-0.109657


'----------End of evaluating (7.123257398605347)----------'

'---Lasso---death_inz_entries---'

Lasso(alpha=0.0001)

'Scores:[[0.0292514  0.03101672 0.02171582 0.02032745 0.02530927]]'

'Mean:0.025524133438071077'

'Standard deviation:0.004138019072258814'

Unnamed: 0,coefficients
Death inz_entries female,0.507942
Death inz_entries male,0.502378
Cases inz_entries 80+,0.082079
Death inz_entries 80+,0.073473
hosp_inz_entries,0.036926
...,...
clouds_day_0,-0.001823
Cases inzsumTotal 80+,-0.002358
Death inzsumTotal 80+,-0.006293
maxNeighborIncidence,-0.006640


'----------End of evaluating (5.260971307754517)----------'

In [106]:
for label in ['hosp_inz_entries', 'testPositvity','googleMobility','death_inz_entries']:
    tl = pip.fit_transform(train_labels[[label]]).ravel()

    # MLP regression
    display("---MLP---"+label+"---")
    start = time.time()
    parameters = {'hidden_layer_sizes':[(100,50,50,20,10,10)]}
    reg = GridSearchCV(MLPRegressor(), parameters, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    reg.fit(train_features, tl)
    #display(pd.DataFrame(reg.cv_results_))
    display(reg.best_estimator_)
    scores = pd.DataFrame(reg.cv_results_).loc[[reg.best_index_]][['split'+str(i)+'_test_score' for i in range(0,cv)]].values
    scores = np.sqrt(-scores)
    display("Scores:"+str(scores))
    display("Mean:"+str(scores.mean()))
    display("Standard deviation:"+str(scores.std()))
    display("----------End of evaluating (%s)----------" % (time.time() - start))

'---MLP---hosp_inz_entries---'

MLPRegressor(hidden_layer_sizes=(100, 50, 50, 20, 10, 10))

'Scores:[[0.04622179 0.04572822 0.04308994 0.07297331 0.03196155]]'

'Mean:0.04799496317317258'

'Standard deviation:0.013515397105826426'

'----------End of evaluating (5.638275861740112)----------'

'---MLP---testPositvity---'

MLPRegressor(hidden_layer_sizes=(100, 50, 50, 20, 10, 10))

'Scores:[[0.05856969 0.06591378 0.05394816 0.07015244 0.05452139]]'

'Mean:0.06062109166172045'

'Standard deviation:0.006400029966753064'

'----------End of evaluating (9.345763683319092)----------'

'---MLP---googleMobility---'

MLPRegressor(hidden_layer_sizes=(100, 50, 50, 20, 10, 10))

'Scores:[[0.05050278 0.08247293 0.06699082 0.0567811  0.06122968]]'

'Mean:0.06359546383989276'

'Standard deviation:0.010875092523910512'

'----------End of evaluating (11.558503866195679)----------'

'---MLP---death_inz_entries---'

MLPRegressor(hidden_layer_sizes=(100, 50, 50, 20, 10, 10))

'Scores:[[0.04133558 0.05863064 0.04531868 0.0450905  0.04821174]]'

'Mean:0.04771742845651561'

'Standard deviation:0.005877503221681948'

'----------End of evaluating (7.370590686798096)----------'

In [101]:
for label in ['death_inz_entries','hosp_inz_entries', 'testPositvity','googleMobility']:
    tl = pip.fit_transform(train_labels[[label]]).ravel()

    # RandomForestRegressor
    display("---RandomForestRegressor---"+label+"---")
    start = time.time()
    parameters = {'n_estimators':[20]}
    reg = GridSearchCV(RandomForestRegressor(), parameters, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    reg.fit(train_features, tl)
    #display(pd.DataFrame(reg.cv_results_))
    display(reg.best_estimator_)
    scores = pd.DataFrame(reg.cv_results_).loc[[reg.best_index_]][['split'+str(i)+'_test_score' for i in range(0,cv)]].values
    scores = np.sqrt(-scores)
    display("Scores:"+str(scores))
    display("Mean:"+str(scores.mean()))
    display("Standard deviation:"+str(scores.std()))
    display(pd.DataFrame(reg.best_estimator_.feature_importances_.transpose(), index=featureNames, columns=['importance']).sort_values(['importance'], ascending=False))
    display("----------End of evaluating (%s)----------" % (time.time() - start))


'---RandomForestRegressor---death_inz_entries---'

RandomForestRegressor(n_estimators=20)

'Scores:[[0.04198309 0.05013015 0.03943916 0.03875326 0.04255276]]'

'Mean:0.04257168297808148'

'Standard deviation:0.0040464871432413785'

Unnamed: 0,importance
Cases inz_entries 80+,5.937905e-01
Death inz_entries male,1.394809e-01
Death inz_entries 80+,4.063720e-02
testPositvity,2.988475e-02
Hosp inz_entries 70 - 79,1.344236e-02
...,...
Death inzsumTotal 20 - 29,4.161919e-07
Death inzsumTotal 0 - 9,3.969892e-07
Death inz_entries 10 - 19,9.623981e-08
Death inz_entries 0 - 9,1.938098e-08


'----------End of evaluating (47.13285732269287)----------'

'---RandomForestRegressor---hosp_inz_entries---'

RandomForestRegressor(n_estimators=20)

'Scores:[[0.04792487 0.04744582 0.04080907 0.08699201 0.03318978]]'

'Mean:0.051272308205221793'

'Standard deviation:0.01864511488367793'

Unnamed: 0,importance
Hosp inz_entries male,6.134980e-01
Hosp inz_entries 70 - 79,9.964917e-02
median_R_mean,6.166603e-02
Hosp inz_entries female,3.362810e-02
Hosp inz_entries 30 - 39,1.133040e-02
...,...
Death inz_entries 30 - 39,1.373571e-07
Death inz_entries 20 - 29,4.540334e-08
Death inz_entries 10 - 19,4.216564e-08
Death inzsumTotal 10 - 19,8.450168e-09


'----------End of evaluating (54.881152868270874)----------'

'---RandomForestRegressor---testPositvity---'

RandomForestRegressor(n_estimators=20)

'Scores:[[0.04284746 0.06450109 0.05587063 0.06711878 0.05584827]]'

'Mean:0.05723724773458877'

'Standard deviation:0.008500247773008084'

Unnamed: 0,importance
testPositvity,6.129931e-01
median_R_mean,6.617285e-02
Cases inz_entries 50 - 59,4.248450e-02
Sport/Wellness facilities,2.452784e-02
Sport activities,2.176125e-02
...,...
Death inz_entries 0 - 9,7.238299e-08
Death inzsumTotal 20 - 29,1.693881e-08
Death inz_entries 10 - 19,1.118803e-08
Death inz_entries 20 - 29,6.450843e-09


'----------End of evaluating (55.547982931137085)----------'

'---RandomForestRegressor---googleMobility---'

RandomForestRegressor(n_estimators=20)

'Scores:[[0.04420332 0.09632913 0.06832748 0.07425238 0.06026043]]'

'Mean:0.06867454750119298'

'Standard deviation:0.017121020677670677'

Unnamed: 0,importance
transit_stations_percent_change_from_baseline_day_13,5.563682e-01
parks_percent_change_from_baseline_day_13,7.471478e-02
test_inzsumTotal,2.311992e-02
parks_percent_change_from_baseline_day_12,2.311731e-02
retail_and_recreation_percent_change_from_baseline_day_13,2.232816e-02
...,...
Death inz_entries 0 - 9,9.875664e-08
VaccDosesAdministered per100PersonsTotal,6.829026e-09
FullyVaccPersons per100PersonsTotal,6.027901e-09
Death inz_entries 10 - 19,3.523223e-19


'----------End of evaluating (51.5964035987854)----------'

In [125]:
for label in ['hosp_inz_entries', 'testPositvity','googleMobility','death_inz_entries']:
    tl = pip.fit_transform(train_labels[[label]]).ravel()
    
    # lightGBM
    display("---lightGBM---"+label+"---")
    start = time.time()
    parameters = {'n_estimators':[100]}
    reg = GridSearchCV(lightgbm.LGBMRegressor(), parameters, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    reg.fit(train_features, tl)
    #display(pd.DataFrame(reg.cv_results_))
    display(reg.best_estimator_)
    scores = pd.DataFrame(reg.cv_results_).loc[[reg.best_index_]][['split'+str(i)+'_test_score' for i in range(0,cv)]].values
    scores = np.sqrt(-scores)
    display("Scores:"+str(scores))
    display("Mean:"+str(scores.mean()))
    display("Standard deviation:"+str(scores.std()))
    display(pd.DataFrame(reg.best_estimator_.feature_importances_.transpose(), index=featureNames, columns=['importance']).sort_values(['importance'], ascending=False))
    display("----------End of evaluating (%s)----------" % (time.time() - start))

'---lightGBM---hosp_inz_entries---'

LGBMRegressor()

'Scores:[[0.03362329 0.03104012 0.02767795 0.05858911 0.02081361]]'

'Mean:0.03434881511942382'

'Standard deviation:0.01285898972228739'

Unnamed: 0,importance
hosp_inz_entries,289
Hosp inz_entries male,192
Hosp inz_entries female,169
median_R_mean,83
Hosp inz_entries 30 - 39,68
Hosp inz_entries 60 - 69,68
Hosp inz_entries 70 - 79,64
Hosp inz_entries 50 - 59,63
Hosp inz_entries 80+,51
Cases inz_entries 70 - 79,45


'----------End of evaluating (3.825889825820923)----------'

'---lightGBM---testPositvity---'

LGBMRegressor()

'Scores:[[0.04689762 0.05372954 0.0442287  0.05687131 0.04310269]]'

'Mean:0.04896597067071321'

'Standard deviation:0.005408953279507951'

Unnamed: 0,importance
median_R_mean,191
testPositvity,188
Cases inz_entries 50 - 59,67
Hosp inz_entries 30 - 39,62
Cases inz_entries 80+,56
Cases inz_entries 40 - 49,54
Cases inz_entries 20 - 29,52
Hosp inz_entries 20 - 29,52
Cases inz_entries 30 - 39,51
Cases inz_entries female,50


'----------End of evaluating (4.423806428909302)----------'

'---lightGBM---googleMobility---'

LGBMRegressor()

'Scores:[[0.03814927 0.07952263 0.0606777  0.06465854 0.0533932 ]]'

'Mean:0.059280267238237996'

'Standard deviation:0.013577984176124285'

Unnamed: 0,importance
transit_stations_percent_change_from_baseline_day_6,80
retail_and_recreation_percent_change_from_baseline_day_6,53
parks_percent_change_from_baseline_day_6,52
parks_percent_change_from_baseline_day_2,41
parks_percent_change_from_baseline_day_4,41
parks_percent_change_from_baseline_day_0,38
2PersonHouseholds_perc,37
test_inz_entries,36
meanNeighborIncidence,36
maxNeighborIncidence,35


'----------End of evaluating (4.763471603393555)----------'

'---lightGBM---death_inz_entries---'

LGBMRegressor()

'Scores:[[0.03476192 0.0384098  0.02621302 0.02955132 0.03107556]]'

'Mean:0.032002324022578535'

'Standard deviation:0.004220271230922381'

Unnamed: 0,importance
death_inz_entries,293
Death inz_entries female,194
Death inz_entries male,194
Cases inz_entries 80+,104
Death inz_entries 80+,64
Death inz_entries 60 - 69,50
hosp_inz_entries,47
Hosp inz_entries 60 - 69,45
Hosp inz_entries 80+,44
Death inz_entries 70 - 79,43


'----------End of evaluating (4.198756694793701)----------'

In [126]:
for label in ['hosp_inz_entries', 'testPositvity','googleMobility','death_inz_entries']:
    tl = pip.fit_transform(train_labels[[label]]).ravel()
    
    # XGBoost
    display("---XGBoost---"+label+"---")
    start = time.time()
    parameters = {'n_estimators':[10], 'reg_alpha': [0], 'reg_lambda': [1]}
    reg = GridSearchCV(xgb.XGBRegressor(), parameters, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    reg.fit(train_features, tl)
    #display(pd.DataFrame(reg.cv_results_))
    #display(reg.best_estimator_.get_xgb_params())
    scores = pd.DataFrame(reg.cv_results_).loc[[reg.best_index_]][['split'+str(i)+'_test_score' for i in range(0,cv)]].values
    scores = np.sqrt(-scores)
    display("Scores:"+str(scores))
    display("Mean:"+str(scores.mean()))
    display("Standard deviation:"+str(scores.std()))
    display(pd.DataFrame(reg.best_estimator_.feature_importances_.transpose(), index=featureNames, columns=['importance']).sort_values(['importance'], ascending=False))
    display("----------End of evaluating (%s)----------" % (time.time() - start))

'---XGBoost---hosp_inz_entries---'

'Scores:[[0.03945924 0.04106757 0.03549687 0.06657774 0.02724659]]'

'Mean:0.04196960412206989'

'Standard deviation:0.013201055874427471'

Unnamed: 0,importance
Hosp inz_entries male,0.306881
Hosp inz_entries female,0.113868
Events,0.065002
Hosp inz_entries 30 - 39,0.045581
"Cultural, entertainment and recreational facilities",0.04034
Discos/Nightclubs,0.024777
Gatherings/private events,0.022566
Death inz_entries 80+,0.019448
kofStrigency,0.013878
residential_percent_change_from_baseline_day_0,0.012585


'----------End of evaluating (34.20915675163269)----------'

'---XGBoost---testPositvity---'

'Scores:[[0.05108709 0.06559816 0.05016719 0.06289053 0.05069636]]'

'Mean:0.056087864674545976'

'Standard deviation:0.00672089945724063'

Unnamed: 0,importance
testPositvity,0.23507
Sport/Wellness facilities,0.120136
temp_min_future_day_3,0.03417
Hosp inz_entries male,0.029929
Cases inz_entries 50 - 59,0.028303
transit_stations_percent_change_from_baseline_day_5,0.020822
temp_max_future_day_1,0.019179
temp_max_future_day_0,0.017967
Death inzsumTotal 30 - 39,0.017965
ICU_FreeCapacity_inz,0.016797


'----------End of evaluating (37.62841749191284)----------'

'---XGBoost---googleMobility---'

'Scores:[[0.04570707 0.09810941 0.06361355 0.07470565 0.0566097 ]]'

'Mean:0.06774907650462189'

'Standard deviation:0.017872784913806138'

Unnamed: 0,importance
transit_stations_percent_change_from_baseline_day_6,0.19563
transit_stations_percent_change_from_baseline_day_5,0.101253
Restaurants,0.086897
temp_max_future_day_5,0.068494
transit_stations_percent_change_from_baseline_day_4,0.052196
Events,0.027488
parks_percent_change_from_baseline_day_1,0.026477
temp_max_future_day_3,0.023745
temp_max_future_day_6,0.019632
kofStrigency,0.017678


'----------End of evaluating (41.99630308151245)----------'

'---XGBoost---death_inz_entries---'

'Scores:[[0.03892503 0.04773417 0.03460766 0.0378127  0.03688034]]'

'Mean:0.03919197885881143'

'Standard deviation:0.004501162281226967'

Unnamed: 0,importance
Death inz_entries male,0.246495
Death inz_entries 80+,0.157669
Cases inz_entries female,0.07939
Hosp inzsumTotal 50 - 59,0.066965
Hosp inzsumTotal 80+,0.031209
Death inzsumTotal female,0.02171
Death inz_entries female,0.020479
Cases inz_entries 80+,0.019502
privateMotorisedTransportPercent,0.019168
Hosp inz_entries female,0.017933


'----------End of evaluating (35.6097629070282)----------'

In [139]:
# PCA
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA()
ipca.fit(train_features)
display(pd.DataFrame(ipca.singular_values_.transpose(), index=featureNames, columns=['singular value']).sort_values(['singular value'], ascending=False))

Unnamed: 0,singular value
Cases inz_entries 0 - 9,168.3747
Cases inz_entries 10 - 19,105.4882
Cases inz_entries 20 - 29,81.09005
Cases inz_entries 30 - 39,72.85874
Cases inz_entries 40 - 49,66.20652
Cases inz_entries 50 - 59,62.39412
Cases inz_entries 60 - 69,57.86845
Cases inz_entries 70 - 79,50.81096
Cases inz_entries 80+,49.07019
Cases inzsumTotal 0 - 9,48.38226
