# Import libraries

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import make_scorer
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from skopt import BayesSearchCV

# Import data

In [3]:
data = pd.read_csv("covid_data_196.csv")

In [4]:
# Set variables
xvar = ["Employed", "Males", "Vulnerable_pop", "Health_Insurance", 
        "Secondary_Education", "Life_Expectancy", "prev_cronic_prov", 
        "Indice_Pobreza_Compuesto", "White_1000_Inhab", "Assian_1000_Inhab", 
        "Black_1000_Inhab", "prev_diferencial_prov_endes", "prev_hipertension_prov_endes", 
        "prev_diabetes_prov_endes", "prev_obesidad_prov_endes", "Days_Till_Attended", 
        "SD_Days_Till_Attended",  "Travel_Time_toHFacility_Hours", "SD_TTtHFH",
        "Waiting_Time_4Attention_Hours", "SD_WT4AH", "logPD_1000", "Overcrowding",
        "Natural_Region1", "Natural_Region2", "Natural_Region3"]

datac = data.dropna()

x = datac[xvar]
Y = datac["logmuertes1000"]

In [5]:
# Display dataset
datac

Unnamed: 0,_ID,_CX,_CY,IDDPTO,DEPARTAMEN,prov,PROVINCIA,CAPITAL,FUENTE,dep,...,casos_1000_Inhab,muertes_1000_Inhab,logcasos1000,logmuertes1000,dominio,Natural_Region,Natural_Region1,Natural_Region2,Natural_Region3,logPD_1000
0,1,-77.773167,-6.437059,1,AMAZONAS,101,CHACHAPOYAS,CHACHAPOYAS,INEI,1,...,37.995892,0.414370,3.637478,-0.880997,sierra norte,sierra,0,1,0,0.614774
1,2,-78.402692,-5.087450,1,AMAZONAS,102,BAGUA,BAGUA,INEI,1,...,108.002700,1.322537,4.682156,0.279552,selva,selva,0,0,1,1.538134
2,3,-77.873091,-5.683441,1,AMAZONAS,103,BONGARA,JUMBILLA,INEI,1,...,14.393260,0.390061,2.666760,-0.941452,selva,selva,0,0,1,2.541240
3,4,-78.038591,-4.166722,1,AMAZONAS,104,CONDORCANQUI,SANTA MARIA DE NIEVA,INEI,1,...,78.714386,0.518013,4.365826,-0.657755,selva,selva,0,0,1,-1.905125
4,5,-78.077829,-6.323335,1,AMAZONAS,105,LUYA,LAMUD,INEI,1,...,9.924386,0.247547,2.294995,-1.396155,sierra norte,sierra,0,1,0,0.998217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,191,-80.740769,-3.969757,24,TUMBES,2402,CONTRALMIRANTE VILLAR,ZORRITOS,INEI,24,...,22.177898,1.282234,3.099096,0.248604,costa norte,costa,1,0,0,0.231815
191,192,-80.256500,-3.652287,24,TUMBES,2403,ZARUMILLA,ZARUMILLA,INEI,24,...,31.876997,1.289821,3.461885,0.254503,costa norte,costa,1,0,0,0.147406
192,193,-74.058177,-8.672964,25,UCAYALI,2501,CORONEL PORTILLO,PUCALLPA,INEI,25,...,40.443245,0.739260,3.699900,-0.302106,selva,selva,0,0,1,0.893040
193,194,-73.218893,-10.389299,25,UCAYALI,2502,ATALAYA,ATALAYA,INEI,25,...,9.427460,0.141919,2.243627,-1.952501,selva,selva,0,0,1,0.625074


# Setting parameters

In [None]:
# Set the cross-validation

# Define cross-validation method
cv = RepeatedKFold(n_splits=4, n_repeats=5,random_state=0)

In [None]:
# Set the hyperparameters grid

# Define grid
param_grid = {
    'enr__alpha': (1e-2, 100.0, 'log-uniform'),
    'enr__l1_ratio': (0.1, 1.0, 'uniform')
}

# Model training

In [None]:
# Define evaluation and post-processing criteria
from sklearn.metrics import mean_squared_error

scoring = make_scorer(mean_squared_error, greater_is_better=False)

# Define the model
pipe = Pipeline([('scaler', StandardScaler()), ('enr', ElasticNet())])


# Define search
search_ddnn = BayesSearchCV(estimator = pipe, search_spaces=param_grid, scoring=scoring,
                           cv=cv, n_jobs=-1, verbose = 4, n_iter=50, n_points=5)

# Perform the search
results_ENLR = search_ddnn.fit(x, Y)

In [16]:
# Display search results
pd.DataFrame(results_ENLR.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_enr__alpha,param_enr__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,...,split33_test_score,split34_test_score,split35_test_score,split36_test_score,split37_test_score,split38_test_score,split39_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006559,0.002098,0.002229,0.000523,7.764483,0.509481,"{'enr__alpha': 7.764483075000309, 'enr__l1_rat...",-0.875347,-0.599164,-0.848559,...,-0.728712,-1.058141,-0.613368,-0.843393,-0.683221,-0.702252,-0.955684,-0.787884,0.110799,43
1,0.005726,0.001162,0.00205,0.000444,1.395355,0.983032,"{'enr__alpha': 1.3953547198786433, 'enr__l1_ra...",-0.875347,-0.599164,-0.848559,...,-0.728712,-1.058141,-0.613368,-0.843393,-0.683221,-0.702252,-0.955684,-0.787884,0.110799,43
2,0.005015,0.001618,0.002287,0.001396,0.476603,0.317878,"{'enr__alpha': 0.4766034314128834, 'enr__l1_ra...",-0.298129,-0.289069,-0.340342,...,-0.250081,-0.476474,-0.192091,-0.291127,-0.253908,-0.296589,-0.486244,-0.327908,0.072526,37
3,0.005176,0.001465,0.0022,0.000781,0.063023,0.763848,"{'enr__alpha': 0.06302320757565777, 'enr__l1_r...",-0.197619,-0.27162,-0.259986,...,-0.158423,-0.326623,-0.177094,-0.226841,-0.237325,-0.221339,-0.372387,-0.259058,0.055695,14
4,0.005182,0.001129,0.002107,0.000673,0.48996,0.38142,"{'enr__alpha': 0.48996036966695894, 'enr__l1_r...",-0.325422,-0.30442,-0.370335,...,-0.27898,-0.519081,-0.209816,-0.318191,-0.269483,-0.320874,-0.514315,-0.351301,0.076988,40
5,0.004443,0.001073,0.001743,0.000477,1.614196,0.747958,"{'enr__alpha': 1.6141960512046327, 'enr__l1_ra...",-0.875347,-0.599164,-0.848559,...,-0.728712,-1.058141,-0.613368,-0.843393,-0.683221,-0.702252,-0.955684,-0.787884,0.110799,43
6,0.004656,0.00167,0.001882,0.000591,6.015526,0.662653,"{'enr__alpha': 6.015525830148008, 'enr__l1_rat...",-0.875347,-0.599164,-0.848559,...,-0.728712,-1.058141,-0.613368,-0.843393,-0.683221,-0.702252,-0.955684,-0.787884,0.110799,43
7,0.004801,0.001249,0.0021,0.0012,0.149863,0.189267,"{'enr__alpha': 0.14986265866619747, 'enr__l1_r...",-0.212549,-0.277242,-0.263326,...,-0.157101,-0.329039,-0.168808,-0.236308,-0.24897,-0.218192,-0.375838,-0.263489,0.057579,18
8,0.004003,0.000677,0.001612,0.00048,0.078661,0.407124,"{'enr__alpha': 0.07866129415384464, 'enr__l1_r...",-0.201881,-0.27247,-0.257606,...,-0.154702,-0.318847,-0.172905,-0.227731,-0.2442,-0.21546,-0.368145,-0.258758,0.056005,12
9,0.003952,0.000614,0.001426,0.000501,0.010858,0.976742,"{'enr__alpha': 0.010857695167247747, 'enr__l1_...",-0.202879,-0.258765,-0.294902,...,-0.164175,-0.293281,-0.186411,-0.212488,-0.261116,-0.229567,-0.354201,-0.268158,0.05465,24


# Results Visualization

In [7]:
# Print MSE and Settings for optimal model
print('MSE: %.3f' % results_ENLR.best_score_)
print('Config: %s' % results_ENLR.best_params_)

MSE: -0.255
Config: OrderedDict([('enr__alpha', 0.031080299245982723), ('enr__l1_ratio', 1.0)])


In [8]:
results_ENLR.best_estimator_

In [9]:
Results_LR = pd.DataFrame(list(zip(x.columns.values,results_ENLR.best_estimator_['enr'].coef_)), columns =["Variable", "Coefficient"])
Results_LR

Unnamed: 0,Variable,Coefficient
0,Employed,0.063004
1,Males,0.0
2,Vulnerable_pop,0.104738
3,Health_Insurance,0.0
4,Secondary_Education,-0.0
5,Life_Expectancy,0.0
6,prev_cronic_prov,-0.0
7,Indice_Pobreza_Compuesto,-0.393642
8,White_1000_Inhab,-0.0
9,Assian_1000_Inhab,0.005572


In [17]:
# Visualization of most important features

Results_LR.loc[Results_LR.Coefficient != 0]

Unnamed: 0,Variable,Coefficient
0,Employed,0.063004
2,Vulnerable_pop,0.104738
7,Indice_Pobreza_Compuesto,-0.393642
9,Assian_1000_Inhab,0.005572
14,prev_obesidad_prov_endes,0.002033
18,SD_TTtHFH,0.003908
19,Waiting_Time_4Attention_Hours,0.037726
20,SD_WT4AH,0.054827
21,logPD_1000,0.238755
22,Overcrowding,0.021604


# Predict a real case

In [11]:
# Enter data for any province:

x.iloc[0:1,:] # The first province

Unnamed: 0,Employed,Males,Vulnerable_pop,Health_Insurance,Secondary_Education,Life_Expectancy,prev_cronic_prov,Indice_Pobreza_Compuesto,White_1000_Inhab,Assian_1000_Inhab,...,SD_Days_Till_Attended,Travel_Time_toHFacility_Hours,SD_TTtHFH,Waiting_Time_4Attention_Hours,SD_WT4AH,logPD_1000,Overcrowding,Natural_Region1,Natural_Region2,Natural_Region3
0,421.01755,491.85672,193.42053,837.1167,14840,63.489361,484.55893,47.172291,41.8153,0.09008,...,9.410896,0.409124,0.827978,0.121618,0.492989,0.614774,11.253831,0,1,0


In [21]:
# Print the prediction
n = 98
pred = results_ENLR.best_estimator_.predict(x.iloc[n:(n+1),:]) # Row n point prediction

print("This province would have:", np.round(np.exp(pred),1).item(), "Deaths per 1000 inhabitants")

This province would have: 1.2 Deaths per 1000 inhabitants


In [22]:
# Input data in matrix format
x_test = [[4.2101755e+02, 4.9185672e+02, 1.9342053e+02, 8.3711670e+02,
        1.4840000e+04, 6.3489361e+01, 4.8455893e+02, 4.7172291e+01,
        4.1815300e+01, 9.0080351e-02, 2.4213598e+01, 4.1135788e+02,
        1.0121213e+02, 2.2985615e+01, 1.4377316e+02, 3.3102899e+00,
        9.4108963e+00, 4.0912408e-01, 8.2797784e-01, 1.2161849e-01,
        4.9298859e-01, 6.1477417e-01, 1.1253831e+01, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00]]

In [14]:
pred = results_ENLR.best_estimator_.predict(x_test) # Row n point prediction

print("This province would have:", np.round(np.exp(pred),1).item(), "Deaths per 1000 inhabitants")

This province would have: 0.6 Deaths per 1000 inhabitants
