# Import packages

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Import data in csv format

In [2]:
data = pd.read_csv("covid_data_196.csv")

# Set variables

In [3]:
xvar = ["Employed", "Males", "Vulnerable_pop", "Health_Insurance", 
        "Secondary_Education", "Life_Expectancy", "prev_cronic_prov", 
        "Indice_Pobreza_Compuesto", "White_1000_Inhab", "Assian_1000_Inhab", 
        "Black_1000_Inhab", "prev_diferencial_prov_endes", "prev_hipertension_prov_endes", 
        "prev_diabetes_prov_endes", "prev_obesidad_prov_endes", "Days_Till_Attended", 
        "SD_Days_Till_Attended",  "Travel_Time_toHFacility_Hours", "SD_TTtHFH",
        "Waiting_Time_4Attention_Hours", "SD_WT4AH", "logPD_1000", "Overcrowding",
        "Natural_Region1", "Natural_Region2", "Natural_Region3"]

In [4]:
datac = data.dropna()

x = datac[xvar]
Y = datac["logmuertes1000"]

In [5]:
datac

Unnamed: 0,_ID,_CX,_CY,IDDPTO,DEPARTAMEN,prov,PROVINCIA,CAPITAL,FUENTE,dep,...,casos_1000_Inhab,muertes_1000_Inhab,logcasos1000,logmuertes1000,dominio,Natural_Region,Natural_Region1,Natural_Region2,Natural_Region3,logPD_1000
0,1,-77.773167,-6.437059,1,AMAZONAS,101,CHACHAPOYAS,CHACHAPOYAS,INEI,1,...,37.995892,0.414370,3.637478,-0.880997,sierra norte,sierra,0,1,0,0.614774
1,2,-78.402692,-5.087450,1,AMAZONAS,102,BAGUA,BAGUA,INEI,1,...,108.002700,1.322537,4.682156,0.279552,selva,selva,0,0,1,1.538134
2,3,-77.873091,-5.683441,1,AMAZONAS,103,BONGARA,JUMBILLA,INEI,1,...,14.393260,0.390061,2.666760,-0.941452,selva,selva,0,0,1,2.541240
3,4,-78.038591,-4.166722,1,AMAZONAS,104,CONDORCANQUI,SANTA MARIA DE NIEVA,INEI,1,...,78.714386,0.518013,4.365826,-0.657755,selva,selva,0,0,1,-1.905125
4,5,-78.077829,-6.323335,1,AMAZONAS,105,LUYA,LAMUD,INEI,1,...,9.924386,0.247547,2.294995,-1.396155,sierra norte,sierra,0,1,0,0.998217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,191,-80.740769,-3.969757,24,TUMBES,2402,CONTRALMIRANTE VILLAR,ZORRITOS,INEI,24,...,22.177898,1.282234,3.099096,0.248604,costa norte,costa,1,0,0,0.231815
191,192,-80.256500,-3.652287,24,TUMBES,2403,ZARUMILLA,ZARUMILLA,INEI,24,...,31.876997,1.289821,3.461885,0.254503,costa norte,costa,1,0,0,0.147406
192,193,-74.058177,-8.672964,25,UCAYALI,2501,CORONEL PORTILLO,PUCALLPA,INEI,25,...,40.443245,0.739260,3.699900,-0.302106,selva,selva,0,0,1,0.893040
193,194,-73.218893,-10.389299,25,UCAYALI,2502,ATALAYA,ATALAYA,INEI,25,...,9.427460,0.141919,2.243627,-1.952501,selva,selva,0,0,1,0.625074


# Begin machine learning pipeline

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from skopt import BayesSearchCV

pipe = Pipeline([('scaler', StandardScaler()), ('enr', ElasticNet())])

# Define cross-validation method
cv = RepeatedKFold(n_splits=4, n_repeats=10,random_state=0)

# Define grid
param_grid = {
    'enr__alpha': (1e-2, 100.0, 'log-uniform'),
    'enr__l1_ratio': (0.1, 1.0, 'uniform')
}

# Define evaluation post-processing criteria
from sklearn.metrics import mean_squared_error

scoring = make_scorer(mean_squared_error, greater_is_better=False)

model = ElasticNet(random_state=0, fit_intercept=True)

# Define search
search_ddnn = BayesSearchCV(estimator = pipe, search_spaces=param_grid, scoring=scoring,
                           cv=cv, n_jobs=-1, verbose = 4, n_iter=50, n_points=5)

# Perform the search
results_ENLR = search_ddnn.fit(x, Y)

Fitting 40 folds for each of 5 candidates, totalling 200 fits
Fitting 40 folds for each of 5 candidates, totalling 200 fits
Fitting 40 folds for each of 5 candidates, totalling 200 fits
Fitting 40 folds for each of 5 candidates, totalling 200 fits
Fitting 40 folds for each of 5 candidates, totalling 200 fits
Fitting 40 folds for each of 5 candidates, totalling 200 fits
Fitting 40 folds for each of 5 candidates, totalling 200 fits
Fitting 40 folds for each of 5 candidates, totalling 200 fits
Fitting 40 folds for each of 5 candidates, totalling 200 fits
Fitting 40 folds for each of 5 candidates, totalling 200 fits


In [7]:
print('MSE: %.3f' % results_ENLR.best_score_)
print('Config: %s' % results_ENLR.best_params_)

MSE: -0.255
Config: OrderedDict([('enr__alpha', 0.02848056450581579), ('enr__l1_ratio', 1.0)])


In [8]:
results_ENLR.best_estimator_

In [9]:
Results_LR = pd.DataFrame(list(zip(x.columns.values,results_ENLR.best_estimator_['enr'].coef_)), columns =["Variable", "Coefficient"])
Results_LR

Unnamed: 0,Variable,Coefficient
0,Employed,0.062088
1,Males,0.0
2,Vulnerable_pop,0.114127
3,Health_Insurance,0.0
4,Secondary_Education,-0.0
5,Life_Expectancy,0.0
6,prev_cronic_prov,-0.0
7,Indice_Pobreza_Compuesto,-0.399366
8,White_1000_Inhab,-0.0
9,Assian_1000_Inhab,0.006538


# Sparse subset

In [10]:
Results_LR.loc[Results_LR.Coefficient != 0]

Unnamed: 0,Variable,Coefficient
0,Employed,0.062088
2,Vulnerable_pop,0.114127
7,Indice_Pobreza_Compuesto,-0.399366
9,Assian_1000_Inhab,0.006538
14,prev_obesidad_prov_endes,0.002565
18,SD_TTtHFH,0.005762
19,Waiting_Time_4Attention_Hours,0.037433
20,SD_WT4AH,0.057029
21,logPD_1000,0.239418
22,Overcrowding,0.030886


# Enter data for any province:

In [11]:
x.iloc[0:1,:] #The first province

Unnamed: 0,Employed,Males,Vulnerable_pop,Health_Insurance,Secondary_Education,Life_Expectancy,prev_cronic_prov,Indice_Pobreza_Compuesto,White_1000_Inhab,Assian_1000_Inhab,...,SD_Days_Till_Attended,Travel_Time_toHFacility_Hours,SD_TTtHFH,Waiting_Time_4Attention_Hours,SD_WT4AH,logPD_1000,Overcrowding,Natural_Region1,Natural_Region2,Natural_Region3
0,421.01755,491.85672,193.42053,837.1167,14840,63.489361,484.55893,47.172291,41.8153,0.09008,...,9.410896,0.409124,0.827978,0.121618,0.492989,0.614774,11.253831,0,1,0


In [12]:
n = 98
pred = results_ENLR.best_estimator_.predict(x.iloc[n:(n+1),:]) # Row n point prediction

print("This province would have:", np.round(np.exp(pred),1).item(), "Deaths per 1000 inhabitants")

This province would have: 1.2 Deaths per 1000 inhabitants


In [13]:
x_test = [[4.2101755e+02, 4.9185672e+02, 1.9342053e+02, 8.3711670e+02,
        1.4840000e+04, 6.3489361e+01, 4.8455893e+02, 4.7172291e+01,
        4.1815300e+01, 9.0080351e-02, 2.4213598e+01, 4.1135788e+02,
        1.0121213e+02, 2.2985615e+01, 1.4377316e+02, 3.3102899e+00,
        9.4108963e+00, 4.0912408e-01, 8.2797784e-01, 1.2161849e-01,
        4.9298859e-01, 6.1477417e-01, 1.1253831e+01, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00]]

In [14]:
pred = results_ENLR.best_estimator_.predict(x_test) # Row n point prediction

print("This province would have:", np.round(np.exp(pred),1).item(), "Deaths per 1000 inhabitants")

This province would have: 0.6 Deaths per 1000 inhabitants
