In [1]:
import pipeline

In [2]:
# Load the COVID dataset and bind the two frames returned by the pipeline as
# the training and test sets. The path is passed inline so that no temporary
# name is left behind in the kernel namespace.
# NOTE(review): the file is a pickle; if the pipeline unpickles it directly,
# only ever point this at trusted files.
df_train, df_test = pipeline.read_and_process_data('../data/covid_df.pkl')

In [3]:
# Run the pipeline's consistency check on the two frames before modelling.
# NOTE(review): judging by the message it prints, it verifies that the training
# and test frames expose the same features -- confirm in pipeline.py.
pipeline.sanity_check(df_train, df_test)

Success: Features match


Check which variables have the highest proportion of missing values (NAs) in the training set. Values are fractions between 0 and 1, and only the top 20 are shown:

In [4]:
# Fraction of missing values per column (the mean of the boolean NA mask),
# sorted from worst to best; show the 20 columns with the most missing data.
df_train.isna().mean().sort_values(ascending=False).head(20)

E4_International support                 0.159124
H4_Emergency investment in healthcare    0.149890
H5_Investment in vaccines                0.139698
H2_Testing policy                        0.136277
E2_Debt/contract relief                  0.134989
E3_Fiscal measures                       0.134327
H3_Contact tracing                       0.134106
E1_Income support                        0.131862
C5_Close public transport                0.131199
C7_Restrictions on internal movement     0.131163
C4_Restrictions on gatherings            0.131163
C3_Cancel public events                  0.131163
C8_International travel controls         0.131126
C6_Stay at home requirements             0.131126
H1_Public information campaigns          0.131126
C2_Workplace closing                     0.131126
Diabetes Prevalence                      0.127925
Physicians per 1k                        0.127925
Universal Health Care Coverage Index     0.127925
C1_School closing                        0.127925


In [5]:
# Drop every row that contains at least one missing value. A single pass is
# enough: dropna() leaves no NAs behind, so repeating it is a no-op.
df_train = df_train.dropna()
df_test = df_test.dropna()

# Since we will be predicting confirmed cases first, we remove the other outcome (deaths)
# so that it cannot be used as a predictor.
# NOTE(review): the saved output of the model-fitting cell further down still
# lists 'Deaths' among the features, so that output appears to predate this
# drop -- re-run the notebook from a fresh kernel to confirm.
df_train = df_train.drop(columns=['Deaths'])
df_test = df_test.drop(columns=['Deaths'])

# Next we choose a target and split each frame into features (x) and target (y).
x_train, y_train = pipeline.divide_target_and_features(df_train, 'Confirmed Cases')
x_test, y_test = pipeline.divide_target_and_features(df_test, 'Confirmed Cases')

In [39]:
from sklearn import linear_model
import importlib

# Re-import the local pipeline module so edits made to it since the first
# import are picked up without restarting the kernel.
importlib.reload(pipeline)

# Three linear regressors to compare: L1-penalised (Lasso) and L2-penalised
# (Ridge), both with alpha=0.5, plus an unpenalised least-squares baseline.
ls = linear_model.Lasso(alpha=0.5)
rg = linear_model.Ridge(alpha=0.5)
lreg = linear_model.LinearRegression()
models = [(ls, 'Lasso'), (rg, 'Ridge'), (lreg, 'Linear Regression')]

# ev maps each model's display name to whatever pipeline.metrics returns for it.
ev = {}
for model, name in models:
    # Fit on the training split, then predict on the held-out test split.
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Show the model name followed by the 10 features the pipeline ranks as
    # most relevant for this fitted model.
    top_features = pipeline.get_most_relevant_features(x_train, model, 10)
    print('{}\n{}\n'.format(name, top_features))

    ev[name] = pipeline.metrics(y_pred, y_test, x_train, y_train, model)

Lasso
                           Feature  Coefficient
19                          Deaths     0.514527
0                        Intercept     0.015328
30   Days Elapsed Since First Case     0.000606
29                       Day Count    -0.000035
122             Country_Mozambique     0.000000
116             Country_Mauritania    -0.000000
117              Country_Mauritius    -0.000000
118                 Country_Mexico     0.000000
119                Country_Moldova    -0.000000
120               Country_Mongolia    -0.000000

Bias: 0.40
Mean squared error: 11.59
RSS: 5804.84
Variance score: 0.68

Ridge
                  Feature  Coefficient
146        Country_Serbia     2.681179
82          Country_Gabon    -1.024362
19                 Deaths     0.810872
129       Country_Nigeria    -0.583018
145       Country_Senegal    -0.507376
74        Country_Ecuador     0.484716
71        Country_Denmark    -0.463286
41       Country_Barbados    -0.349468
144  Country_Saudi Arabia    -0.3307

In [41]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion of the features; include_bias=False leaves out
# the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
# Fit the expansion on the training features and transform them in one step.
pf = poly.fit_transform(x_train)