In [7]:
import pipeline

In [8]:
# Location of the pickled dataset, given relative to the notebook's working directory.
filepath = '../data/covid_df.pkl'
# The pipeline helper hands back the training and test frames as a pair.
# NOTE(review): the .pkl extension suggests the helper unpickles this file; unpickling can
# execute arbitrary code, so only point this at a trusted file — confirm in pipeline.py.
df_train, df_test = pipeline.read_and_process_data(filepath)
# Remove the temporary path variable so it does not linger in the kernel namespace.
del filepath

In [9]:
# Run the pipeline's sanity check on both splits; the saved output reports that their features match.
pipeline.sanity_check(df_train, df_test)

Success: Features match


In [10]:
# Show the training frame. In the saved output the last rows have no values in the
# policy-indicator columns, which is why missing values are examined next.
df_train

Unnamed: 0,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls,E1_Income support,E2_Debt/contract relief,...,Country_United Arab Emirates,Country_United Kingdom,Country_United States of America,Country_Uruguay,Country_Uzbekistan,Country_Venezuela,Country_Vietnam,Country_Yemen,Country_Zambia,Country_Zimbabwe
0,-1.035714,-0.892128,-1.061166,-0.954281,-0.651546,-0.798087,-0.900415,-1.08765,-0.499524,-0.522238,...,0,0,0,0,0,0,0,0,0,0
1,-1.035714,-0.892128,-1.061166,-0.954281,-0.651546,-0.798087,-0.900415,-1.08765,-0.499524,-0.522238,...,0,0,0,0,0,0,0,0,0,0
2,-1.035714,-0.892128,-1.061166,-0.954281,-0.651546,-0.798087,-0.900415,-1.08765,-0.499524,-0.522238,...,0,0,0,0,0,0,0,0,0,0
3,-1.035714,-0.892128,-1.061166,-0.954281,-0.651546,-0.798087,-0.900415,-1.08765,-0.499524,-0.522238,...,0,0,0,0,0,0,0,0,0,0
4,-1.035714,-0.892128,-1.061166,-0.954281,-0.651546,-0.798087,-0.900415,-1.08765,-0.499524,-0.522238,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28126,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
28127,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
28128,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
28129,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1


Checking which variables have the highest fraction of missing values (NAs) in the training set:

In [4]:
# Fraction of missing values per column, sorted from worst to best; show the 20 worst.
# Averaging the boolean NA mask column-wise gives the same ratio as
# (number of NAs) / (number of rows) without running a Python lambda per column.
df_train.isnull().mean().sort_values(ascending=False).head(20)

E4_International support                 0.166873
H4_Emergency investment in healthcare    0.153933
H5_Investment in vaccines                0.147627
H2_Testing policy                        0.144237
EconomicSupportIndexForDisplay           0.143654
EconomicSupportIndex                     0.143654
E2_Debt/contract relief                  0.142961
E3_Fiscal measures                       0.142305
GovernmentResponseIndexForDisplay        0.142086
H3_Contact tracing                       0.142086
GovernmentResponseIndex                  0.142086
ContainmentHealthIndexForDisplay         0.142050
ContainmentHealthIndex                   0.142050
E1_Income support                        0.139863
C5_Close public transport                0.139207
C7_Restrictions on internal movement     0.139170
C4_Restrictions on gatherings            0.139170
C3_Cancel public events                  0.139170
StringencyLegacyIndexForDisplay          0.139134
H1_Public information campaigns          0.139134


In [5]:
# Discard every row that has at least one missing value, in both splits.
df_train = df_train.dropna()
df_test = df_test.dropna()

# Since we will be predicting confirmed cases first, we remove the other outcome (deaths).
# Note: this cell rebinds df_train/df_test, so running it a second time raises KeyError
# ('Deaths' is already gone); re-run the data-loading cell first.
df_train = df_train.drop(columns=['Deaths'])
df_test = df_test.drop(columns=['Deaths'])

# Next we choose a target and split each frame into features (x) and target (y).
x_train, y_train = pipeline.divide_target_and_features(df_train, 'Confirmed Cases')
x_test, y_test = pipeline.divide_target_and_features(df_test, 'Confirmed Cases')

In [6]:
from sklearn import linear_model
import importlib

# Reload the already-imported local pipeline module before using its helpers below.
importlib.reload(pipeline)

# The three linear regressors to compare; both penalised models use alpha=0.5.
ls = linear_model.Lasso(alpha=0.5)
rg = linear_model.Ridge(alpha=0.5)
lreg = linear_model.LinearRegression()
models = [
    (ls, 'Lasso'),
    (rg, 'Ridge'),
    (lreg, 'Linear Regression'),
]

# ev collects, per model name, whatever pipeline.metrics returns for that model.
ev = {}
for model, name in models:
    # Fit on the training split, then predict the held-out split.
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Report the model name followed by its 10 most relevant features.
    top_features = pipeline.get_most_relevant_features(x_train, model, 10)
    print('{}\n{}\n'.format(name, top_features))

    ev[name] = pipeline.metrics(y_pred, y_test, x_train, y_train, model)

Lasso
                           Feature  Coefficient
26                       Recovered     0.225405
0                        Intercept     0.017516
37   Days Elapsed Since First Case     0.000916
36                       Day Count    -0.000068
136                  Country_Niger    -0.000000
135              Country_Nicaragua     0.000000
138                 Country_Norway    -0.000000
121                 Country_Malawi    -0.000000
122               Country_Malaysia    -0.000000
123                   Country_Mali    -0.000000

Bias: 0.87
Mean squared error: 0.57
RSS: 351.72
Variance score: 0.24

Ridge
                               Feature  Coefficient
152               Country_Saudi Arabia     2.369791
87                     Country_Finland     1.215052
20             GovernmentResponseIndex    -0.747285
21   GovernmentResponseIndexForDisplay    -0.747285
26                           Recovered     0.732569
28                          Population    -0.727180
61                       

In [41]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion of the features, configured without the bias column.
poly = PolynomialFeatures(degree=2, include_bias=False)
# Fit the expansion on the training features and transform them in one step.
# NOTE(review): x_test is not transformed in the lines visible here; it should go through
# poly.transform (not fit_transform) before being passed to a model — confirm below.
pf = poly.fit_transform(x_train)