In [1]:
import pipeline

In [2]:
# Read and process the dataset at the .pkl path below, yielding the train and
# test frames. The path is only needed for this one call, so it is removed
# from the namespace afterwards.
covid_pickle_path = '../data/covid_df.pkl'
df_train, df_test = pipeline.read_and_process_data(covid_pickle_path)
del covid_pickle_path

Checking the 20 variables with the highest fraction of missing values (NAs) in the training set:

In [3]:
# Fraction of missing values per column (the mean of the boolean null mask),
# sorted from largest to smallest and limited to the first 20 columns.
df_train.isnull().mean().sort_values(ascending=False).head(20)

E4_International support                 0.035775
H4_Emergency investment in healthcare    0.020798
H5_Investment in vaccines                0.013500
H2_Testing policy                        0.009576
EconomicSupportIndex                     0.008901
E2_Debt/contract relief                  0.008100
E3_Fiscal measures                       0.007341
H3_Contact tracing                       0.007087
GovernmentResponseIndex                  0.007087
ContainmentHealthIndex                   0.007045
E1_Income support                        0.004514
C5_Close public transport                0.003755
C3_Cancel public events                  0.003712
C4_Restrictions on gatherings            0.003712
C7_Restrictions on internal movement     0.003712
StringencyIndex                          0.003670
H1_Public information campaigns          0.003670
C8_International travel controls         0.003670
C6_Stay at home requirements             0.003670
C2_Workplace closing                     0.003670


In [4]:
# Run the pipeline's sanity check on the train and test frames before any
# further cleaning or modelling is done.
pipeline.sanity_check(df_train, df_test)

Success: Features match


In [5]:
# Drop every row that still contains a missing value. dropna() is idempotent,
# so one call per frame is enough.
df_train = df_train.dropna()
df_test = df_test.dropna()

# Since we will be predicting confirmed cases first, we remove the other outcome (deaths)
df_train = df_train.drop(columns=['Deaths'])
df_test = df_test.drop(columns=['Deaths'])

# Next we choose a target and split each frame into features (x) and target (y)
x_train, y_train = pipeline.divide_target_and_features(df_train, 'Confirmed Cases')
x_test, y_test = pipeline.divide_target_and_features(df_test, 'Confirmed Cases')

In [6]:
from sklearn import linear_model
import importlib

# Re-import the local pipeline module so that edits made to it since the first
# import are picked up by the calls below.
importlib.reload(pipeline)

# Candidate linear regressors; both regularised models use alpha=0.5.
ls = linear_model.Lasso(alpha=0.5)
rg = linear_model.Ridge(alpha=0.5)
lreg = linear_model.LinearRegression()
models = [(ls, 'Lasso'), (rg, 'Ridge'), (lreg, 'Linear Regression')]

# Results of pipeline.metrics, keyed by the model's display name.
ev = {}

for model, name in models:
    # Fit on the training split, then predict on the held-out test split.
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Show the model name followed by its 10 most relevant features.
    top_features = pipeline.get_most_relevant_features(x_train, model, 10)
    print('{}\n{}\n'.format(name, top_features))

    ev[name] = pipeline.metrics(y_pred, y_test, x_train, y_train, model)

Lasso
                           Feature  Coefficient
18                       Recovered     0.307085
0                        Intercept     0.030487
29   Days Elapsed Since First Case     0.000615
28                       Day Count    -0.000067
121             Country_Mozambique    -0.000000
115             Country_Mauritania    -0.000000
116              Country_Mauritius    -0.000000
117                 Country_Mexico     0.000000
118                Country_Moldova    -0.000000
119               Country_Mongolia    -0.000000

Bias: 0.82
Mean squared error: 22.05
RSS: 11049.15
Variance score: 0.34

Ridge
               Feature  Coefficient
145     Country_Serbia     5.234631
18           Recovered     0.651700
73     Country_Ecuador    -0.610619
54     Country_Burundi    -0.442469
81       Country_Gabon     0.439532
53       Country_Burma    -0.424404
55    Country_Cameroon    -0.423874
38     Country_Bahrain    -0.378646
44       Country_Benin    -0.299912
93   Country_Indonesia    

In [41]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion of the features; include_bias=False leaves out
# the constant (all-ones) column from the generated feature set.
poly = PolynomialFeatures(degree=2, include_bias=False)
# Fit the expansion on the training features and transform them in one step.
pf = poly.fit_transform(x_train)