In [3]:
import pipeline

In [4]:
# Read the dataset at this relative path through the project's pipeline helper,
# which hands back two objects unpacked here as the training and test frames.
# NOTE(review): the .pkl extension suggests a pickle file — pickles should only
# be loaded from trusted sources; confirm how read_and_process_data opens it.
filepath = '../data/covid_df.pkl'
df_train, df_test = pipeline.read_and_process_data(filepath)
# The path is only needed for the call above, so it is removed from the
# notebook namespace again.
del filepath

Checking the 20 variables with the highest fraction of missing values (NAs) in the training set:

In [5]:
# Fraction of missing values per column, largest first (top 20 shown).
# The mean of the boolean NA mask equals (number of NAs) / (number of rows),
# so this replaces the per-column lambda with a single vectorised call.
df_train.isnull().mean().sort_values(ascending=False).head(20)

E4_International support                 0.035732
H4_Emergency investment in healthcare    0.020798
H5_Investment in vaccines                0.013500
H2_Testing policy                        0.009576
EconomicSupportIndex                     0.008859
E2_Debt/contract relief                  0.008058
E3_Fiscal measures                       0.007298
H3_Contact tracing                       0.007087
ContainmentHealthIndex                   0.007045
GovernmentResponseIndex                  0.007045
E1_Income support                        0.004472
C5_Close public transport                0.003755
C3_Cancel public events                  0.003712
C4_Restrictions on gatherings            0.003712
C7_Restrictions on internal movement     0.003712
StringencyIndex                          0.003670
H1_Public information campaigns          0.003670
C8_International travel controls         0.003670
C6_Stay at home requirements             0.003670
C2_Workplace closing                     0.003670


In [6]:
# Run the project's sanity check on both splits. In the recorded output it
# reports that the feature sets match but that NAs are still present.
pipeline.sanity_check(df_train, df_test)

Success: Features match
Failure: Data is not clean yet, NAs remaining


In [7]:
# Remove every row that still contains a missing value in either split.
df_train = df_train.dropna()
df_test = df_test.dropna()

# Since we will be predicting confirmed cases first, we remove the other outcome (deaths).
# errors='ignore' keeps this cell re-runnable: df_train/df_test are reassigned in
# place, so on a second execution 'Deaths' is already gone and a plain drop
# would raise KeyError.
df_train = df_train.drop(columns=['Deaths'], errors='ignore')
df_test = df_test.drop(columns=['Deaths'], errors='ignore')

# Next we choose a target and split each frame into features (x) and target (y).
x_train, y_train = pipeline.divide_target_and_features(df_train, 'Confirmed Cases')
x_test, y_test = pipeline.divide_target_and_features(df_test, 'Confirmed Cases')

In [18]:
from sklearn import linear_model
import importlib
# Pick up any edits made to the local pipeline module since it was imported.
importlib.reload(pipeline)

# Three linear models fitted on the same training data: Lasso and Ridge with
# alpha=0.5, plus an unregularised linear regression.
ls = linear_model.Lasso(alpha=0.5)
rg = linear_model.Ridge(alpha=0.5)
lreg = linear_model.LinearRegression()
models = [(ls, 'Lasso'), (rg, 'Ridge'), (lreg, 'Linear Regression')]

# Maps each model's display name to whatever pipeline.metrics returns for it.
ev = {}

for model, name in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Show the 10 features the pipeline helper ranks as most relevant for this model.
    heading = name + ': Features with highest magnitude coefficients in absolute value'
    top_features = pipeline.get_most_relevant_features(x_train, model, 10)
    print('{}\n{}\n'.format(heading, top_features))

    ev[name] = pipeline.metrics(y_pred, y_test, x_train, y_train, model)

Lasso: Features with highest magnitude coefficients in absolute value
                           Feature  Coefficient
22                       Recovered     0.212831
0                        Intercept     0.015721
33   Days Elapsed Since First Case     0.003892
32                       Day Count    -0.000156
116             Country_Madagascar    -0.000000
118               Country_Malaysia    -0.000000
119                   Country_Mali     0.000000
120             Country_Mauritania    -0.000000
121              Country_Mauritius    -0.000000
122                 Country_Mexico    -0.000000

Bias: 0.77
Mean squared error: 7.61
RSS: 30639.07
Variance score: 0.28

Ridge: Features with highest magnitude coefficients in absolute value
                              Feature  Coefficient
174  Country_United States of America     1.208713
64                      Country_China    -1.181994
88                    Country_Germany    -1.129358
96                      Country_India     0.923956
18  

In [19]:
from sklearn.metrics import r2_score
# R^2 of the plain linear regression on the test set. The prediction is
# recomputed from `lreg` explicitly instead of reusing `y_pred`, which only
# holds the linear-regression output because that model happens to be the last
# one in the fitting loop of the previous cell.
r2_score(y_test, lreg.predict(x_test))

0.7691365902943914

In [41]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion of the training features, without the constant
# (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False)
# NOTE(review): the coefficient tables above list feature indices up to at
# least 174, so a degree-2 expansion would produce well over ten thousand
# columns — confirm the resulting matrix fits in memory.
# NOTE(review): the transformer is fitted on x_train here; presumably x_test
# should later go through poly.transform rather than a second fit — confirm.
pf = poly.fit_transform(x_train)