In [130]:
import pipeline
import pandas as pd
import pickle
pd.options.display.max_columns = None

In [131]:
filepath = '../data/covid_df_protocol_3.pkl'
df_train, df_test = pipeline.read_and_process_data(filepath)
del filepath

Checking variables with highest percentage of NAs in training set:

In [132]:
df_train.apply(lambda x: x.isnull().sum()/len(x)).sort_values(ascending=False)[:20]

E4_International support                 0.035775
H4_Emergency investment in healthcare    0.020798
H5_Investment in vaccines                0.013500
H2_Testing policy                        0.009576
EconomicSupportIndex                     0.008901
E2_Debt/contract relief                  0.008100
E3_Fiscal measures                       0.007341
H3_Contact tracing                       0.007087
GovernmentResponseIndex                  0.007087
ContainmentHealthIndex                   0.007045
E1_Income support                        0.004514
C5_Close public transport                0.003755
C3_Cancel public events                  0.003712
C4_Restrictions on gatherings            0.003712
C7_Restrictions on internal movement     0.003712
StringencyIndex                          0.003670
H1_Public information campaigns          0.003670
C8_International travel controls         0.003670
C6_Stay at home requirements             0.003670
C2_Workplace closing                     0.003670


In [133]:
pipeline.sanity_check(df_train, df_test)

Success: Features match
Failure: Data is not clean yet, NAs remaining


In [134]:
df_train = df_train.dropna()
df_test = df_test.dropna()
df_train = df_train.dropna()
df_test = df_test.dropna()

#Since we will be predicting confirmed cases first, we remove the other outcomes (deaths)
df_train = df_train.drop(['Deaths'], axis=1)
df_test = df_test.drop(['Deaths'], axis=1)

#Next we choose a target to split the data on x and y
x_train, y_train = pipeline.divide_target_and_features(df_train, 'Confirmed Cases')
x_test, y_test = pipeline.divide_target_and_features(df_test, 'Confirmed Cases')

In [135]:
from sklearn import linear_model
import importlib
importlib.reload(pipeline)
ls = linear_model.Lasso(alpha=0.5)
rg = linear_model.Ridge(alpha=0.5)
lreg = linear_model.LinearRegression()
ev = {}
models = [(ls, 'Lasso'),
          (rg, 'Ridge'),
          (lreg, 'Linear Regression')]

for m in models:
    (model, name) = m
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print('{}\n{}\n'.format(name, pipeline.get_most_relevant_features(x_train, model, 10)))
    ev[name] = pipeline.metrics(y_pred, y_test, x_train, y_train, model)

Lasso
                           Feature  Coefficient
22                       Recovered     0.212831
0                        Intercept     0.015721
33   Days Elapsed Since First Case     0.003892
32                       Day Count    -0.000156
116             Country_Madagascar    -0.000000
118               Country_Malaysia    -0.000000
119                   Country_Mali     0.000000
120             Country_Mauritania    -0.000000
121              Country_Mauritius    -0.000000
122                 Country_Mexico    -0.000000

Bias: 0.77
Mean squared error: 7.61
RSS: 30639.07
Variance score: 0.28
R2 score: -8.62

Ridge
                              Feature  Coefficient
174  Country_United States of America     1.208713
64                      Country_China    -1.181994
88                    Country_Germany    -1.129358
96                      Country_India     0.923956
18                    StringencyIndex    -0.854737
19            GovernmentResponseIndex    -0.760830
23            

In [41]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
pf = poly.fit_transform(x_train)

## Trying to see the effect of policies in lagged days 

In [136]:
def policy_effect_lag(filepath, outcome_var, lag):
    '''
    The effect of policy is expected to have some lag.
    This function, lags the outcome variable by given number of days,
    input:
        filepath: pandas pickle filepath 
        outcome_var: (str) the variable which needs to be lagged
        lag: (int) number of days to be lagged
    output:
        save the file in data folder
    '''
    df = pd.read_pickle(filepath)
    df.rename(columns={outcome_var: str(outcome_var) + '_old'}, inplace = True)
    df[outcome_var] = df.groupby('Country')[str(outcome_var) + '_old'].shift(lag)
    df = df[~df[outcome_var].isnull()]
    df.drop(columns=[str(outcome_var) + '_old'])
    
    #savint the pkl to the data folder
    filename = 'covid_df_lagged_days_' + str(lag)
    with open ('../data/'+filename, 'wb') as f:
        pickle.dump(df, f)
    
    print("filename: {} dumped in ../data/folder".format(filename) )
    return None

In [137]:
filepath = '../data/covid_df_protocol_3.pkl'
policy_effect_lag(filepath, 'Confirmed Cases', 2)

filename: covid_df_lagged_days_2 dumped in ../data/folder


In [138]:
filepath_lag4 = '../data/covid_df_lagged_days_2'
df_train, df_test = pipeline.read_and_process_data(filepath_lag4)

In [139]:
df_train = df_train.dropna()
df_test = df_test.dropna()
df_train = df_train.dropna()
df_test = df_test.dropna()

#Since we will be predicting confirmed cases first, we remove the other outcomes (deaths)
#df_train = df_train.drop(['Deaths'], axis=1)
#df_test = df_test.drop(['Deaths'], axis=1)

#Next we choose a target to split the data on x and y
x_train, y_train = pipeline.divide_target_and_features(df_train, 'Confirmed Cases')
x_test, y_test = pipeline.divide_target_and_features(df_test, 'Confirmed Cases')

In [140]:
from sklearn import linear_model
import importlib
importlib.reload(pipeline)
ls = linear_model.Lasso(alpha=0.5)
rg = linear_model.Ridge(alpha=0.5)
lreg = linear_model.LinearRegression()
ev = {}
models = [(ls, 'Lasso'),
          (rg, 'Ridge'),
          (lreg, 'Linear Regression')]

for m in models:
    (model, name) = m
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print('{}\n{}\n'.format(name, pipeline.get_most_relevant_features(x_train, model, 10)))
    ev[name] = pipeline.metrics(y_pred, y_test, x_train, y_train, model)

Lasso
                           Feature  Coefficient
22             Confirmed Cases_old     0.441012
35   Days Elapsed Since First Case     0.002641
0                        Intercept     0.002474
34                       Day Count    -0.000086
118             Country_Madagascar    -0.000000
120               Country_Malaysia    -0.000000
121                   Country_Mali     0.000000
122             Country_Mauritania    -0.000000
123              Country_Mauritius    -0.000000
124                 Country_Mexico    -0.000000

Bias: 0.39
Mean squared error: 4.54
RSS: 18298.02
Variance score: 0.64
R2 score: -0.66

Ridge
                              Feature  Coefficient
22                Confirmed Cases_old     1.250989
104                     Country_Italy     0.632144
160                     Country_Spain     0.444251
24                             Deaths    -0.321141
18                    StringencyIndex     0.135549
52                    Country_Bolivia    -0.112233
19            