In [4]:
import numpy as np
import pandas as pd
from sklearn import linear_model

import pipeline

## Reading data

In [26]:
# Load the COVID DataFrame from a pickle stored relative to the notebook.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project.
filepath = '../data/covid_df.pkl'
df = pd.read_pickle(filepath)

In [31]:
# Latest date present in the data. Series.max() is the pandas reduction;
# the builtin max() walks the column one Python object at a time.
df['Date'].max()

datetime.date(2020, 5, 29)

## Creating log target of confirmed cases

In [27]:
# Model target: natural log of confirmed cases. The +1 keeps rows with zero
# cases finite (log(0) is -inf); invert a value with np.exp(value) - 1.
df['log_cases'] = np.log(df['Confirmed Cases']+1)

## Dropping China, Russia and Brazil

In [35]:
# Countries left out of this model run. Keeping them in one list means the
# exclusion is applied in a single pass and can be changed in one place.
excluded_countries = ['China', 'Russia', 'Brazil']
# Keep every row whose country is not in the exclusion list (rows with a
# missing country are kept, exactly as with the chained != filters).
df = df[~df['Country'].isin(excluded_countries)]

In [36]:
# Evaluate the models on several truncated versions of the data set.
# For each entry of weeks_for_cv the frame is cut by that many weeks
# (see pipeline.cut_df_on_weeks), the last n_weeks_prediction weeks become
# the test set, and the evaluation result is stored under a readable key.
weeks_for_cv = list(range(7))
n_weeks_prediction = 2
results = {}
for weeks_cut in weeks_for_cv:
    truncated_df = pipeline.cut_df_on_weeks(df, weeks_cut)
    df_train, df_test = pipeline.split_and_scale_on_last_weeks(
        truncated_df, n_weeks_prediction
    )
    # Checks the train/test frames before modelling (prints "Success: ..." lines).
    pipeline.sanity_check(df_train, df_test)
    X_train, y_train = pipeline.divide_target_and_features(df_train, 'log_cases')
    X_test, y_test = pipeline.divide_target_and_features(df_test, 'log_cases')
    evaluation = pipeline.train_and_evaluate(X_train, y_train, X_test, y_test)
    results['Until week: {}'.format(weeks_cut)] = evaluation


Cutting dataframe on date: 2020-05-29
Success: Features match
Success: No NAs remain
Lasso: Features with highest magnitude                                coefficients in absolute value
                           Feature  Coefficient
0                        Intercept     2.256314
8             EconomicSupportIndex     0.233803
19   Days Elapsed Since First Case     0.060904
18                       Day Count     0.007023
143                  Country_Syria    -0.000000
134               Country_Slovenia     0.000000
135           Country_South Africa     0.000000
136            Country_South Korea     0.000000
137                  Country_Spain     0.000000
138              Country_Sri Lanka    -0.000000

Bias: 3.02
Mean squared error: 3.94
RSS: 5698.12
Variance score: 0.76

R2 score: 0.40

Ridge: Features with highest magnitude                                coefficients in absolute value
                         Feature  Coefficient
0                      Intercept     9.644856
108  

Linear Regression: Features with highest magnitude                                coefficients in absolute value
                              Feature   Coefficient
137              Country_South Africa  8.438242e+08
98                 Country_Madagascar -8.098762e+08
0                           Intercept -7.693987e+08
63                   Country_Eswatini  7.552231e+08
13                    Life Expectancy  6.590389e+08
115                   Country_Nigeria  6.165055e+08
94                    Country_Lesotho  5.832973e+08
28                 Country_Bangladesh -5.746944e+08
21                    Country_Albania -5.705071e+08
44   Country_Central African Republic  5.652015e+08

Bias: 1.05
Mean squared error: 2.30
RSS: 4540.72
Variance score: 0.89

R2 score: 0.68

Cutting dataframe on date: 2020-05-01
Success: Features match
Success: No NAs remain
Lasso: Features with highest magnitude                                coefficients in absolute value
                           Feature  Coeff

## Predicting with best model

In [41]:
# Final fit on the full (un-truncated) data set: hold out the last
# n_weeks_prediction weeks as the test set, train a plain linear regression
# on log_cases and predict the held-out rows.
# `linear_model` is imported with the other dependencies in the first cell.
n_weeks_prediction = 2
df_train, df_test = pipeline.split_and_scale_on_last_weeks(df, n_weeks_prediction)
X_train, y_train = pipeline.divide_target_and_features(df_train, 'log_cases')
X_test, y_test = pipeline.divide_target_and_features(df_test, 'log_cases')
lreg = linear_model.LinearRegression()
lreg.fit(X_train, y_train)
# Predictions are on the same scale as the 'log_cases' target.
y_pred = lreg.predict(X_test)

In [45]:
# Run the pipeline's train-and-report routine on the current X_train/X_test
# split; it prints a coefficient table and error metrics per model (the
# captured output shows Lasso and Ridge). The returned result is kept in `dct`.
dct = pipeline.train_and_evaluate(X_train, y_train, X_test, y_test)

Lasso: Features with highest magnitude                                coefficients in absolute value
                           Feature  Coefficient
0                        Intercept     2.256314
8             EconomicSupportIndex     0.233803
19   Days Elapsed Since First Case     0.060904
18                       Day Count     0.007023
143                  Country_Syria    -0.000000
134               Country_Slovenia     0.000000
135           Country_South Africa     0.000000
136            Country_South Korea     0.000000
137                  Country_Spain     0.000000
138              Country_Sri Lanka    -0.000000

Bias: 3.02
Mean squared error: 3.94
RSS: 5698.12
Variance score: 0.76

R2 score: 0.40

Ridge: Features with highest magnitude                                coefficients in absolute value
                         Feature  Coefficient
0                      Intercept     9.644856
108                Country_Nepal    -3.123086
158              Country_Vietnam    -2.70308

In [42]:
# Pick the one-hot country columns by name instead of by position: a fixed
# slice such as [19:161] silently points at the wrong columns as soon as a
# feature is added or a different set of countries is dropped.
# NOTE(review): assumes every country dummy carries the 'Country_' prefix, as
# in the coefficient tables printed by train_and_evaluate; confirm that this
# selects the same columns as the former positional slice.
country_columns = X_test.columns[X_test.columns.str.startswith('Country_')]
predictions = pipeline.predictions_every_country(country_columns, X_test, y_pred, y_test)

In [43]:
# Display the prediction table: one "real" and one "prediction" column per
# country, one row per date (presumably on the log_cases scale, since that
# is the target the model was trained on).
predictions

Unnamed: 0_level_0,Afghanistan real,Afghanistan prediction,Albania real,Albania prediction,Algeria real,Algeria prediction,Angola real,Angola prediction,Argentina real,Argentina prediction,...,Venezuela real,Venezuela prediction,Vietnam real,Vietnam prediction,Yemen real,Yemen prediction,Zambia real,Zambia prediction,Zimbabwe real,Zimbabwe prediction
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-16,8.764522,6.604923,6.839476,8.412216,8.827908,8.863518,3.89182,5.654449,8.962648,7.757488,...,6.224558,7.076756,5.765191,6.051254,4.812184,3.283306,6.522093,4.756992,3.7612,5.271351
2020-05-17,8.804625,6.629032,6.853299,8.436329,8.856518,8.887634,3.89182,5.623665,8.995785,7.781597,...,,,5.771441,6.075363,4.859812,3.307423,6.625392,4.781109,3.806662,5.29546
2020-05-18,8.86404,6.653149,6.855409,8.722225,8.882114,8.911747,3.931826,5.647781,9.032648,7.805714,...,,,5.783825,6.09948,4.875197,3.331539,6.635947,4.805218,3.850148,5.319576
2020-05-19,8.942984,6.677265,6.856462,8.746338,8.906258,8.93586,3.970292,5.671898,9.083643,7.82983,...,,,5.783825,6.123596,5.123964,3.355648,6.650279,4.829334,3.850148,5.343693
2020-05-20,9.005282,6.701374,6.872128,8.770451,8.928375,8.959976,3.970292,5.696007,9.136048,7.853939,...,,,5.783825,6.147705,5.220356,3.379765,6.725034,4.853451,3.89182,5.367802
2020-05-21,9.068431,6.725491,6.877296,8.794563,8.952735,8.984089,4.077537,5.720123,9.203517,7.878056,...,,,5.783825,6.171822,5.288267,3.403881,6.765039,4.87756,3.951244,5.391918
2020-05-22,,,6.889591,8.81868,8.97702,9.008202,4.110874,5.744236,9.273315,7.902172,...,,,5.783825,6.195934,5.347108,3.42799,6.82546,4.901676,3.951244,5.416035
2020-05-23,,,6.897705,8.842793,9.001346,9.032314,4.127134,5.768349,9.337325,7.926281,...,,,5.786897,6.220047,5.361292,3.452106,6.82546,4.925785,4.043051,5.440144
2020-05-24,,,6.906755,8.866905,9.024854,9.056431,4.248495,5.792465,9.399058,7.950397,...,,,5.786897,6.244164,5.407172,3.476215,6.82546,4.949902,4.043051,5.46426
2020-05-25,,,6.912743,8.891022,,,4.26268,5.816578,9.443751,7.974514,...,,,,,5.455321,3.500332,6.82546,4.974018,4.043051,5.488369


In [44]:
# Persist the prediction table next to the input data; the file name records
# that China, Russia and Brazil were excluded from this run.
predictions.to_pickle("../data/predictions_noChinaRussiaBrazil.pkl")