# The COVID pandemic: socioeconomic and health disparities

#### Javaheri, B. The COVID-19 Pandemic: Socioeconomic and Health Disparities. Preprints 2020, 2020120599 (doi: 10.20944/preprints202012.0599.v1)

### This notebook contains all the steps taken to process and analyse the COVID-19 data. These are:

#### 1. Loading required libraries
#### 2. Importing the dataset
#### 3. Exploratory data analysis
#### 4. Data imputation to process missing values
#### 5. Data visualisation
#### 6. Data distribution, transformation and correlation matrix
#### 7. Data imputation to process missing values
#### 8. Ridge regression
#### 9. XGBoost 





### 1. Loading the required libraries

In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
from pandas import to_datetime
import numpy as np
from numpy import mean
from numpy import std
from numpy import absolute
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import seaborn as sns
from scipy.stats import norm, skew
from scipy import stats
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold, RepeatedKFold, KFold, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor, plot_importance 
from statsmodels.stats.outliers_influence import variance_inflation_factor
from cycler import cycler
import matplotlib as mpl
from yellowbrick.regressor import PredictionError, ResidualsPlot
from yellowbrick.model_selection import learning_curve, ValidationCurve, FeatureImportances, CVScores
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
import datetime
import operator
import random
import math
import time
# Render inline matplotlib figures at retina resolution.
# `IPython.display.set_matplotlib_formats` is deprecated in favour of the copy that
# lives in the `matplotlib_inline` package, so prefer that one and fall back to the
# legacy location on older installations.
try:
    from matplotlib_inline.backend_inline import set_matplotlib_formats
except ImportError:
    from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')
# Plotly figures are rendered with the classic-notebook renderer.
pio.renderers.default='notebook'
import warnings
# Warnings are silenced three ways: two "ignore" filters plus replacing
# `warnings.warn` itself with a no-op, so later filter changes cannot re-enable them.
# NOTE(review): this also hides deprecation warnings for APIs used further down.
warnings.simplefilter("ignore")
warnings.filterwarnings('ignore')
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    pass
warnings.warn = ignore_warn
# High default resolution for every matplotlib figure.
mpl.rcParams['figure.dpi'] = 300

### 2. Importing the dataset

In [None]:
# Read the raw dataset (CSV in the Kaggle input directory) into the working frame `df`.
DATA_CSV_PATH = "../input/covid19-socioeconomic-and-health-disparities/data.csv"
df = pd.read_csv(DATA_CSV_PATH)


### 3. Exploratory Data Analysis (EDA)
In this step basic information about the data structure is obtained. 

#### 3.1 Data dimension and head to obtain some information on the structure

In [None]:
# Show the (rows, columns) tuple of the frame, followed by one blank line.
print("Dimension of this datasets (rows, columns) is: ", df.shape, end="\n\n")

In [None]:
# Caption on stdout, then the first five rows as the cell's rich output.
print("The first few rows: ")
df.head(n=5)


#### 3.2 Concise summary of dataframe
Here, df.info() method is used to print summary and data types.

In [None]:
# `DataFrame.info` writes its summary to stdout itself and returns None, so it must not
# be wrapped in print() (that appended a stray "None" line after the summary).
df.info(verbose=True)


The variable "date" has the "object" data type. Although the time aspect of this dataset is used only for visualisation, and is not part of the analysis or model building,
its type still needs to be changed appropriately. This is achieved using to_datetime.

In [None]:
# Convert the "YYYY-MM-DD" strings in `date` into proper datetime values.
df["date"] = to_datetime(df["date"], format="%Y-%m-%d")



#### 3.3 Descriptive statistics

This is generated by describe() function to summarise the central tendency and distribution of the dataframe columns excluding missing values.
Count provides information on missing values.
Mean provides mean of variable.
Std provides the standard deviation of each variable, and so on.

In [None]:
# Summary statistics per column, transposed so that each variable is one row.
df.describe().transpose()

### 4. Data imputation to process missing values

#### 4.1 Count the missing values

In [None]:
#https://towardsdatascience.com/data-cleaning-with-python-and-pandas-detecting-missing-values-3e9c6ebcf78b
# Count the missing values, listing only the columns that contain at least one.
# `isna` is an alias of `isnull`, so the two reports show the same counts; both are
# kept so the printed output is unchanged.
miss_values = df.columns[df.isnull().any()]
print(f"Missing values:\n{df[miss_values].isnull().sum()}")
null_values = df.columns[df.isna().any()]
print(f"Null values:\n{df[null_values].isna().sum()}")
# Snapshot of the data as it is at this point, used later for the "before imputation"
# plot. It has to be a real copy: a plain `df_missing = df` only creates a second name
# for the same object, so the in-place column drops that follow would silently change
# the snapshot as well.
df_missing = df.copy()

#### 4.2 Dropping empty columns

In [None]:
# Dropping columns from Yougov source as they are mostly empty, together with every
# column whose name contains "weekly" or "ox_m1_wildcard".
sparse_name_markers = ("weekly", "yougov", "ox_m1_wildcard")
sparse_columns = [
    col for col in df.columns
    if any(marker in col for marker in sparse_name_markers)
]
# Still an in-place drop: the frame object itself is modified.
df.drop(columns=sparse_columns, inplace=True)

In [None]:
# Columns removed by exact name (in place) before any further processing.
columns_to_remove = [
    "iso_code", "jhu_confirmed", "jhu_deaths",
    "owid_new_tests_smoothed", "owid_new_tests_per_thousand", "owid_new_tests_smoothed_per_thousand",
    "owid_tests_per_case", "owid_positive_rate", "owid_tests_units",
    "ox_confirmed_cases", "owid_total_tests", "owid_new_tests", "owid_total_tests_per_thousand",
    "ox_confirmed_deaths",
    "marioli_ci_65_u", "marioli_ci_65_l", "marioli_ci_95_u", "marioli_ci_95_l",
    "sdsn_effective_reproduction_rate_smoothed", "sdsn_positive_test_rate_smoothed",
    "sdsn_new_cases_per_million_smoothed", "sdsn_new_deaths_per_million_smoothed",
    "owid_total_cases_per_million", "owid_total_deaths_per_million",
    "ox_c1_flag", "ox_c2_flag", "ox_c3_flag", "ox_c4_flag", "ox_c5_flag", "ox_c6_flag", "ox_c7_flag",
    "ox_e1_flag", "ox_h1_flag",
    "owid_handwashing_facilities", "sdsn_overall_transmission",
    "google_mobility_change_grocery_and_pharmacy", "google_mobility_change_parks",
    "google_mobility_change_transit_stations", "google_mobility_change_retail_and_recreation",
    "google_mobility_change_residential", "google_mobility_change_workplaces",
    "marioli_effective_reproduction_rate",
    "ox_stringency_index_for_display", "ox_stringency_legacy_index_for_display",
    "ox_government_response_index_for_display", "ox_containment_health_index_for_display",
    "ox_economic_support_index_for_display", "ox_stringency_legacy_index",
    "owid_stringency_index", "owid_aged_70_older",
]
df.drop(columns=columns_to_remove, inplace=True)

#### 4.3 Selecting countries with population more than 1,000,000

In [None]:
# Keep only rows whose population is strictly above one million.
has_large_population = df["owid_population"].gt(1_000_000)
df = df.loc[has_large_population]



#### 4.4 Limiting analysis to top 5 most affected countries

In [None]:
# Highest recorded cumulative death count per country, then the five largest of those.
peak_deaths_by_country = df.groupby("country")["owid_total_deaths"].max()
top_n_country_names = peak_deaths_by_country.nlargest(5).keys()
# Restrict the frame to rows belonging to those five countries.
in_top_countries = df['country'].isin(top_n_country_names)
df = df.loc[in_top_countries]

#### 4.5 Confining period of analysis to between 01/April and 30/October/2020

In [None]:
#Find Start and finish Date
# Earliest first-date and latest last-date across the per-country groups.
dates_by_country = df.groupby('country')['date']
start_date = dates_by_country.min().min()
end_date = dates_by_country.max().max()
# Daily calendar spanning the whole observed period.
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
print ("Start Date : ", start_date)
print ("End Date : ", end_date)

In [None]:
# Discard every row dated before 1 April 2020 (the mask is the negation of "earlier than").
not_before_april = ~(df['date'] < '2020-04-01')
df = df[not_before_april]
# Recompute the start of the period; `end_date` from the previous cell is reused as is.
start_date = df.groupby('country')['date'].min().min()
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
print ("Start Date : ", start_date)

#### 4.6 Engineering a new feature: COVID-19 daily recovery per million

In [None]:
# Engineered "recovered" features, defined here as new cases minus new deaths
# (per million and in absolute numbers). No separate recovered count is subtracted.
# `assign` returns a new frame, which avoids writing columns onto the filtered slice
# produced by the preceding row filters (chained assignment). Column order is kept:
# the per-million column first, the absolute column last.
df = df.assign(
    owid_new_recovered_per_million=df['owid_new_cases_per_million'] - df['owid_new_deaths_per_million'],
    owid_new_recovered=df['owid_new_cases'] - df['owid_new_deaths'],
)

#### 4.7 Forward and backward fill

In [None]:
## 2. Forward Fill --------------------------
# Fill each gap with the previous row's value, then fill whatever is still missing at
# the top of a column with the next available value. `ffill()`/`bfill()` are the direct
# equivalents of the deprecated `fillna(method='ffill'/'bfill')` spelling.
# NOTE(review): the fill runs over the whole frame in row order, so a gap at the start
# of one country's rows may presumably be filled from the preceding country's rows —
# confirm whether a per-country fill is intended.
df = df.ffill().bfill()

#### 4.8 Visualisation of data before and after imputation to remove missing values

In [None]:
# Two stacked heatmaps of the null mask: the earlier snapshot on top, the imputed frame below.
fig, axes = plt.subplots(2, 1, figsize=(22, 15))
ax_before, ax_after = axes
fig.suptitle('Visualisation of dataset for missing values (in yellow) before and after data imputation', fontsize=32, weight="bold")

# Top panel: snapshot taken before the imputation step (column labels hidden).
sns.heatmap(df_missing.isnull(), xticklabels=False, cbar=False, cmap='summer', ax=ax_before)
ax_before.set_title('Before imputation: missing values (yellow)', size=30, weight="bold")

# Bottom panel: current frame, with column names on the x axis.
sns.heatmap(df.isnull(), xticklabels=True, cbar=False, cmap='summer', ax=ax_after)
ax_after.set_title('After imputation: no missing values', size=30, weight="bold")

fig.subplots_adjust(top=0.92, wspace=0, hspace=0.1)

### 5. Data visualisation

In [None]:
sns.set_style('white')


def _strip_box_panel(ax, column, title, ylabel):
    """Draw jittered per-country points of `column` with a box plot over them on `ax`.

    The former `split=True` argument is omitted: that stripplot keyword was renamed to
    `dodge`, is not accepted by current seaborn releases, and only has an effect when a
    `hue` variable is given, which is not the case here.
    """
    sns.stripplot(x='country', y=column, data=df, jitter=0.25, linewidth=0.5, palette="husl", ax=ax)
    sns.boxplot(x='country', y=column, data=df, palette=['#BBBBBB', '#DDDDDD'], linewidth=1,
                showfliers=False, ax=ax)
    ax.set_title(title, fontsize=22, weight="bold")
    ax.set(xlabel=None, ylabel=ylabel)


def _bar_panel(ax, column, title, ylabel, **bar_kwargs):
    """Draw a per-country bar plot of `column` on `ax`; extra keywords go to `sns.barplot`."""
    sns.barplot(x='country', y=column, data=df, palette="husl", ax=ax, **bar_kwargs)
    ax.set_title(title, fontsize=22, weight="bold")
    ax.set(xlabel=None, ylabel=ylabel)


fig, axes = plt.subplots(2, 4, figsize=(24, 10))
fig.suptitle('COVID-19 mortality, survival, recovery, health indices and governments restrictions', fontsize=28, weight="bold")

# Top row: time-varying measures (one point per row of the frame).
strip_box_panels = [
    # (column, panel title, y-axis label)
    ('owid_new_deaths_per_million', 'COVID-19 mortality/million', "Daily mortality/million"),
    ('owid_new_recovered_per_million', 'COVID-19 recovery/million', "Daily recovery/million"),
    ('ox_containment_health_index', 'Health containment policy', "Containment health index"),
    ('ox_stringency_index', 'Governments stringency policy', "Government policy stringency"),
]
for ax, (column, title, ylabel) in zip(axes[0], strip_box_panels):
    _strip_box_panel(ax, column, title, ylabel)

# Bottom row: bar plots of socioeconomic and health indicators.
bar_panels = [
    # (column, panel title, y-axis label, extra barplot keywords)
    ('owid_extreme_poverty', 'Extreme poverty', "Extreme poverty", {}),
    ('owid_life_expectancy', 'Life expectancy', "Average life expectancy", {'linewidth': 0.5}),
    ('owid_aged_65_older', 'Age 65 or over/million', "Age >= 65 per million", {'linewidth': 0.5}),
    ('owid_hospital_beds_per_thousand', 'Hospital beds/thousand', "Number of hospital beds/thousand", {'linewidth': 0.5}),
]
for ax, (column, title, ylabel, extra) in zip(axes[1], bar_panels):
    _bar_panel(ax, column, title, ylabel, **extra)


In [None]:
#https://www.kaggle.com/therealcyberlord/coronavirus-covid-19-visualization-prediction/notebook
# Keep only rows in which every column from the third up to (not including) the last
# one is non-negative.
# NOTE(review): the positional slice presumably skips the country/date columns — confirm
# against the actual column order.
non_negative_rows = (df.iloc[:, 2:-1] >= 0).all(axis=1)
df1 = df[non_negative_rows]

fig, axes = plt.subplots(2, 1, figsize=(24, 10))
ax_recovered, ax_mortality = axes
fig.suptitle('Daily COVID-19 survival and mortality (per million) in top 5 affected countries', fontsize=30, weight="bold")

# Upper panel: daily recovered per million; x tick labels are hidden.
ax_recovered.bar(df1.date, df1.owid_new_recovered_per_million, label="Recovered", color='teal')
ax_recovered.set_ylabel('Recovered (per million)', size=22, weight="bold")
plt.setp(ax_recovered.get_xticklabels(), visible=False)
plt.setp(ax_recovered.get_yticklabels(), size=20)
ax_recovered.legend(loc='upper right', shadow=True, fontsize='xx-large')

# Lower panel: daily deaths per million, carrying the x-axis label for the figure.
ax_mortality.bar(df1.date, df1.owid_new_deaths_per_million, label="Mortality", color='lightcoral')
ax_mortality.set_xlabel('Days Since 1/04/2020', size=30, weight="bold")
ax_mortality.set_ylabel('Mortality (per million)', size=22, weight="bold")
ax_mortality.legend(loc='upper right', shadow=True, fontsize='xx-large')
plt.setp(ax_mortality.get_xticklabels(), size=24)
plt.setp(ax_mortality.get_yticklabels(), size=24)

fig.subplots_adjust(wspace=0, hspace=0.02, top=0.93)

In [None]:
#dropping columns not needed for analysis
columns_not_modelled = [
    "jhu_recovered",
    "owid_total_cases", "owid_new_cases",
    "owid_total_deaths", "owid_new_deaths",
    "owid_population",
    "owid_new_recovered", "owid_new_cases_per_million", "owid_new_recovered_per_million",
]
df = df.drop(columns=columns_not_modelled)

### 6. Data distribution, transformation and correlation matrix

#### 6.1 Distribution of data

In [None]:
#https://simply-python.com/2019/08/21/useful-seaborn-plots-for-data-exploration/

# This cell was entirely commented out although the text that follows it discusses
# "the plot above"; it is restored so that the figure is actually produced.
# Reshape the non-object, non-datetime columns into long format: one row per
# (original row, variable) pair with the columns "Variable" and "Value".
numeric_features = df.select_dtypes(exclude=["object", "datetime"])
numeric_features = numeric_features.stack().reset_index().rename(columns={"level_1": "Variable", 0: "Value"})
# One small histogram per variable, six per row, each with its own axis ranges.
# A plain histogram is drawn (no density fit) so that columns with very few distinct
# values cannot make the plot fail.
g = sns.FacetGrid(data=numeric_features, col="Variable", col_wrap=6, sharex=False, sharey=False)
g = g.map(plt.hist, "Value", color='blue')
plt.subplots_adjust(top=0.93)
plt.suptitle("Histograms of various variables to test distribution of target (first) and predictors (remainder)", fontsize=28, weight="bold");

The plot above suggests that the target variable (first histogram) is asymmetric, i.e. its distribution is skewed.

#### 6.2 Transformation of skewed target variable

In [None]:
# https://www.kaggle.com/duonghoanvu1/momo-secret-finding
# Linear models work better with an approximately normal target, so daily mortality per
# million is transformed with log(1 + x). The transformed column is stored in `df`
# because the following cells use it as the target variable.
df['owid_new_deaths_per_million_transf'] = np.log1p(df['owid_new_deaths_per_million'])


def _dist_and_qq(ax_dist, ax_qq, values, stage):
    """Plot the distribution of `values` with a fitted normal curve, and its Q-Q plot.

    ax_dist / ax_qq -- axes receiving the distribution plot and the probability plot.
    values          -- the series to inspect.
    stage           -- 'before' or 'after'; only used in the panel titles.
    """
    sns.distplot(values, fit=norm, color="dodgerblue", ax=ax_dist)
    # Parameters of the fitted normal curve, shown in the legend.
    mu, sigma = norm.fit(values)
    # Raw string: "\m" and "\s" are not valid escapes in a normal string literal.
    ax_dist.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
    ax_dist.set_ylabel('Frequency')
    ax_dist.set_title('Mortality/million {} transformation'.format(stage), fontsize=18, weight="bold")
    # Q-Q plot against the normal distribution, drawn directly on the given axes.
    stats.probplot(values, plot=ax_qq)
    ax_qq.set_title('Probability plot mortality/million {} transformation'.format(stage), fontsize=18, weight="bold")


# A 2x2 grid: the former 3x2 grid left six empty axes underneath the four subplots that
# were then drawn over it.
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
fig.suptitle('COVID-19 daily mortality before and after log transformation', fontsize=32, weight="bold")
_dist_and_qq(axes[0, 0], axes[0, 1], df['owid_new_deaths_per_million'], 'before')
_dist_and_qq(axes[1, 0], axes[1, 1], df['owid_new_deaths_per_million_transf'], 'after')
fig.subplots_adjust(top=0.93, wspace=0.08, hspace=0.14)

In [None]:
# The untransformed target is removed; from here on only the log-transformed column is used.
raw_target_column = 'owid_new_deaths_per_million'
df = df.drop(columns=[raw_target_column])

#### 6.3 Correlation matrix to test correlation of predictors to target variable

In [None]:
# Computing correlation matrix to describe correlation of variables.
# The frame still contains the `country` and `date` columns at this point (they are only
# dropped when X is built), and `DataFrame.corr` raises on non-numeric columns in
# pandas >= 2.0, so the correlation is computed on the numeric columns only.
corrmat = df.select_dtypes(include="number").corr()
# Up to k variables, ordered by decreasing correlation with the transformed target.
k = 50
cols = corrmat.nlargest(k, 'owid_new_deaths_per_million_transf')['owid_new_deaths_per_million_transf'].index
# Correlation coefficients of the selected variables, in that order.
cm = np.corrcoef(df[cols].values.T)
f, ax = plt.subplots(figsize =(16, 12))
sns.heatmap(cm, ax = ax, cmap = "coolwarm",
            linewidths = 0.1, yticklabels = cols.values,
                              xticklabels = cols.values)
plt.show()

In [None]:
#Correlation with output target variable: graph
target_column = "owid_new_deaths_per_million_transf"
# Correlation of every variable with the target, strongest positive first.
cor_target = corrmat[target_column].sort_values(ascending=False)
# Bar chart of those correlations, leaving out the target's correlation with itself.
corr_fig, corr_ax = plt.subplots(figsize=(20, 8))
cor_target.drop(target_column).plot.bar(color="darkcyan", ax=corr_ax)

In [None]:
#Correlation with output variable: numbers
target_column = "owid_new_deaths_per_million_transf"
# Same ranking as in the bar chart, displayed as a table (target itself included).
cor_target = corrmat[target_column].sort_values(ascending=False)
cor_target

### 8. Ridge regression

#### 8.1 Defining predictors (X) and target outcome (y)

In [None]:
# Target: log-transformed daily mortality. Predictors: every remaining column except
# the country and date identifiers.
target_name = 'owid_new_deaths_per_million_transf'
y = df[target_name]
X = df.drop(columns=['country', 'date', target_name])

#### 8.2 Splitting dataset into train and test; 80 and 20% respectively

In [None]:
# Random 80/20 split of the rows with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### 8.3 Testing for multicollinearity

In [None]:
# Variance inflation factor of each predictor, computed on the full predictor matrix.
predictor_matrix = X.values
vif = pd.DataFrame({
    "variables": X.columns,
    "VIF": [
        variance_inflation_factor(predictor_matrix, column_index)
        for column_index in range(X.shape[1])
    ],
})
vif

#### 8.4 Testing linearity

In [None]:
#https://boostedml.com/2018/08/testing-linear-regression-assumptions-the-kaggle-housing-price-dataset.html
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm


def abline(slope, intercept):
    """Draw the dashed line ``y = intercept + slope * x`` across the current axes.

    The line spans the current x-limits of the active matplotlib axes. Adapted from
    https://stackoverflow.com/questions/7941226/how-to-add-line-based-on-slope-and-intercept-in-matplotlib
    """
    axes = plt.gca()
    x_vals = np.array(axes.get_xlim())
    y_vals = intercept + slope * x_vals
    plt.plot(x_vals, y_vals, '--')


# Fit an ordinary least squares model (with intercept) to the training data.
X_train_np = np.array(X_train)
y_np = np.array(y_train)

model = OLS(y_np, sm.add_constant(X_train_np))
results = model.fit()
# Fitted values of the model on the training data.
y_hat = results.fittedvalues
# Predicted vs. actual: points lying close to the dashed y = x line indicate linearity.
plt.plot(y_hat, y_np, 'o')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Predicted vs. Actual: Visual Linearity Test')
# Tick colours are left at their defaults: they were previously forced to white, which
# made the tick labels invisible on the notebook's white figure background.
abline(1, 0)
plt.show()



#### 8.5 Cross-validation for ridge regression


In [None]:
# create an array of alpha values
#https://harvard-iacs.github.io/2018-CS109A/labs/lab-5/solutions/


def _regression_metrics(y_true, y_hat):
    """Return (R2, explained variance, MAE, MSE, RMSE) of predictions `y_hat` vs `y_true`."""
    mse = metrics.mean_squared_error(y_true, y_hat)
    return (
        metrics.r2_score(y_true, y_hat),
        metrics.explained_variance_score(y_true, y_hat),
        metrics.mean_absolute_error(y_true, y_hat),
        mse,
        np.sqrt(mse),
    )


# 50 candidate penalties, log-spaced between 1e-4 and 1, scored with shuffled 10-fold CV.
alphas = np.logspace(-4, 0, 50)
splitter = KFold(10, random_state=42, shuffle=True)

# select the best alpha with RidgeCV
# NOTE(review): the `normalize=` argument is not available in recent scikit-learn
# releases; running there needs explicit feature scaling instead — confirm the target version.
ridge_CV = RidgeCV(alphas=alphas, normalize=True, scoring='neg_mean_squared_error', cv=splitter)
ridge_CV.fit(X_train, y_train)

best_alpha = ridge_CV.alpha_
print("Best model searched:\nalpha = {}\nintercept = {}\nbetas = {}, ".format(best_alpha, ridge_CV.intercept_, ridge_CV.coef_))
print()

# Refit a plain Ridge model with the selected penalty on the whole training set.
tuned_ridge = Ridge(alpha=best_alpha, normalize=True,)
tuned_ridge.fit(X_train, y_train)

# calculate R^2 value, EV, MAE, MSE, RMSE on the training data.
# The report previously passed six values to five placeholders (R2 was passed twice),
# so "MSE" displayed R2 and "RMSE" displayed the MSE.
pred_y = tuned_ridge.predict(X_train)
ridge_score, ridge_EV, ridge_MAE, ridge_mse, ridge_RMSE = _regression_metrics(y_train, pred_y)
print(
    "Performance of tuned ridge regression on entire training dataset: \n R2:{:.3f}, EV: {:.3f}, MAE: {:.3f}, MSE:{:.3f}, RMSE:{:.3f}"
    .format(ridge_score, ridge_EV, ridge_MAE, ridge_mse, ridge_RMSE)
)
print()

# Same metrics on the held-out test data (the metric variables are overwritten).
y_pred = tuned_ridge.predict(X_test)
ridge_score, ridge_EV, ridge_MAE, ridge_mse, ridge_RMSE = _regression_metrics(y_test, y_pred)
print(
    "Performance of ridge regression on testing dataset: \n R2:{:.3f}, EV: {:.3f}, MAE: {:.3f}, MSE:{:.3f}, RMSE:{:.3f}"
    .format(ridge_score, ridge_EV, ridge_MAE, ridge_mse, ridge_RMSE)
)
print()
print()

#### 8.6 Visualisation of cross-validation scores, validation curve, residual plot and prediction error

In [None]:
from yellowbrick.style import set_palette
set_palette('yellowbrick')
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2,figsize=(25, 12))

# Top left: 10-fold cross-validated R2 of the tuned model on the training data.
viz = CVScores(tuned_ridge, cv=10, scoring='r2', ax=ax1)
viz.fit(X_train, y_train)
viz.finalize()

# Top right: validation curve over the same alpha grid that was searched. The estimator
# uses normalize=True like the tuned model; a plain Ridge() puts the penalty on a
# different scale, so its curve would not describe the model that was actually tuned.
viz = ValidationCurve(Ridge(normalize=True), param_name="alpha", param_range=alphas, cv=10, scoring="r2", ax=ax2)
viz.fit(X_train, y_train)
viz.finalize()

# Bottom left: residuals on train and test data, with a Q-Q plot instead of a histogram.
viz = ResidualsPlot(tuned_ridge, hist=False, qqplot=True, ax=ax3)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.finalize()

# Bottom right: predicted vs. actual values on the test data.
viz = PredictionError(tuned_ridge, ax=ax4)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.finalize()

f.suptitle('Performance of ridge regression on train and test datasets', fontsize=32, weight="bold")
plt.subplots_adjust(top=0.91)

#### 8.7 Performance of ridge regression on train and test datasets

In [None]:
# Observed vs. ridge-predicted target for the training (left) and test (right) rows.
fig, axes = plt.subplots(1, 2, figsize=(22, 6))
annotation_style = {'color': 'black', 'fontsize': 20,
                    'bbox': dict(boxstyle="round", fc="white", ec="black", pad=0.2)}
panels = [
    # (axes, legend prefix, dataset name in title, observed, predicted, annotation x/y)
    (axes[0], 'Train', 'training', y_train, pred_y, (3, 3)),
    (axes[1], 'Test', 'testing', y_test, y_pred, (2.8, 2.7)),
]
for ax, split_name, dataset_name, observed, predicted, (text_x, text_y) in panels:
    # The x position is simply the row's position within the shuffled split, not a date.
    x_ax = range(len(observed))
    ax.scatter(x_ax, observed, s=15, color="dodgerblue", label="{} original".format(split_name))
    ax.scatter(x_ax, predicted, s=15, color="m", label="{} predicted".format(split_name))
    ax.set_ylabel('Log daily COVID-19 mortality', size=20)
    ax.set_xlabel('Sample index (shuffled train/test split)', size=20)
    ax.set_title('Ridge regression prediction on {} dataset'.format(dataset_name), size=20)
    ax.tick_params(labelsize=20)
    ax.legend()
    # R2 is computed from the plotted values so the annotation cannot go stale.
    ax.text(text_x, text_y,
            '{} R2 = {:.3f}'.format(split_name, r2_score(observed, predicted)),
            annotation_style)
fig.suptitle('Comparison of ridge regression COVID-19 mortality prediction to original data', fontsize=28, weight="bold")
fig.subplots_adjust(top=0.84)

#### 8.8 Computation of predictor importance by ridge regression

In [None]:
# Feature importance by Ridge Regression
features = X.keys()
labels = features

# Reuse the current figure, enlarge it, and draw in the upper of two stacked slots.
fig = plt.gcf()
fig.set_size_inches(20, 13)
ax = plt.subplot(211)

# Importances are shown on their absolute scale (relative=False).
viz = FeatureImportances(tuned_ridge, ax=ax, labels=labels, relative=False)
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.grid(False)
fig.suptitle('Predictor importance computed by the ridge regression', fontsize=28, weight="bold")
plt.subplots_adjust(top=0.84)

# Fit and display
viz.fit(X, y)
viz.poof();



### 9. XGBoost

#### 9.1 Grid-search for optimal parameters for XGBoost

In [None]:
#https://www.mikulskibartosz.name/xgboost-hyperparameter-tuning-in-python-using-grid-search/

# Search space: 2 x 4 x 8 x 3 = 192 parameter combinations.
gbm_param_grid = dict(
    colsample_bytree=[0.3, 0.7],
    n_estimators=range(60, 220, 40),
    max_depth=range(2, 10, 1),
    learning_rate=[0.1, 0.01, 0.05],
)

# Instantiate the regressor: gbm
gbm = XGBRegressor()

# Perform grid search: grid_mse (10-fold CV, scored by negative mean squared error)
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    verbose=False,
)
grid_mse.fit(X_train, y_train);

# Print the best parameters and lowest RMSE (square root of the absolute best score)
lowest_rmse = np.sqrt(np.abs(grid_mse.best_score_))
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", lowest_rmse)

grid_mse.best_estimator_ # The best_estimator_ field contains the best model trained by GridSearch.

In [None]:
# Take the best model found by the grid search and fit it on the training data.
xgb_model1 = grid_mse.best_estimator_
xgb_model1.fit(X_train, y_train, verbose=False)

# Predictions for both partitions.
y_train_pred1 = xgb_model1.predict(X_train)
y_test_pred1 = xgb_model1.predict(X_test)

# R2 on train and test.
print('Train r2 score: ', r2_score(y_train, y_train_pred1))
print('Test r2 score: ', r2_score(y_test, y_test_pred1))

# Mean squared error and its square root on train and test.
train_mse1, test_mse1 = (
    mean_squared_error(y_train, y_train_pred1),
    mean_squared_error(y_test, y_test_pred1),
)
train_rmse1, test_rmse1 = np.sqrt(train_mse1), np.sqrt(test_mse1)
print(f'Train RMSE: {train_rmse1:.4f}')
print(f'Test RMSE: {test_rmse1:.4f}')
print()
print()

#### 9.2 Visualisation of cross-validation scores, validation curve, residual plot and prediction error

In [None]:
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2,figsize=(25, 12))

# Top left: 10-fold cross-validated R2 of the selected XGBoost model on the training data.
viz = CVScores(xgb_model1, cv=10, scoring='r2', ax=ax1)
viz.fit(X_train, y_train)
viz.finalize()

# Top right: validation curve over tree depth 1..10.
viz = ValidationCurve(xgb_model1, param_name="max_depth", param_range=np.arange(1, 11), cv=10, scoring="r2", ax=ax2)
viz.fit(X_train, y_train)
viz.finalize()

# Bottom left: residuals on train and test data, with a Q-Q plot instead of a histogram.
viz = ResidualsPlot(xgb_model1, hist=False, qqplot=True, ax=ax3)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.finalize()

# Bottom right: predicted vs. actual values on the test data.
viz = PredictionError(xgb_model1, ax=ax4)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.finalize()

# The title names the model shown in this figure (it previously said "ridge regression",
# carried over from the ridge section).
f.suptitle('Performance of XGBoost on train and test datasets', fontsize=28, weight="bold")
plt.subplots_adjust(top=0.91)

#### 9.3 Performance of XGBoost on train and test datasets

In [None]:
# Observed vs. XGBoost-predicted target for the training (left) and test (right) rows.
fig, axes = plt.subplots(1, 2, figsize=(22, 6))
annotation_style = {'color': 'black', 'fontsize': 20,
                    'bbox': dict(boxstyle="round", fc="white", ec="black", pad=0.2)}
panels = [
    # (axes, legend prefix, dataset name in title, observed, predicted, annotation x/y)
    (axes[0], 'Train', 'training', y_train, y_train_pred1, (3, 3)),
    (axes[1], 'Test', 'testing', y_test, y_test_pred1, (2.8, 2.7)),
]
for ax, split_name, dataset_name, observed, predicted, (text_x, text_y) in panels:
    # The x position is simply the row's position within the shuffled split, not a date.
    x_ax = range(len(observed))
    ax.scatter(x_ax, observed, s=15, color="dodgerblue", label="{} original".format(split_name))
    ax.scatter(x_ax, predicted, s=15, color="m", label="{} predicted".format(split_name))
    ax.set_ylabel('Log daily COVID-19 mortality', size=20)
    ax.set_xlabel('Sample index (shuffled train/test split)', size=20)
    ax.set_title('XGBoost prediction on {} dataset'.format(dataset_name), size=20)
    ax.tick_params(labelsize=20)
    ax.legend()
    # R2 is computed from the plotted values so the annotation cannot go stale.
    ax.text(text_x, text_y,
            '{} R2 = {:.3f}'.format(split_name, r2_score(observed, predicted)),
            annotation_style)
fig.suptitle('Comparison of XGBoost COVID-19 mortality prediction to original data', fontsize=28, weight="bold")
fig.subplots_adjust(top=0.84)

#### 9.4 Computation of predictor importance by XGBoost

In [None]:
# Feature importance by XGBoost
features = X.keys()
labels = features

# Reuse the current figure, enlarge it, and draw in the upper of two stacked slots.
fig = plt.gcf()
fig.set_size_inches(20, 13)
ax = plt.subplot(211)

# Importances are shown relative to the largest one (relative=True).
viz = FeatureImportances(xgb_model1, ax=ax, labels=labels, relative=True)
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.grid(False)
fig.suptitle('Predictor importance computed by the XGBoost', fontsize=28, weight="bold")
plt.subplots_adjust(top=0.90)

# Fit and display
viz.fit(X, y)
viz.poof()