# COVID-19 Vaccine Rollout Score
Prompt: Construct a score for how well countries are doing at their vaccine rollout for COVID-19.

References
* http://www.kasimte.com/2020/02/09/linear-regression-from-time-series-data-using-scikit-learn.html
* https://www.kite.com/python/answers/how-to-scale-pandas-dataframe-columns-with-the-scikit-learn-minmaxscaler-in-python
* https://machinelearningmastery.com/a-gentle-introduction-to-normality-tests-in-python/
* https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/

In [1]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import scipy.stats as st
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

In [2]:
def determine_slope(df, country, column_1, column_2):
    """Fit a straight line to one country's data and report its slope and R^2.

    Rows of ``df`` whose ``'country'`` column equals ``country`` are selected,
    rows with a missing value in ``column_1`` or ``column_2`` are dropped, and
    a linear regression of ``column_2`` on ``column_1`` is fitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a ``'country'`` column plus ``column_1`` and ``column_2``.
    country : str
        Value of the ``'country'`` column to filter on.
    column_1 : str
        Name of the column used as the x axis.
    column_2 : str
        Name of the column used as the y axis.

    Returns
    -------
    dict
        ``{'country': country, 'r2_score': ..., 'rate': ...}`` where ``rate``
        is the fitted slope and ``r2_score`` is the R^2 of the fit evaluated
        on the same points. When fewer than two usable rows remain, both
        ``r2_score`` and ``rate`` are ``0``.
    """
    # Drop rows missing either coordinate: scikit-learn's LinearRegression
    # does not accept NaN, so a missing x would otherwise make fit() fail.
    is_country = df['country'] == country
    points = df.loc[is_country, [column_1, column_2]].dropna(subset=[column_1, column_2])

    # A line cannot be fitted through fewer than two points
    if len(points) < 2:
        return {'country': country, 'r2_score': 0, 'rate': 0}

    # scikit-learn expects a 2-D feature matrix, hence the reshape
    x = points[column_1].values.reshape(-1, 1)
    y = points[column_2]

    # Fit the model and predict on the training points to score the fit
    regr = linear_model.LinearRegression()
    regr.fit(x, y)
    y_pred = regr.predict(x)

    return {'country': country, 'r2_score': r2_score(y, y_pred), 'rate': regr.coef_[0]}

In [3]:
# Load the country vaccination records from CSV and preview the first rows
csv_path = '../data/country_vaccinations.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,Oxford/AstraZeneca,Government of Afghanistan,http://www.xinhuanet.com/english/asiapacific/2...
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,35.0,Oxford/AstraZeneca,Government of Afghanistan,http://www.xinhuanet.com/english/asiapacific/2...
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,35.0,Oxford/AstraZeneca,Government of Afghanistan,http://www.xinhuanet.com/english/asiapacific/2...
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,35.0,Oxford/AstraZeneca,Government of Afghanistan,http://www.xinhuanet.com/english/asiapacific/2...
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,35.0,Oxford/AstraZeneca,Government of Afghanistan,http://www.xinhuanet.com/english/asiapacific/2...


## Total Vaccines Distributed and People Fully Vaccinated

In [4]:
# Find the latest date reported for each country.
# 'date' is selected before aggregating so only that column is reduced
# (the dates are still strings here; in YYYY-MM-DD form they compare chronologically).
df_countries = df.groupby('country')['date'].max().reset_index()
df_countries.head()

Unnamed: 0,country,date
0,Afghanistan,2021-03-16
1,Albania,2021-03-27
2,Algeria,2021-02-19
3,Andorra,2021-03-19
4,Angola,2021-03-24


In [5]:
# Inner-join the full dataset with the per-country last dates on (country, date),
# which keeps only the rows recorded on each country's last date
df_last_date = df.merge(df_countries, how='inner', on=['country', 'date'])
df_last_date.head()

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-03-16,54000.0,54000.0,,,2862.0,0.14,0.14,,74.0,Oxford/AstraZeneca,Government of Afghanistan,http://www.xinhuanet.com/english/asiapacific/2...
1,Albania,ALB,2021-03-27,64075.0,,,1827.0,3113.0,2.23,,,1082.0,Pfizer/BioNTech,Ministry of Health,https://coronavirus.al/lajme/covid19-ministria...
2,Algeria,DZA,2021-02-19,75000.0,,,,3748.0,0.17,,,85.0,Sputnik V,Ministry of Health,https://www.echoroukonline.com/%d9%84%d9%82%d8...
3,Andorra,AND,2021-03-19,9288.0,,,,428.0,12.02,,,5539.0,Pfizer/BioNTech,Government of Andorra,https://www.govern.ad/comunicats/item/12558-el...
4,Angola,AGO,2021-03-24,87022.0,87022.0,,,6044.0,0.26,0.26,,184.0,Oxford/AstraZeneca,Ministry of Health,https://www.angop.ao/en/noticias/saude/covid-1...


In [6]:
# Keep the per-hundred coverage columns, then show the 20 rows with the
# highest percent of people fully vaccinated
coverage_columns = ['country', 'total_vaccinations_per_hundred', 'people_fully_vaccinated_per_hundred']
df_vax_distr = df_last_date[coverage_columns]
df_vax_distr.sort_values(by='people_fully_vaccinated_per_hundred', ascending=False).head(20)

Unnamed: 0,country,total_vaccinations_per_hundred,people_fully_vaccinated_per_hundred
48,Gibraltar,171.1,79.26
65,Israel,114.7,54.4
125,Seychelles,98.15,33.95
24,Cayman Islands,67.04,23.52
88,Monaco,46.07,21.23
18,Bermuda,54.93,21.12
25,Chile,50.45,17.04
148,United States,41.91,14.99
12,Bahrain,42.55,14.45
124,Serbia,33.55,13.47


## Rate of Vaccination

In [7]:
# Index the data by date (sorted ascending), parse the index as datetimes,
# and record for every row the number of days elapsed since the first date
df_dates = df.set_index('date').sort_values(by='date')
df_dates.index = pd.to_datetime(df_dates.index)
first_date = df_dates.index[0]
df_dates['days_from_start'] = (df_dates.index - first_date).days
df_dates.head()

Unnamed: 0_level_0,country,iso_code,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website,days_from_start
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-12-13,Scotland,OWID_SCT,19009.0,19009.0,,,,0.35,0.35,,,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...,0
2020-12-13,United Kingdom,GBR,86465.0,86465.0,,,,0.13,0.13,,,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...,0
2020-12-13,Wales,OWID_WLS,8257.0,8257.0,,,,0.26,0.26,,,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...,0
2020-12-13,Northern Ireland,OWID_NIR,3623.0,3623.0,,,,0.19,0.19,,,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...,0
2020-12-13,England,OWID_ENG,55576.0,55576.0,,,,0.1,0.1,,,"Oxford/AstraZeneca, Pfizer/BioNTech",Government of the United Kingdom,https://coronavirus.data.gov.uk/details/health...,0


In [8]:
# Start date of dataframe: the first entry of the date-sorted index,
# i.e. the reference point used for 'days_from_start'
df_dates.index[0]

Timestamp('2020-12-13 00:00:00')

In [9]:
# Collect every distinct country name, in order of first appearance in df_dates
countries = df_dates['country'].unique().tolist()

In [10]:
# For each country, fit percent of people fully vaccinated against days from start
# and keep the r2 score and slope under '_fully'-suffixed column names
fully_column = 'people_fully_vaccinated_per_hundred'
all_coefs = [determine_slope(df_dates, name, 'days_from_start', fully_column) for name in countries]
df_coefs_fully = pd.DataFrame(all_coefs)
df_coefs_fully = df_coefs_fully.rename(columns={'r2_score': 'r2_score_fully', 'rate': 'rate_fully'})
df_coefs_fully.head()

Unnamed: 0,country,r2_score_fully,rate_fully
0,Scotland,0.797383,0.060308
1,United Kingdom,0.704878,0.038576
2,Wales,0.784124,0.16432
3,Northern Ireland,0.694973,0.047005
4,England,0.647956,0.030076


In [11]:
# Sanity check: list countries with a positive rate whose fit has an r2 score below 0.5
# (an empty result means there are none)
low_r2 = df_coefs_fully['r2_score_fully'] < .5
positive_rate = df_coefs_fully['rate_fully'] > 0
df_coefs_fully.loc[low_r2 & positive_rate]

Unnamed: 0,country,r2_score_fully,rate_fully


In [12]:
# For each country, fit people_vaccinated_per_hundred against days from start
# and collect the r2 score and slope
people_column = 'people_vaccinated_per_hundred'
people_coefs = [determine_slope(df_dates, name, 'days_from_start', people_column) for name in countries]
df_coefs_people = pd.DataFrame(people_coefs)
df_coefs_people.head()

Unnamed: 0,country,r2_score,rate
0,Scotland,0.964396,0.515474
1,United Kingdom,0.979414,0.5096
2,Wales,0.967615,0.505771
3,Northern Ireland,0.978591,0.453379
4,England,0.980338,0.520777


In [13]:
# Sanity check: list countries with a positive rate whose fit has an r2 score below 0.5
# (an empty result means there are none)
low_r2 = df_coefs_people['r2_score'] < .5
positive_rate = df_coefs_people['rate'] > 0
df_coefs_people.loc[low_r2 & positive_rate]

Unnamed: 0,country,r2_score,rate


In [14]:
# Attach each country's fully-vaccinated rate to its per-hundred coverage figures
fully_rates = df_coefs_fully[['country', 'rate_fully']]
df2 = df_vax_distr.merge(fully_rates, on='country')
df2.head()

Unnamed: 0,country,total_vaccinations_per_hundred,people_fully_vaccinated_per_hundred,rate_fully
0,Afghanistan,0.14,,0.0
1,Albania,2.23,,0.000902
2,Algeria,0.17,,0.0
3,Andorra,12.02,,0.013846
4,Angola,0.26,,0.0


In [15]:
# Attach each country's people_vaccinated_per_hundred rate, then replace null values with 0
people_rates = df_coefs_people[['country', 'rate']]
df_all_scores = df2.merge(people_rates, on='country').fillna(0)
df_all_scores.head()

Unnamed: 0,country,total_vaccinations_per_hundred,people_fully_vaccinated_per_hundred,rate_fully,rate
0,Afghanistan,0.14,0.0,0.0,0.006598
1,Albania,2.23,0.0,0.000902,0.002588
2,Algeria,0.17,0.0,0.0,0.0
3,Andorra,12.02,0.0,0.013846,0.123653
4,Angola,0.26,0.0,0.0,0.013695


In [16]:
# Check the distribution of each score column with a Shapiro-Wilk test,
# printing one result per column in the order listed below

shapiro_test1, shapiro_test2, shapiro_test3, shapiro_test4 = (
    stats.shapiro(df_all_scores[column])
    for column in (
        'total_vaccinations_per_hundred',
        'people_fully_vaccinated_per_hundred',
        'rate_fully',
        'rate',
    )
)
print(shapiro_test1, shapiro_test2, shapiro_test3, shapiro_test4, sep='\n')

ShapiroResult(statistic=0.6574975252151489, pvalue=1.9275831173353832e-17)
ShapiroResult(statistic=0.4284009337425232, pvalue=4.856419078641719e-22)
ShapiroResult(statistic=0.46877068281173706, pvalue=2.3963610157665527e-21)
ShapiroResult(statistic=0.4982318878173828, pvalue=8.15327938324581e-21)


In [17]:
# Data is NOT normal, so rescale the score columns with a min-max scaler
score_columns = [
    'total_vaccinations_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'rate_fully',
    'rate',
]
min_max_scaler = MinMaxScaler()
# Overwrite the raw values in place with their scaled counterparts
df_all_scores[score_columns] = min_max_scaler.fit_transform(df_all_scores[score_columns])
df_all_scores

Unnamed: 0,country,total_vaccinations_per_hundred,people_fully_vaccinated_per_hundred,rate_fully,rate
0,Afghanistan,0.000818,0.000000,0.000000,0.001960
1,Albania,0.013033,0.000000,0.000785,0.000769
2,Algeria,0.000994,0.000000,0.000000,0.000000
3,Andorra,0.070251,0.000000,0.012048,0.036741
4,Angola,0.001520,0.000000,0.000000,0.004069
...,...,...,...,...,...
149,Uruguay,0.085798,0.000126,0.000000,0.149361
150,Venezuela,0.000234,0.000000,0.000000,0.000849
151,Vietnam,0.000292,0.000000,0.000000,0.000813
152,Wales,0.327411,0.160358,0.142982,0.150278


In [22]:
# Rollout score = weighted sum of the scaled columns, expressed on a 0-100 scale:
# the two per-hundred columns (fully vaccinated, total vaccinations) weigh 35% each
# and the two rates weigh 15% each
level_weight = 0.35
rate_weight = 0.15
weighted_sum = (
    level_weight * df_all_scores['people_fully_vaccinated_per_hundred']
    + level_weight * df_all_scores['total_vaccinations_per_hundred']
    + rate_weight * df_all_scores['rate_fully']
    + rate_weight * df_all_scores['rate']
)
df_all_scores['rollout_score'] = weighted_sum * 100

In [23]:
# Rank countries from highest to lowest rollout score
df_all_scores.sort_values(by='rollout_score', ascending=False)

Unnamed: 0,country,total_vaccinations_per_hundred,people_fully_vaccinated_per_hundred,rate_fully,rate,rollout_score
48,Gibraltar,1.000000,1.000000,1.000000,0.348885,90.233275
65,Israel,0.670368,0.686349,0.639411,0.193194,59.974156
125,Seychelles,0.573641,0.428337,0.506709,0.267278,46.679042
24,Cayman Islands,0.391818,0.296745,0.282600,0.169965,30.888170
18,Bermuda,0.321040,0.266465,0.393021,0.143818,28.615267
...,...,...,...,...,...,...
151,Vietnam,0.000292,0.000000,0.000000,0.000813,0.022426
150,Venezuela,0.000234,0.000000,0.000000,0.000849,0.020916
95,Namibia,0.000058,0.000000,0.000000,0.000424,0.008413
11,Bahamas,0.000175,0.000000,0.000000,0.000000,0.006137
