# Visualising COVID-19 Vaccinations

Hello everyone! Welcome to my COVID-19 vaccinations notebook, where today we will be analysing the vaccination progress in countries around the world.

<img src="https://i.redd.it/pqu7qf28n0e41.jpg" width="400px"/>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from scipy import stats
from collections import Counter
from plotly.offline import iplot
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor, ElasticNet, Lasso, Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, MaxAbsScaler, RobustScaler

In [None]:
# Load the vaccination records from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='../input/covid-world-vaccination-progress/country_vaccinations.csv',
)

### Descriptions of features
* country - The countries which are vaccinated
* iso_code - ISO country code identifying the country
* date - Day of entry
* total_vaccinations - Total number of vaccination doses administered in the country
* people_vaccinated - Total number of people vaccinated in the country
* people_fully_vaccinated - Number of people in the country who have two vaccinations
* daily_vaccinations - Number of vaccinations on that day
* total_vaccinations_per_hundred - Calculated by (total vaccinations / population) * 100
* people_vaccinated_per_hundred - Calculated by (people vaccinated / population) * 100
* people_fully_vaccinated_per_hundred - Calculated by (people fully vaccinated / population) * 100
* daily_vaccinations_per_million - Calculated by (daily vaccinations / population) * one million
* vaccines - Type of vaccine used
* source_name - Source of information for the vaccines
* source_website - Website for source of information for the vaccines

In [None]:
# Preview the first few rows to check that the data loaded as expected.
df.head()

In [None]:
# Accumulators appended to by bar_col(): the category labels that were plotted
# and the per-category count tables.
cols_data, total_data = [], []

# One sub-frame per country, in the order returned by np.unique.
df_by_country = [df[df['country'] == country] for country in np.unique(df['country'])]

def bars(data, x, y, title, figsize=(20, 12), rotation=75, size=8, width=None, height=None, colour=None):
    """Show a Plotly bar chart of ``data[y]`` against ``data[x]``, largest bar first.

    Args:
        data: Mapping or DataFrame providing the ``x`` and ``y`` columns.
        x: Name of the column used for the bar labels.
        y: Name of the column used for the bar heights; also the sort key.
        title: Chart title.
        figsize: Accepted for backward compatibility; not used.
        rotation: Accepted for backward compatibility; not used.
        size: Accepted for backward compatibility; not used.
        width: Figure width forwarded to Plotly, or ``None``.
        height: Figure height forwarded to Plotly, or ``None``.
        colour: Optional name of the column (``x`` or ``y``) whose values
            colour the bars. ``None`` leaves the bars uncoloured.
    """
    count = pd.DataFrame({x: data[x], y: data[y]}).sort_values(by=y, ascending=False)
    # Take the colour values from the sorted frame so they line up with the bars.
    c = count[colour] if colour is not None else None
    fig = px.bar(count, x=x, y=y, title=title, width=width, height=height, color=c)
    fig.show()
    
def scatter(countries, col, threshold):
    """Plot ``col`` against ``date`` for the given countries on one Plotly figure.

    A country whose final ``col`` value is below ``threshold`` is skipped when
    its position in ``countries`` is even; every other country gets a
    markers-and-lines trace named after it.
    """
    fig = go.Figure()
    for position, country in enumerate(countries):
        country_rows = df[df['country'] == country]
        final_value = country_rows[col].iloc[-1]
        # Thin out the low-valued countries: drop those at even positions.
        if final_value < threshold and position % 2 == 0:
            continue
        fig.add_trace(
            go.Scatter(
                x=country_rows['date'],
                y=country_rows[col],
                name=country,
                mode='markers+lines',
            )
        )
    legend_font = dict(family="sans-serif", size=18)
    fig.update_layout(legend_title=dict(text='Countries', font=legend_font))
    fig.update_xaxes(title='date')
    fig.update_yaxes(title=col)
    fig.show()
    
def pie(data, x, y, title):
    """Show a Plotly pie chart built from a mapping of label to value.

    The mapping's keys become column ``x`` (slice names) and its values become
    column ``y`` (slice sizes); rows are sorted by ``y`` in descending order.
    ``title`` is used as the legend title.
    """
    frame = pd.DataFrame({x: data.keys(), y: data.values()})
    frame = frame.sort_values(by=y, ascending=False)
    legend_font = dict(family="sans-serif", size=18)
    fig = px.pie(frame, values=y, names=x)
    fig.update_layout(legend_title=dict(text=title, font=legend_font))
    fig.show()
    
def nulls(name, threshold, length):
    """Select the countries whose ``name`` column is complete enough to use.

    A country qualifies when the percentage of missing values in its ``name``
    column (rounded to two decimals) is below ``threshold`` and it has more
    than ``length`` rows.

    Args:
        name: Column to inspect in each frame of ``df_by_country``.
        threshold: Exclusive upper bound on the percentage of missing values.
        length: Exclusive lower bound on the number of rows.

    Returns:
        A ``(countries, name)`` tuple: a NumPy array with the qualifying
        country names, and ``name`` unchanged.
    """
    selected = []
    for country_df in df_by_country:
        col = country_df[name]
        col_len = len(col.index)
        null_pct = round(col.isnull().sum() / col_len * 100, 2)
        if null_pct < threshold and col_len > length:
            # Every row of the frame holds the same country, so take the first.
            selected.append(country_df['country'].iloc[0])
    # Build the array once instead of re-copying it with np.append per country.
    return np.array(selected), name

In [None]:
def last_item(col):
    """Map each country to the value of ``col`` in its final row.

    The value is taken as-is; no missing-value handling is applied. The
    pairing relies on ``df_by_country`` being ordered like
    ``np.unique(df['country'])``.
    """
    final_values = [np.asarray(country_df[col])[-1] for country_df in df_by_country]
    return dict(zip(np.unique(df['country']), final_values))

def preprocess(col, x, y):
    """Count the values of ``df[col]`` and return them as a dict, highest count first.

    ``x`` and ``y`` name the intermediate key and count columns used for sorting.
    """
    tally = Counter(df[col])
    table = pd.DataFrame({x: tally.keys(), y: tally.values()})
    table = table.sort_values(by=y, ascending=False)
    return dict(zip(table[x], table[y]))

def bar_pre(col, values):
    """Record, for each tracked label, the country of every row that mentions it.

    Each entry of ``df[col]`` is split on ``', '``; for every part that is a
    key of ``values``, the row's country is appended to that key's list.

    Args:
        col: Column of ``df`` holding ``', '``-separated labels.
        values: Dict mapping each label of interest to a list; updated in place.

    Returns:
        The same ``values`` dict that was passed in.
    """
    # Pair the two columns positionally rather than looking each country up
    # with a hand-maintained counter, which only works for a 0..n-1 index.
    for country, entry in zip(df['country'], df[col]):
        for label in entry.split(', '):
            if label in values:
                values[label].append(country)
    return values

def bar_col(name, values, colour=False):
    """Draw one bar chart per label showing how often each country appears for it.

    ``bar_pre`` collects the countries per label of ``values``; each label then
    gets a ``countries`` / ``'days using <label>'`` table that is plotted with
    ``bars``. The labels are appended to ``cols_data`` and the list of tables
    to ``total_data``. When ``colour`` is true the bars are coloured by count.
    """
    col_data = bar_pre(name, values)
    labels = list(col_data.keys())
    cols_data.append(labels)

    tables = []
    for point in labels:
        column = 'days using ' + point
        tally = Counter(col_data[point])
        data = pd.DataFrame({'countries': tally.keys(), column: tally.values()})
        tables.append(data)
        bars(data, 'countries', column, 'Countries who use ' + point,
             colour=column if colour else None)
    total_data.append(tables)
# Source type

Firstly, we perform an analysis to see the most popular sources.

In [None]:
# Tally the rows per source and chart the sources with more than 50 rows.
source_tally = Counter(df['source_name'])
count = pd.DataFrame({'name': source_tally.keys(), 'value': source_tally.values()})
count = count.loc[count['value'] > 50]
bars(count, x='name', y='value', title='sources',
     width=800, height=600, colour='value')

# Total vaccinations

Secondly, we will analyse which countries have the largest amount of total vaccinations.

In [None]:
# Final 'total_vaccinations' value per country, keeping those above 130000.
latest_totals = last_item('total_vaccinations')
data = pd.DataFrame({'countries': latest_totals.keys(),
                     'total_vaccinations': latest_totals.values()})
data = data.loc[data['total_vaccinations'] > 130000]
bars(data, 'countries', 'total_vaccinations', 'total_vaccinations per country',
     height=1000, colour='total_vaccinations')

# People fully vaccinated

Then, we take a look at how many people have been fully vaccinated, i.e. have received both doses.

In [None]:
# Final 'people_fully_vaccinated' value per country, keeping those above 50000.
latest_full = last_item('people_fully_vaccinated')
data = pd.DataFrame({'countries': latest_full.keys(),
                     'people_fully_vaccinated': latest_full.values()})
data = data.loc[data['people_fully_vaccinated'] > 50000]
bars(data, 'countries', 'people_fully_vaccinated', 'people_fully_vaccinated per country',
     width=600, height=650, colour='people_fully_vaccinated')

# Countries using vaccines

Now we turn our attention to which countries use the different vaccines.

In [None]:
# Vaccines to chart; each gets its own fresh list for bar_col to fill.
vaccine_names = ['Pfizer/BioNTech', 'Covaxin', 'Oxford/AstraZeneca', 'Moderna',
                 'Sinopharm/Beijing', 'Sinopharm/Wuhan', 'Sputnik V', 'Sinovac']
bar_col('vaccines', {vaccine: [] for vaccine in vaccine_names}, colour=True)

# Countries using sources

Similarly, we see which countries use the different sources that provide information on COVID-19.

In [None]:
# Sources to chart; each gets its own fresh list for bar_col to fill.
source_names = ['Ministry of Health', 'Government of the United Kingdom',
                'National Health Commission', 'National Health Service']
bar_col('source_name', {source: [] for source in source_names}, colour=True)

# Total vaccinations for US, China and UK

Afterwards, we take a closer look at the total vaccinations for some of the leading countries: the US, China and the UK.

In [None]:
# One figure per country: total vaccinations plotted against date.
for country in ('United States', 'China', 'United Kingdom'):
    df_temp = df[df['country'] == country]
    trace = go.Scatter(
        x=df_temp['date'],
        y=df_temp['total_vaccinations'],
        mode='markers+lines',
    )

    fig = go.Figure()
    fig.add_trace(trace)
    fig.update_layout(title_text=country)
    fig.update_xaxes(title='Date')
    fig.update_yaxes(title='Total vaccinations')
    fig.show()

# Global vaccination progress

Next, we analyse globally how the different nations are faring in the total, daily and full vaccinations.

In [None]:
# Each spec: (column, exclusive max % of missing values, exclusive min number
# of rows, threshold below which scatter() thins out countries).
plot_specs = [
    ('total_vaccinations', 10, 15, 1000000),
    ('total_vaccinations_per_hundred', 10, 15, 11),
    ('daily_vaccinations', 2.5, 20, 200000),
    ('daily_vaccinations_per_million', 2.05, 20, 0),
    ('people_fully_vaccinated', 60, 20, 500000),
    ('people_fully_vaccinated_per_hundred', 60, 20, 4),
]
for column, max_null_pct, min_rows, plot_threshold in plot_specs:
    countries, title = nulls(column, max_null_pct, min_rows)
    scatter(countries, title, plot_threshold)

# Global comparison for vaccination type

Subsequently, we visualise which vaccines are most popular among the different nations.

In [None]:
# Rank the vaccine combinations by number of rows and chart the ten largest.
vaccine_tally = pd.Series(Counter(df['vaccines'])).sort_values(ascending=False)
top_ten = vaccine_tally.iloc[:10]
count = dict(zip(top_ten.index, top_ten))
pie(count, 'vaccine', 'days using vaccine', '10 most used vaccine combinations')

# Vaccines used for the 5 leading countries

Then, we visualise the various vaccines used by the five leading countries.

Please note that the "Days using vaccines" label only shows how many days are included in the dataset, not representing the total amount per country in real time.

In [None]:
# One pie chart per country showing how many rows list each vaccine combination.
countries = ['United States', 'China', 'United Kingdom', 'India', 'Israel']
for country in countries:
    # pie() builds its own table from the mapping, so the Counter is passed directly.
    count = Counter(df[df['country'] == country]['vaccines'])
    pie(count, 'Vaccines', 'Days using vaccines', country)

In [None]:
# Split the dash-separated date strings into integer year / month / day columns.
date_parts = [stamp.split('-') for stamp in df['date']]
df['year'] = [int(parts[0]) for parts in date_parts]
df['month'] = [int(parts[1]) for parts in date_parts]
df['day'] = [int(parts[2]) for parts in date_parts]

# Replace every missing value in the frame with 0.
df = df.fillna(0)

# Correlation

The final visualisation is checking out the correlation for each feature in our dataset.

In [None]:
# Correlate the numeric columns only: text columns such as 'country' cannot be
# correlated, and pandas 2.0+ raises on them instead of silently skipping them.
numeric_df = df.select_dtypes(include='number')
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
# Draw on the axes created above so the requested figure size is the one used.
sns.heatmap(numeric_df.corr(), annot=True, ax=ax)
plt.show()

# Predicting total vaccinations

In [None]:
# Target column.
y = df['total_vaccinations']

# Features: drop the target and the columns not used as predictors, then
# replace each categorical column with its indicator (dummy) columns.
X = df.drop(columns=['total_vaccinations', 'people_vaccinated', 'iso_code', 'date', 'source_website'])
for categorical in ('source_name', 'vaccines'):
    indicators = pd.get_dummies(X[categorical])
    X = X.merge(indicators, left_index=True, right_index=True).drop(columns=categorical)
X = X.drop(columns='country')

The last thing we will do is to use machine learning models to predict the "total_vaccinations" feature from the other variables in our data.

The predictors we will use are XGBoost, Random Forest, Elastic Net, Lasso, Ridge and SVR.

In [None]:
# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Names are listed in the same order as the models fitted below.
model_names = ['XGBRegressor', 'RandomForestRegressor', 'ElasticNet', 'Lasso', 'Ridge', 'SVR']
score_list, r2_list, mae_list, mse_list = [], [], [], []

models = [XGBRegressor(), RandomForestRegressor(), ElasticNet(), Lasso(), Ridge(), SVR()]
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Record each metric for this model on the held-out rows.
    score_list.append(model.score(X_test, y_test))
    r2_list.append(r2_score(y_test, y_pred))
    mae_list.append(mean_absolute_error(y_test, y_pred))
    mse_list.append(mean_squared_error(y_test, y_pred))

Here we can evaluate the performance of the different algorithms.

In [None]:
# One row per model, one column per metric.
results = pd.DataFrame({'score': score_list, 'r2': r2_list, 'mae': mae_list, 'mse': mse_list})
results.index = model_names
for col in results:
    temp_results = results.sort_values(by=col, ascending=False)
    # Colour by the column of the sorted frame. Passing the unsorted
    # results[col] Series is matched to the bars by position, so the colours
    # would land on the wrong models.
    fig = px.bar(temp_results, temp_results.index, col,
                 title=col, color=col)
    fig.show()

<img src="https://i.redd.it/g4njpylqb3q41.jpg" width="500px"/>

## Thank you for reading this notebook.
## If you enjoyed this notebook and found it helpful, please give it an upvote and provide feedback, as it would help me make more of these.