In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
import matplotlib.ticker as tick

import seaborn as sns

# Shared colour palette for the notebook's charts.
colors = ["#9DAFB0", "#75E6DA", '#189AB4', '#05445E', '#4D707E', '#56B66B']


import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))


# Intro 

Hi all, welcome to my notebook! This is my attempt to visualize the progress of vaccination across the world.

In this notebook I focus on visualizing the data by continent. If you see any flaw in my EDA, or if you have any feedback or criticism, please feel free to comment. Cheers!

# Data Preparation

In [None]:
# Load the country vaccination CSV from the Kaggle input directory and preview it.
vaccinations_path = "/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv"
df_data = pd.read_csv(vaccinations_path)
df_data

In [None]:
# Convert the `date` column to pandas datetimes.
df_data['date'] = pd.to_datetime(df_data['date'])

In [None]:
# Load the population table and rename its country column to `country`.
df_world = (
    pd.read_csv("/kaggle/input/population-by-country-2020/population_by_country_2020.csv")
    .rename(columns={'Country (or dependency)': "country"})
)
df_world

In [None]:
# Load the country -> continent/region lookup table and preview it.
continents_path = "/kaggle/input/country-mapping-iso-continent-region/continents2.csv"
df_continent = pd.read_csv(continents_path)
df_continent

In [None]:
# Columns kept from the joined continent + population table.
col = ['name', 'region', 'Population (2020)', 'alpha-2', 'alpha-3']

# Left join: every row of the continent lookup is kept; rows whose `name` has no
# matching `country` in the population table get NaN for the population column.
df_countries = df_continent.merge(
    df_world,
    how='left',
    left_on='name',
    right_on='country',
)[col]
df_countries

In [None]:
# Attach region / population / ISO codes to every vaccination row.
# Left join: vaccination rows without a matching country name keep NaN in the added columns.
df_merged = df_data.merge(
    df_countries,
    how='left',
    left_on='country',
    right_on='name',
)
df_merged

In [None]:
def reformat_large_tick_values(tick_val, pos):
    """
    Format a numeric tick value compactly with a K / M / B suffix.

    The value is scaled to thousands, millions or billions, rounded to one
    decimal place, and a trailing ".0" is dropped, e.g. 4500 -> "4.5K",
    4000 -> "4K", 1500000 -> "1.5M". Negative values are scaled by their
    magnitude and keep their sign (-4500 -> "-4.5K"). Values that round up to
    1000 of a unit move to the next unit (999999 -> "1M", not "1000K").

    Parameters
    ----------
    tick_val : int or float
        The value to format.
    pos : int
        Tick position. Unused; present because matplotlib's FuncFormatter
        calls the function with (value, position).

    Returns
    -------
    str
        The formatted label.
    """
    suffixes = ('', 'K', 'M', 'B')
    last_tier = len(suffixes) - 1
    magnitude = abs(tick_val)

    # Pick the largest unit whose threshold the magnitude reaches.
    tier = 0
    while tier < last_tier and magnitude >= 1000 ** (tier + 1):
        tier += 1

    scaled = round(magnitude / 1000 ** tier, 1)

    # Rounding can push the scaled value up to 1000.0 (e.g. 999999 -> 1000.0K);
    # promote it to the next unit so the label reads "1M".
    if scaled >= 1000 and tier < last_tier:
        scaled = round(scaled / 1000, 1)
        tier += 1

    # Keep 4.5 as "4.5" but render 4.0 as "4".
    text = '{:.1f}'.format(scaled)
    if text.endswith('.0'):
        text = text[:-2]

    # Only show a minus sign when the rounded value is non-zero (avoids "-0").
    sign = '-' if tick_val < 0 and scaled != 0 else ''
    return sign + text + suffixes[tier]

# EDA

## Vaccination on Africa

In [None]:
target_country = 'Africa'

# Sum only the plotted column. A bare `.sum()` over every column also tries to add up
# the datetime `date` column, which raises TypeError on pandas >= 2.0.
df = (
    df_merged
    .groupby(['region', 'vaccines'], as_index=False)['daily_vaccinations']
    .sum()
)
# Dedicated name: `df_continent` holds the continent lookup table loaded during data
# preparation, and overwriting it here would break re-running the merge cell.
df_region_vaccines = df[df.region == target_country].sort_values("daily_vaccinations")

fig, ax = plt.subplots(figsize=(16, 8))

ax.barh(
    df_region_vaccines.vaccines,
    df_region_vaccines.daily_vaccinations,
    color='#189AB4',
    edgecolor='black',
)

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)  # draw the grid behind the bars

ax.legend(['Total Vaccination'])
ax.set_title(f"The use of different kind of vaccines in {target_country}", loc='left')
ax.grid(alpha=0.3, axis='x')
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle(f"{target_country}'s Vaccination", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

In [None]:
target_country = 'Africa'
target_vaccine = ['Oxford/AstraZeneca, Sinopharm/Beijing', 'Sinopharm/Beijing', 'Johnson&Johnson', 'Moderna, Oxford/AstraZeneca, Pfizer/BioNTech']
df = df_merged[df_merged.region == target_country]

# One panel per vaccine combination; the grid is sized from the list so both stay in sync.
fig, ax = plt.subplots(1, len(target_vaccine), figsize=(24, 6))
for panel, vaccine in zip(ax, target_vaccine):
    # Aggregate only `daily_vaccinations`: summing every column would also try to sum
    # the datetime `date` column, which raises TypeError on pandas >= 2.0.
    top_countries = (
        df[df.vaccines == vaccine]
        .groupby('country')['daily_vaccinations']
        .sum()
        .sort_values(ascending=False)
        .head(5)
    )
    panel.bar(top_countries.index, top_countries.values, color='#189AB4', edgecolor='black')
    for side in ('right', 'top'):
        panel.spines[side].set_visible(False)
    panel.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
    panel.set_axisbelow(True)  # draw the grid behind the bars
    panel.set_title(vaccine, loc='left')
    panel.grid(alpha=0.3, axis='y')
    panel.set_xlabel("Country")
    panel.set_ylabel("Vaccinations")

# The legend is attached to the last panel, as `plt.legend` did on the current axes.
ax[-1].legend(['Total Vaccination'], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False)
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle(f"{target_country}'s Vaccination Top 4 Vaccines", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

## Vaccination on Europe

In [None]:
target_country = 'Europe'

# Sum only the plotted column. A bare `.sum()` over every column also tries to add up
# the datetime `date` column, which raises TypeError on pandas >= 2.0.
df = (
    df_merged
    .groupby(['region', 'vaccines'], as_index=False)['daily_vaccinations']
    .sum()
)
# Dedicated name: `df_continent` holds the continent lookup table loaded during data
# preparation, and overwriting it here would break re-running the merge cell.
df_region_vaccines = df[df.region == target_country].sort_values("daily_vaccinations")

fig, ax = plt.subplots(figsize=(16, 8))
ax.barh(
    df_region_vaccines.vaccines,
    df_region_vaccines.daily_vaccinations,
    color='#189AB4',
    edgecolor='black',
)

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)  # draw the grid behind the bars

ax.legend(['Total Vaccination'])
ax.set_title(f"The use of different kind of vaccines in {target_country}", loc='left')
ax.grid(alpha=0.3, axis='x')
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle(f"{target_country}'s Vaccination", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

In [None]:
target_country = 'Europe'
target_vaccine = ['Moderna, Oxford/AstraZeneca, Pfizer/BioNTech', 'Oxford/AstraZeneca, Pfizer/BioNTech', 'EpiVacCorona, Sputnik V', 'Moderna, Pfizer/BioNTech']
df = df_merged[df_merged.region == target_country]

# One panel per vaccine combination; the grid is sized from the list so both stay in sync.
fig, ax = plt.subplots(1, len(target_vaccine), figsize=(24, 6))
for panel, vaccine in zip(ax, target_vaccine):
    # Aggregate only `daily_vaccinations`: summing every column would also try to sum
    # the datetime `date` column, which raises TypeError on pandas >= 2.0.
    top_countries = (
        df[df.vaccines == vaccine]
        .groupby('country')['daily_vaccinations']
        .sum()
        .sort_values(ascending=False)
        .head(5)
    )
    panel.bar(top_countries.index, top_countries.values, color='#189AB4', edgecolor='black')
    for side in ('right', 'top'):
        panel.spines[side].set_visible(False)
    panel.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
    panel.set_axisbelow(True)  # draw the grid behind the bars
    panel.set_title(vaccine, loc='left')
    panel.grid(alpha=0.3, axis='y')
    panel.set_xlabel("Country")
    panel.set_ylabel("Vaccinations")

# The legend is attached to the last panel, as `plt.legend` did on the current axes.
ax[-1].legend(['Total Vaccination'], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False)
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle(f"{target_country}'s Vaccination Top 4 Vaccines", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

## Vaccination on Asia

In [None]:
target_country = 'Asia'

# Sum only the plotted column. A bare `.sum()` over every column also tries to add up
# the datetime `date` column, which raises TypeError on pandas >= 2.0.
df = (
    df_merged
    .groupby(['region', 'vaccines'], as_index=False)['daily_vaccinations']
    .sum()
)
# Dedicated name: `df_continent` holds the continent lookup table loaded during data
# preparation, and overwriting it here would break re-running the merge cell.
df_region_vaccines = df[df.region == target_country].sort_values("daily_vaccinations")

fig, ax = plt.subplots(figsize=(16, 8))

ax.barh(
    df_region_vaccines.vaccines,
    df_region_vaccines.daily_vaccinations,
    color='#189AB4',
    edgecolor='black',
)

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)  # draw the grid behind the bars

ax.legend(['Total Vaccination'])
ax.set_title(f"The use of different kind of vaccines in {target_country}", loc='left')
ax.grid(alpha=0.3, axis='x')
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle(f"{target_country}'s Vaccination", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

In [None]:
target_country = 'Asia'
target_vaccine = ['Sinopharm/Beijing, Sinopharm/Wuhan, Sinovac', 'Covaxin, Oxford/AstraZeneca', 'Sinovac', 'Moderna, Pfizer/BioNTech']
df = df_merged[df_merged.region == target_country]

# One panel per vaccine combination; the grid is sized from the list so both stay in sync.
fig, ax = plt.subplots(1, len(target_vaccine), figsize=(24, 6))
for panel, vaccine in zip(ax, target_vaccine):
    # Aggregate only `daily_vaccinations`: summing every column would also try to sum
    # the datetime `date` column, which raises TypeError on pandas >= 2.0.
    top_countries = (
        df[df.vaccines == vaccine]
        .groupby('country')['daily_vaccinations']
        .sum()
        .sort_values(ascending=False)
        .head(5)
    )
    panel.bar(top_countries.index, top_countries.values, color='#189AB4', edgecolor='black')
    for side in ('right', 'top'):
        panel.spines[side].set_visible(False)
    panel.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
    panel.set_axisbelow(True)  # draw the grid behind the bars
    panel.set_title(vaccine, loc='left')
    panel.grid(alpha=0.3, axis='y')
    panel.set_xlabel("Country")
    panel.set_ylabel("Vaccinations")

# The legend is attached to the last panel, as `plt.legend` did on the current axes.
ax[-1].legend(['Total Vaccination'], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False)
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle(f"{target_country}'s Vaccination Top 4 Vaccines", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

## Vaccination in the Americas

In [None]:
target_country = 'Americas'

# Sum only the plotted column. A bare `.sum()` over every column also tries to add up
# the datetime `date` column, which raises TypeError on pandas >= 2.0.
df = (
    df_merged
    .groupby(['region', 'vaccines'], as_index=False)['daily_vaccinations']
    .sum()
)
# Dedicated name: `df_continent` holds the continent lookup table loaded during data
# preparation, and overwriting it here would break re-running the merge cell.
df_region_vaccines = df[df.region == target_country].sort_values("daily_vaccinations")

fig, ax = plt.subplots(figsize=(16, 8))

ax.barh(
    df_region_vaccines.vaccines,
    df_region_vaccines.daily_vaccinations,
    color='#189AB4',
    edgecolor='black',
)

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)  # draw the grid behind the bars

ax.legend(['Total Vaccination'])
ax.set_title(f"The use of different kind of vaccines in {target_country}", loc='left')
ax.grid(alpha=0.3, axis='x')
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle(f"{target_country}'s Vaccination", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

In [None]:
target_country = 'Americas'
target_vaccine = ['Johnson&Johnson, Moderna, Pfizer/BioNTech', 'Oxford/AstraZeneca, Sinovac', 'Pfizer/BioNTech, Sinovac', 'Oxford/AstraZeneca, Pfizer/BioNTech, Sputnik V']
df = df_merged[df_merged.region == target_country]

# One panel per vaccine combination; the grid is sized from the list so both stay in sync.
fig, ax = plt.subplots(1, len(target_vaccine), figsize=(24, 6))
for panel, vaccine in zip(ax, target_vaccine):
    # Aggregate only `daily_vaccinations`: summing every column would also try to sum
    # the datetime `date` column, which raises TypeError on pandas >= 2.0.
    top_countries = (
        df[df.vaccines == vaccine]
        .groupby('country')['daily_vaccinations']
        .sum()
        .sort_values(ascending=False)
        .head(5)
    )
    panel.bar(top_countries.index, top_countries.values, color='#189AB4', edgecolor='black')
    for side in ('right', 'top'):
        panel.spines[side].set_visible(False)
    panel.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
    panel.set_axisbelow(True)  # draw the grid behind the bars
    panel.set_title(vaccine, loc='left')
    panel.grid(alpha=0.3, axis='y')
    panel.set_xlabel("Country")
    panel.set_ylabel("Vaccinations")

# The legend is attached to the last panel, as `plt.legend` did on the current axes.
ax[-1].legend(['Total Vaccination'], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False)
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle(f"{target_country}'s Vaccination Top 4 Vaccines", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

## Vaccination in Oceania

In [None]:
target_country = 'Oceania'

# Sum only the plotted column. A bare `.sum()` over every column also tries to add up
# the datetime `date` column, which raises TypeError on pandas >= 2.0.
df = (
    df_merged
    .groupby(['region', 'vaccines'], as_index=False)['daily_vaccinations']
    .sum()
)
# Dedicated name: `df_continent` holds the continent lookup table loaded during data
# preparation, and overwriting it here would break re-running the merge cell.
df_region_vaccines = df[df.region == target_country].sort_values("daily_vaccinations")

fig, ax = plt.subplots(figsize=(12, 6))

ax.barh(
    df_region_vaccines.vaccines,
    df_region_vaccines.daily_vaccinations,
    color='#189AB4',
    edgecolor='black',
)

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)  # draw the grid behind the bars

ax.legend(['Total Vaccination'])
ax.set_title(f"The use of different kind of vaccines in {target_country}", loc='left')
ax.grid(alpha=0.3, axis='x')
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle(f"{target_country}'s Vaccination", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

## World Vaccination Progress

In [None]:
# Per-region totals. Only the columns that are plotted are aggregated: a bare `.sum()`
# over every column also tries to sum the datetime `date` column (TypeError on
# pandas >= 2.0) and concatenates unrelated string columns.
# NOTE(review): the sum of `daily_vaccinations` is labelled "people vaccinated" below;
# it presumably counts administered doses rather than distinct people -- confirm.
df1 = df_merged.groupby('region')[['daily_vaccinations']].sum()
df2 = df_countries.groupby('region')[['Population (2020)']].sum()

fig, ax = plt.subplots(figsize=(20, 8))
bars1 = ax.bar(df1.index, df1.daily_vaccinations, color='#189AB4')
bars2 = ax.bar(df2.index, df2['Population (2020)'], color='#9DAFB0', alpha=0.5)

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)  # draw the grid behind the bars
ax.set_xlabel("Continent", fontweight='bold')
ax.set_ylabel("People vaccinated", fontweight='bold')

# Label each vaccination bar with its compact-formatted value.
for bar in bars1:
    height = bar.get_height()
    ax.annotate(
        '{}'.format(reformat_large_tick_values(height, 0)),
        xy=(bar.get_x() + bar.get_width() / 2, height),
        ha='center',
        va='bottom',
    )

ax.legend(['Total people vaccinated', 'Total population'])
ax.set_title("The number of people vaccinated for every continent approximately", loc='left')
ax.grid(alpha=0.3, axis='y')
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle("World Vaccination Progress", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()


In [None]:
# Work on an explicit copy so adding a column does not write into a slice of the
# previous frame (which triggers SettingWithCopyWarning).
df1 = df1[['daily_vaccinations']].copy()
# Align the population totals to df1's regions by index label, so every percentage is
# computed against its own region even if the two frames list different regions.
total = df2['Population (2020)'].reindex(df1.index)
df1['not_vaccinated'] = total - df1['daily_vaccinations']

# Index-aligned percentages (the positional zip() pairing relied on identical ordering).
bar_vax = df1['daily_vaccinations'] / total * 100
bar_not = df1['not_vaccinated'] / total * 100

fig, ax = plt.subplots(figsize=(20, 8))
bars1 = ax.bar(df1.index, bar_vax, color='#189AB4')
# Stack the remainder on top of the vaccinated share.
bars2 = ax.bar(df1.index, bar_not, color='#9DAFB0', bottom=bar_vax, alpha=0.5)

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)  # draw the grid behind the bars
ax.set_xlabel("Continent", fontweight='bold')
ax.set_ylabel("% People vaccinated", fontweight='bold')

# Label each vaccinated segment with its percentage.
for bar in bars1:
    height = bar.get_height()
    ax.annotate(
        '{}%'.format(reformat_large_tick_values(height, 0)),
        xy=(bar.get_x() + bar.get_width() / 2, height),
        ha='center',
        va='bottom',
    )

ax.legend(['% people vaccinated', '% not vaxinated'])
ax.set_title("The number of percentage of people vaccinated for every continent approximately", loc='left')
ax.grid(alpha=0.3, axis='y')
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle("World Vaccination Progress", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()

In [None]:
# Daily totals per region and date. Only the plotted column is aggregated, which avoids
# summing (string-concatenating) every unrelated column of the merged frame.
df = (
    df_merged
    .groupby(['region', 'date'], as_index=False)['daily_vaccinations']
    .sum()
)

colors = ["#9DAFB0", "#75E6DA", '#189AB4', '#F67B50', '#A82810']


fig, ax = plt.subplots(figsize=(20, 8))
sns.lineplot(
    data=df,
    x='date',
    y='daily_vaccinations',
    hue='region',
    # One colour per region actually present, so the palette length always matches.
    palette=colors[:df.region.nunique()],
    ax=ax,  # draw on this figure's axes explicitly instead of relying on the current axes
)

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)  # draw the grid behind the lines
ax.set_xlabel("Date", fontweight='bold')
ax.set_ylabel("Daily Vaccinations", fontweight='bold')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False)

ax.set_title("The number of daily vaccinations accross multiple continent", loc='left')
ax.grid(alpha=0.3, axis='y')
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle("World Vaccination Progress", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()


In [None]:
# The ten countries with the highest recorded `total_vaccinations`.
# The column is selected before taking the max so the aggregation does not run over
# every unrelated (and non-numeric) column. A dedicated name is used because `col`
# holds the column list used when building `df_countries`.
top_countries = (
    df_merged
    .groupby('country')['total_vaccinations']
    .max()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)


fig, ax = plt.subplots(figsize=(20, 8))
df = df_merged[df_merged.country.isin(top_countries)]

sns.lineplot(data=df, x='date', y='total_vaccinations', hue='country', ax=ax, palette="hls")

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)  # draw the grid behind the lines
ax.set_xlabel("Date", fontweight='bold')
ax.set_ylabel("Vaccinations", fontweight='bold')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False)

ax.set_title("The number of total vaccinations on top 10 country", loc='left')
ax.grid(alpha=0.3, axis='y')
# `x` is a figure-fraction coordinate, so pass it as a number rather than the string '0.125'.
fig.suptitle("World Vaccination Progress (Top 10 Country)", ha='left', x=0.125, y=0.96, fontsize=18, fontweight='bold')
plt.show()