# Introduction

Hello, I'm a complete beginner in data science, and this is my first dataset submission. Thanks to Gabriel Preda for releasing this [dataset](https://www.kaggle.com/gpreda/covid-world-vaccination-progress)!

Insights gathered:
 - What types of vaccines are being used around the world?
 - Which countries are using which vaccines?
 - Which countries have vaccinated the most people?
 - Which countries have vaccinated the highest percentage of their population?

# What vaccines are being used around the world?

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display

covid_data = pd.read_csv('../input/covid-world-vaccination-progress/country_vaccinations.csv')

# Each row's `vaccines` field is a comma-separated list of vaccine names.
# Flatten every row's list into one list of trimmed names. Rows with a missing
# `vaccines` value are skipped so that str.split is never called on NaN.
all_vaccines = [
    name.strip()
    for vaccine_list in covid_data.vaccines.dropna()
    for name in vaccine_list.split(',')
]

# Sorted list of the distinct names (an empty dataset simply yields []).
all_unique_vaccines = sorted(set(all_vaccines))

# Interpolate the count up front instead of calling str.format() on the
# finished text, so a vaccine name containing '{' or '}' cannot break the output.
message = f'There are {len(all_unique_vaccines)} unique vaccines being used around the world:\n'
message += ''.join(f" - {vaccine}\n" for vaccine in all_unique_vaccines)

print(message)

## How many countries are using each vaccine?

In [None]:
# One row per distinct (country, vaccines) combination found in the data.
vaccines_per_country = covid_data.groupby(['country', 'vaccines']).size().reset_index()
vaccines_per_country = vaccines_per_country.drop(columns=0)  # Drop the group-size column (which is named 0)

# Collect the set of vaccine names seen for each country. Some countries use
# multiple vaccine types, and a country may appear with more than one distinct
# `vaccines` string; using a set ensures each vaccine is counted at most once
# per country.
vaccines_by_country = {}
for row in vaccines_per_country.itertuples(index=False):
    names = vaccines_by_country.setdefault(row.country, set())
    names.update(name.strip() for name in row.vaccines.split(','))

# Start every known vaccine at zero so unused ones still appear in the table.
vaccines_to_num_countries = {vaccine: 0 for vaccine in all_unique_vaccines}
for names in vaccines_by_country.values():
    for name in names:
        vaccines_to_num_countries[name] += 1

# Materialise the dict views as lists before handing them to pandas.
vaccine_usage_df = pd.DataFrame(data={
    'Vaccine': list(vaccines_to_num_countries.keys()),
    '# Countries': list(vaccines_to_num_countries.values()),
})

# Last expression of the cell: displayed as the cell output.
vaccine_usage_df.sort_values(by=['# Countries'], ascending=False).reset_index(drop=True)

In [None]:
columns = ['total_vaccinations', 'people_fully_vaccinated']

# Copy of the data with missing values in the two metric columns replaced by 0;
# all other columns are left untouched.
cleaned_covid_data = covid_data.copy()
cleaned_covid_data[columns] = cleaned_covid_data[columns].fillna(0)

# Both metrics are cumulative counts reported per date (per the dataset's
# column descriptions), so adding them up across dates would count the same
# doses many times over. The per-country maximum is the highest total reported.
# Only the metric columns are selected before aggregating, so string columns
# never take part in the reduction.
vaccinations_by_country = cleaned_covid_data.groupby('country')[columns].max()

# Top 10 countries by total vaccinations

In [None]:
from IPython.core.display import HTML

# Ten largest per-country values of each metric, each wrapped in a
# single-column frame whose column name is the label shown in the table.
top_by_total_series = vaccinations_by_country['total_vaccinations'].sort_values(ascending=False).head(10)
top_by_total_df = top_by_total_series.to_frame(name='Total Vaccinations')

top_by_full_series = vaccinations_by_country['people_fully_vaccinated'].sort_values(ascending=False).head(10)
top_by_full_df = top_by_full_series.to_frame(name='Total Full Vaccinations')

# Render the two tables side by side inside one HTML table, separated by a
# blank white cell. The float format (thousands separators, no decimals) is
# only active inside this context, so to_html() must be called within it.
with pd.option_context('display.float_format', '{:,.0f}'.format):
    table_cells = [
        f'<td>{top_by_total_df.to_html()}</td>',
        '<td style="background-color: white"></td>',
        f'<td>{top_by_full_df.to_html()}</td>',
    ]
    display(HTML('<table><tr>' + ''.join(table_cells) + '</tr></table>'))


# Top 10 countries by percentage of population vaccinated

In [None]:
columns = ['total_vaccinations_per_hundred', 'people_fully_vaccinated_per_hundred']

# Order rows by date within each country so that "last" means "most recent".
# Assumes `date` sorts chronologically (e.g. ISO YYYY-MM-DD strings), which the
# previous max()-based lookup of the latest date relied on as well.
cleaned_covid_data = covid_data.sort_values(['country', 'date'])

# GroupBy.last() returns, per column, the last non-missing value in each group.
# That gives every country its most recently *reported* figure, instead of a 0
# whenever the row for the latest date happens to have no value. Countries with
# no reported value at all fall back to 0. The result is indexed by country.
latest_covid_data = cleaned_covid_data.groupby('country')[columns].last().fillna(0)

# Rename columns.
# NOTE(review): total_vaccinations_per_hundred presumably counts doses per 100
# people, so it may overstate the share of people vaccinated - confirm against
# the dataset documentation before relying on the first label.
total_column, total_full_column = '% Population Vaccinated', '% Population Fully Vaccinated'
latest_covid_data = latest_covid_data.rename(columns={
    'total_vaccinations_per_hundred': total_column,
    'people_fully_vaccinated_per_hundred': total_full_column,
})

# Get top 10 of each category as single-column frames.
top_by_total_percent = latest_covid_data.loc[:, [total_column]].sort_values(by=total_column, ascending=False).iloc[:10]
top_by_full_percent = latest_covid_data.loc[:, [total_full_column]].sort_values(by=total_full_column, ascending=False).iloc[:10]

# Render the two tables side by side; the float format only applies inside
# this context, so to_html() is called within it.
with pd.option_context('display.float_format', '{:,.0f}'.format):
    display(HTML(
        '<table><tr>'
            f'<td>{top_by_total_percent.to_html()}</td>'
            '<td style="background-color: white"></td>'
            f'<td>{top_by_full_percent.to_html()}</td>'
        '</tr></table>'
    ))