![](https://1gew6o3qn6vx9kp3s42ge0y1-wpengine.netdna-ssl.com/wp-content/uploads/prod/2020/12/GettyImages-1248797862-scaled.jpg)

## Explore the dataset

Which columns does the dataset include?

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import plotly.express as px

# Remove the column-width limit so long cell values are displayed untruncated.
pd.set_option('display.max_colwidth', None)

In [None]:
# Parse only the header row (nrows=0) so that no data rows are loaded.
header = pd.read_csv(
    '/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv',
    nrows=0,
).columns
# Show the column names as a plain list.
header.tolist()

Load only some of the columns from the original CSV. The table below shows what a record looks like.

In [None]:
# Columns kept from the raw CSV; all other columns are skipped while parsing.
cols = [
    'country',
    'date',
    'total_vaccinations',
    'daily_vaccinations',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'vaccines',
]

data = pd.read_csv(
    '/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv',
    usecols=cols,
)
# Display the loaded frame.
data

The data summary shows that there are many '*NaN*' values in the dataset.

In [None]:
# Print a per-column summary (dtype and non-null count) of the loaded frame.
data.info()

The '*vaccines*' column has many values that combine multiple vaccines separated by commas.

In [None]:
# Count how often each distinct 'vaccines' string occurs and show it as a table.
data['vaccines'].value_counts().to_frame()

## Process vaccines data

Firstly, let's replace all '*NaN*' with 0.

In [None]:
# No column subset is given, so missing values in every column are replaced by 0.
data = data.fillna(0)
# Display the filled frame.
data

Prepare a small dataset with only 4 columns: '*country*', '*date*', '*daily_vaccinations*' and '*vaccines*'.

For each row of this dataset, if the '*vaccines*' field includes multiple vaccines, create a new row for each vaccine and assign an averaged '*daily_vaccinations*' value to the '*daily_vaccinations*' field of each new row. Finally, collect the new rows into a new dataset, '*averaged_data*'.

**Note**: This average method is not very reasonable and just for showing the statistics of each single vaccine.

In [None]:
# Split each row's daily vaccinations evenly across the vaccines listed for it,
# producing one output row per (country, date, vaccine).
subdata = data[['country', 'date', 'daily_vaccinations', 'vaccines']]
averaged_data = []

for row in subdata.itertuples():
    # The 'vaccines' field is a comma-separated list; strip the padding once here.
    vaccines = [vac.strip() for vac in row.vaccines.split(',')]
    number = len(vaccines)
    # Round the per-vaccine share DOWN. Rounding up (ceil) can over-allocate and
    # leave a negative remainder, e.g. 1 dose over 3 vaccines -> 1, 1, -1.
    average = float(math.floor(row.daily_vaccinations / number))
    # The last vaccine absorbs whatever is left so the shares sum to the original.
    remain = row.daily_vaccinations - average * (number - 1)

    # Identify the last vaccine by position rather than by value, so a repeated
    # name cannot be mistaken for the final entry.
    for position, vac in enumerate(vaccines):
        share = remain if position == number - 1 else average
        averaged_data.append((row.country, row.date, share, vac))

averaged_data = pd.DataFrame(averaged_data, columns=['country', 'date', 'daily_vaccinations', 'vaccines'])
# Display the expanded frame.
averaged_data

Compute the total vaccinations for each vaccine from the daily vaccinations data. The averaged data also shows how many countries, and which ones, are using each vaccine.



In [None]:
# Total (averaged) daily vaccinations per vaccine, largest first.
vaccine_data = (
    averaged_data.groupby('vaccines')['daily_vaccinations']
    .sum()
    .to_frame()
    .sort_values(by='daily_vaccinations', ascending=False)
)

# For every vaccine, in the sorted order above, collect the distinct countries
# that have at least one row for it.
has_country = averaged_data['country'].notnull()
countries_per_vaccine = [
    averaged_data.loc[has_country & (averaged_data['vaccines'] == vaccine_name), 'country'].unique()
    for vaccine_name in vaccine_data.index
]

vaccine_data['total_countries'] = [found.size for found in countries_per_vaccine]
vaccine_data['countries'] = [', '.join(found) for found in countries_per_vaccine]
# Display the per-vaccine summary.
vaccine_data

## Visualization

In [None]:
# Bar chart: total vaccinations attributed to each vaccine, one colour per vaccine.
fig = px.bar(
    title='How Many Vaccinations With a Certain Vaccine',
    x=vaccine_data.index,
    y=vaccine_data['daily_vaccinations'],
    color=vaccine_data.index,
    color_discrete_sequence=px.colors.sequential.Electric,
    labels={"x": "Vaccine", "y": "Number of Vaccinations"},
)
fig.show()

In [None]:
# Bar chart: number of countries using each vaccine, one colour per vaccine.
fig = px.bar(
    title='How Many Countries With a certain Vaccine',
    x=vaccine_data.index,
    y=vaccine_data['total_countries'],
    color=vaccine_data.index,
    color_discrete_sequence=px.colors.sequential.Electric,
    labels={"x": "Vaccine", "y": "Number of Countries"},
)
fig.show()

In [None]:
# Build a date x vaccine table of daily vaccinations.
dates = averaged_data.date.unique()
vaccines = vaccine_data.index
# One row per date and one column per vaccine, initially empty (NaN).
vaccine_daily_data = pd.DataFrame(index=dates, columns=vaccines).sort_index()

# Split the frame by date in a single pass instead of re-filtering every row
# of averaged_data once per date.
for date, rows_on_date in averaged_data.groupby('date'):
    # Assigning a Series aligns on vaccine name; vaccines with no rows on this
    # date are left as NaN.
    vaccine_daily_data.loc[date] = rows_on_date.groupby('vaccines')['daily_vaccinations'].sum()

In [None]:
# Line chart: one line per vaccine, showing daily vaccinations over time.
fig = px.line(
    vaccine_daily_data,
    x=vaccine_daily_data.index,
    y=vaccine_daily_data.columns,
    title='Daily Vaccinations Trend - Against Each Vaccine',
)

fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Number of Daily Vaccinations',
)
fig.show()

In [None]:
# Build running totals per vaccine from the daily table.
# Work on a copy: plain assignment would only alias vaccine_daily_data, so the
# running totals written below would overwrite the daily figures as well.
vaccine_cumulative_data = vaccine_daily_data.copy()
dates = np.sort(dates)

# Walk consecutive date pairs; each date's total is the previous date's total
# plus that date's daily value. fill_value=0 treats a missing value on one side
# as 0 (the result stays NaN only when both sides are missing).
for previous_date, current_date in zip(dates[:-1], dates[1:]):
    vaccine_cumulative_data.loc[current_date] = vaccine_cumulative_data.loc[previous_date].add(
        vaccine_daily_data.loc[current_date], fill_value=0)

In [None]:
# Line chart: one line per vaccine, showing cumulative vaccinations over time.
fig = px.line(
    vaccine_cumulative_data,
    x=vaccine_cumulative_data.index,
    y=vaccine_daily_data.columns,
    title='Cumulative Vaccinations Trend - Against Each Vaccine',
)

fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Number of Cumulative Vaccinations',
)
fig.show()

In [None]:
# Number of countries to keep in each "top countries" ranking.
top = 30

# Highest reported total_vaccinations per country, largest first, cut to `top`.
vaccinations_data = (
    data.groupby('country')['total_vaccinations']
    .max()
    .to_frame()
    .sort_values(by='total_vaccinations', ascending=False)
    .iloc[:top]
)

In [None]:
# Bar chart: the top countries ranked by total vaccinations.
fig = px.bar(
    title=f'Top {top} Countries - Total Vaccinations',
    x=vaccinations_data.index,
    y=vaccinations_data['total_vaccinations'],
    color=vaccinations_data.index,
    color_discrete_sequence=px.colors.sequential.Electric,
    labels={"x": "Country", "y": "Number of Vaccinations"},
)
fig.show()

In [None]:
# Highest total_vaccinations_per_hundred per country, largest first, cut to `top`.
vaccinations_ratio_data = (
    data.groupby('country')['total_vaccinations_per_hundred']
    .max()
    .to_frame()
    .sort_values(by='total_vaccinations_per_hundred', ascending=False)
    .iloc[:top]
)

# Bar chart: the top countries ranked by vaccinations per hundred people.
fig = px.bar(
    title=f'Top {top} Countries - Vaccinations Per Hundred',
    x=vaccinations_ratio_data.index,
    y=vaccinations_ratio_data['total_vaccinations_per_hundred'],
    color=vaccinations_ratio_data.index,
    color_discrete_sequence=px.colors.sequential.Electric,
    labels={"x": "Country", "y": "Vaccinations Per Hundred"},
)
fig.show()

In [None]:
# Highest people_vaccinated_per_hundred per country, largest first, cut to `top`.
vaccinated_ratio_data = data.groupby('country')['people_vaccinated_per_hundred'].max()
vaccinated_ratio_data = pd.DataFrame(vaccinated_ratio_data)
vaccinated_ratio_data = vaccinated_ratio_data.sort_values(ascending=False, by='people_vaccinated_per_hundred').iloc[:top]

# Bar chart: the top countries ranked by people vaccinated per hundred.
# The title and axis label previously misspelled "People" as "Peple".
fig = px.bar(x=vaccinated_ratio_data.index, y=vaccinated_ratio_data['people_vaccinated_per_hundred'],
             color=vaccinated_ratio_data.index,
             title=f'Top {top} Countries - People Vaccinated Per Hundred',
             labels={"x": "Country", "y": "People Vaccinated Per Hundred"},
             color_discrete_sequence=px.colors.sequential.Electric)
fig.show()

In [None]:
# Highest people_fully_vaccinated_per_hundred per country, largest first, cut to `top`.
fully_vaccinated_ratio_data = (
    data.groupby('country')['people_fully_vaccinated_per_hundred']
    .max()
    .to_frame()
    .sort_values(by='people_fully_vaccinated_per_hundred', ascending=False)
    .iloc[:top]
)

# Bar chart: the top countries ranked by people fully vaccinated per hundred.
fig = px.bar(
    title=f'Top {top} Countries - People Fully Vaccinated Per Hundred',
    x=fully_vaccinated_ratio_data.index,
    y=fully_vaccinated_ratio_data['people_fully_vaccinated_per_hundred'],
    color=fully_vaccinated_ratio_data.index,
    color_discrete_sequence=px.colors.sequential.Electric,
    labels={"x": "Country", "y": "People Fully Vaccinated Per Hundred"},
)
fig.show()