# Last update on the 6th of April 2021
# Coronavirus vaccinations in the world

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as ex

# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below.
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the country vaccination records shipped with the Kaggle dataset
# and display the resulting frame.
vaccinations_csv = '/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv'
full_df = pd.read_csv(vaccinations_csv)
full_df

In [None]:
# Print the column dtypes and non-null counts to see which columns have missing data.
full_df.info()

**1. Which country has vaccinated the most people?
How many daily vaccinations are there per country when we rank countries by the mean and by the sum of their daily vaccinations? And which countries are in the top 10?**

Several columns have a lot of missing data. The most important columns for us are therefore: country, date and vaccines (which have no missing values), together with daily_vaccinations_per_million and daily_vaccinations.

Exploring the list of countries in the table shows that there are some duplicates: there is data for England, Scotland, Wales and Northern Ireland on the one hand, and for the UK as a whole on the other. We delete the rows for the four constituent countries so that only the UK data is used.

In [None]:
# Show every distinct country name present in the data (before filtering).
country_list = full_df.country.unique().tolist()
print(country_list)

# England, Scotland, Wales and Northern Ireland are reported separately AND as
# part of the United Kingdom; drop the four constituent rows so the UK is not
# double counted. `isin` is the vectorised equivalent of the per-row membership
# test and matches the filtering style used in the later cells.
uk_constituents = ['England', 'Scotland', 'Wales', 'Northern Ireland']
full_df = full_df[~full_df.country.isin(uk_constituents)]

In [None]:
# Keep only the rows that report a daily vaccination count and parse the
# date strings (YYYY-MM-DD) into datetimes.
daily = (
    full_df.loc[:, ['country', 'date', 'vaccines', 'daily_vaccinations']]
    .dropna(subset=['daily_vaccinations'])
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
)

# Per-country total and mean of the daily vaccination counts (indexed by country).
daily_agg = daily.groupby('country')['daily_vaccinations'].agg(
    sum_vaccination_amt='sum',
    mean_vaccination_amt='mean',
)

# Attach the per-country aggregates to every daily row; the aggregate values
# are therefore repeated on each row of the same country.
full_daily = pd.merge(daily, daily_agg, on='country')
full_daily

In [None]:
# Country names ordered by mean daily vaccinations, highest first.
sort = daily_agg.sort_values('mean_vaccination_amt', ascending=False).reset_index()
countries = sort['country'].tolist()

# Daily rows of the ten countries with the highest mean, in the same ranking order.
graphing_mean = full_daily.sort_values('mean_vaccination_amt', ascending=False)
in_top10_mean = graphing_mean['country'].isin(countries[:10])
top10_mean = graphing_mean[in_top10_mean]

# One line per country showing its daily vaccinations over time.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=top10_mean, x='date', y='daily_vaccinations', hue='country', ax=ax)
ax.legend(ncol=3, frameon=False, title='')
plt.xticks(rotation=45)
ax.set_title('Top 10 countries vaccinations (sorted by mean values)')

In [None]:
# Country names ordered by the sum of their daily vaccinations, highest first.
sort_sum = daily_agg.sort_values('sum_vaccination_amt', ascending=False).reset_index()
countries_sum = sort_sum['country'].tolist()

# Daily rows ordered by the same ranking, restricted to the top 10 and top 40 countries.
graphing_sum = full_daily.sort_values('sum_vaccination_amt', ascending=False)
top10_sum = graphing_sum[graphing_sum['country'].isin(countries_sum[:10])]
top40_sum = graphing_sum[graphing_sum['country'].isin(countries_sum[:40])]

# One line per top-10 country showing its daily vaccinations over time.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=top10_sum, x='date', y='daily_vaccinations', hue='country', ax=ax)
ax.legend(ncol=3, frameon=False, title='')
plt.xticks(rotation=45)
ax.set_title('Top 10 countries vaccinations (sorted by total amount of vaccinations)')

To answer the first question: the first 7 positions are held by the same countries in both rankings - the United States, China, India, the UK, Brazil, Turkey and Germany. The next positions differ slightly and include countries such as France, Russia, Indonesia and Morocco.
The main trend here is that the number of daily vaccinations keeps increasing, even though there are some local peaks for China. This is understandable: the larger the population, the more patients there are.

**2. How many vaccinations are there per country per million people (total value)? In other words, which countries have vaccinated the biggest proportion of their populations?**

In [None]:
# Keep only the rows that report daily vaccinations per million and parse the
# date strings (YYYY-MM-DD) into datetimes.
daily_mil = (
    full_df.loc[:, ['country', 'date', 'daily_vaccinations_per_million']]
    .dropna(subset=['daily_vaccinations_per_million'])
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
)

# Per-country sum and mean of the per-million daily values (indexed by country).
daily_agg_mil = daily_mil.groupby('country')['daily_vaccinations_per_million'].agg(
    sum_vaccination_amt='sum',
    mean_vaccination_amt='mean',
)

# Attach the per-country aggregates to every daily row.
full_daily_mil = pd.merge(daily_mil, daily_agg_mil, on='country')

# Country names ordered by the per-million sum, highest first.
sorted_sum = daily_agg_mil.sort_values('sum_vaccination_amt', ascending=False).reset_index()
countries_sum_mil = sorted_sum['country'].tolist()

# Daily rows ordered by the same ranking, restricted to the top 10 and top 40 countries.
graphing_sum_mil = full_daily_mil.sort_values('sum_vaccination_amt', ascending=False)
top10_sum_mil = graphing_sum_mil[graphing_sum_mil['country'].isin(countries_sum_mil[:10])]
top40_sum_mil = graphing_sum_mil[graphing_sum_mil['country'].isin(countries_sum_mil[:40])]

# One line per top-10 country showing its daily vaccinations per million over time.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=top10_sum_mil, x='date', y='daily_vaccinations_per_million',
             hue='country', ax=ax)
ax.legend(ncol=3, frameon=False, title='')
plt.xticks(rotation=45)
ax.set_title('Top 10 countries vaccinations (sorted by total amount of vaccinations per million)')

If we compare this with the 2 previous graphs, then we can see that only Israel leads both in absolute numbers and in vaccinations per million (this was also true for the United Kingdom until the end of March). Huge countries such as the US, India or China are far from vaccinating a large percentage of their populations. The best job so far has been done by Gibraltar, Bhutan and Israel.

Let's have a look at the total number of vaccinations per country and the total vaccinations per million per country, and see where the leaders are and how far behind the rest of the top 40 countries are.

In [None]:
# top40_sum holds one row per (country, date) with the country's total repeated
# on every row. Keep a single row per country so each bar is drawn from one
# value instead of seaborn averaging (and bootstrapping error bars over) many
# identical duplicates. drop_duplicates keeps first occurrences, so the
# descending order of the bars is preserved.
top40_totals = top40_sum.drop_duplicates(subset='country')

# Horizontal bar chart: one bar per country, longest (largest total) at the top.
plt.figure(figsize=(8,14))
sns.barplot(x=top40_totals['sum_vaccination_amt'], y=top40_totals['country'], palette="RdBu")
plt.xlabel("sum_vaccination_amt")
plt.ylabel("Country")
plt.title("Total vaccinations per country, top 40")
plt.show()

In [None]:
# top40_sum_mil holds one row per (country, date) with the country's per-million
# total repeated on every row. Keep a single row per country so each bar is
# drawn from one value instead of seaborn averaging (and bootstrapping error
# bars over) many identical duplicates. drop_duplicates keeps first
# occurrences, so the descending order of the bars is preserved.
top40_totals_mil = top40_sum_mil.drop_duplicates(subset='country')

# Horizontal bar chart: one bar per country, largest per-million total at the top.
plt.figure(figsize=(8,16))
sns.barplot(x=top40_totals_mil['sum_vaccination_amt'], y=top40_totals_mil['country'], palette="RdBu")
plt.xlabel("sum_vaccination_amt_per_million")
plt.ylabel("Country")
plt.title("Vaccinations per country per million, top 40")
plt.show()

In [None]:
# Total doses administered per vaccine combination.
#
# full_daily has one row per (country, date), and 'sum_vaccination_amt' is the
# country's total repeated on each of those rows. Summing that column would
# count every country's total once per reporting day, so sum the per-day
# 'daily_vaccinations' instead. The result keeps the 'sum_vaccination_amt'
# column name that the plotting cell reads.
data = (
    full_daily.groupby('vaccines')['daily_vaccinations']
    .sum()
    .rename('sum_vaccination_amt')
    .reset_index()
)
# Largest totals first; the second reset_index keeps the pre-sort position as
# an 'index' column, as before.
data = data.sort_values(by = 'sum_vaccination_amt', ascending = False).reset_index()

In [None]:
# Horizontal bar chart with one bar per vaccine combination found in `data`.
plt.figure(figsize=(8, 16))
sns.barplot(data=data, x='sum_vaccination_amt', y='vaccines', palette="RdBu")
plt.gca().set(
    xlabel="# doses of vaccines combination",
    ylabel="Vaccines combinations",
    title="Vaccine combinations and their numbers in use",
)
plt.show()

**3. What vaccines are used and in which countries?**

In [None]:
# For each vaccine combination, collect the distinct countries that report it.
# Result: one row per combination with an array of country names.
vaccines = (
    full_daily[['country', 'vaccines']]
    .groupby('vaccines')['country']
    .unique()
    .to_frame()
    .reset_index()
)
vaccines

Here is the list of vaccine combinations and countries that use those combinations. In the next cell you can see the same data on the map.

In [None]:
# Map each vaccine combination to the list of 'country' entries of its rows
# in `vaccines` (each entry is itself the array of countries for that row).
d = {}
for combo in vaccines["vaccines"].unique():
    matching_rows = vaccines.loc[vaccines["vaccines"] == combo, "country"]
    d[combo] = matching_rows.tolist()
d

In [None]:
title = "Popular Vaccines"
data = full_daily

# full_daily has one row per (country, date), but the map only needs each
# country/vaccine pairing once. Deduplicate before plotting so the figure does
# not carry the same location repeated for every reporting day.
# drop_duplicates keeps first occurrences, so the colour/legend order is unchanged.
fig = ex.choropleth(
    data.drop_duplicates(subset=['country', 'vaccines']),
    locations="country",
    locationmode='country names',  # match locations by country name
    color="vaccines",              # one colour per vaccine combination
    hover_name="country",
)

# Centre the title and lay the legend out horizontally.
fig.update_layout(
    title=title,
    title_x=0.5,
    legend_orientation='h',
)
fig.show()