In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div class="alert alert-block alert-info" style="margin-top: 20px">
<ol>
<li><a href="#i">Introduction</a></li>
<li><a href="#i">Problem Statement</a></li>
<li><a href="#i">Imports</a></li>
<li><a href="#i">Data Exploration</a></li>
<li><a href="#i">Vaccine Types Used Per Country</a></li>
<li><a href="#i">Countries in Which More People Were Vaccinated</a></li>
<li><a href="#i">Daily Vaccinations vs Population</a></li>
<li><a href="#i">Conclusion</a></li>
</ol>
</div>

## 1. Introduction 

Context
Data is collected daily from Our World in Data GitHub repository for covid-19, merged and uploaded.

Content
The data contains the following information:

- **Country** - this is the country for which the vaccination information is provided;
- **Country ISO Code** - ISO code for the country;
- **Date** - date for the data entry; for some of the dates we have only the daily vaccinations, for others, only the (cumulative) total;
- **Total number of vaccinations** - this is the absolute number of total immunizations in the country;
- **Total number of people vaccinated** - a person, depending on the immunization scheme, will receive one or more (typically 2) vaccines; at a certain moment, the number of vaccinations might be larger than the number of people;
- **Total number of people fully vaccinated** - this is the number of people that received the entire set of immunizations according to the immunization scheme (typically 2); at a certain moment in time, there might be a certain number of people that received one vaccine and another (smaller) number of people that received all vaccines in the scheme;
- **Daily vaccinations (raw)** - for a certain data entry, the number of vaccinations for that date/country;
- **Daily vaccinations** - for a certain data entry, the number of vaccinations for that date/country;
- **Total vaccinations per hundred** - ratio (in percent) between vaccination number and total population up to the date in the country;
- **Total number of people vaccinated per hundred** - ratio (in percent) between population immunized and total population up to the date in the country;
- **Total number of people fully vaccinated per hundred** - ratio (in percent) between population fully immunized and total population up to the date in the country;
- **Number of vaccinations per day** - number of daily vaccinations for that day and country;
- **Daily vaccinations per million** - ratio (in ppm) between vaccination number and total population for the current date in the country;
- **Vaccines used in the country** - total number of vaccines used in the country (up to date);
- **Source name** - source of the information (national authority, international organization, local organization etc.);
- **Source website** - website of the source of information;

## 2. Problem Statement

Track COVID-19 vaccination in the World, answer instantly to your questions:

Which country is using what vaccine?

In which country is the vaccination programme more advanced?

Where are more people vaccinated per day, and where is that number highest as a percentage of the entire population?

## 3. Imports 


In [None]:
import os
# to interact with the operating system 

import numpy as np

import pandas as pd
# data structure tool for data manipulation and analysis

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from matplotlib.pyplot import figure
# for data visualization

import warnings
warnings.filterwarnings('ignore')

## 4. Data Exploration 

In [None]:
# Read the country-level vaccination records into the notebook's main frame.
csv_path = "../input/covid-world-vaccination-progress/country_vaccinations.csv"
df = pd.read_csv(csv_path)

The cells below preview the loaded records and summarise their columns.

In [None]:
# Preview the first five records to get a feel for the columns.
df.head(n=5)

In [None]:
# Data description: print the frame summary produced by DataFrame.info()
# (column names, dtypes and non-null counts).
df.info()

In [None]:
# Find the period covered by the records. The `date` column is first
# converted to datetime values so that min/max compare dates.
df['date'] = pd.to_datetime(df['date'])

# Earliest and latest entry dates in the dataset.
first_day = df['date'].min()
last_day = df['date'].max()
print("The dataset contains vaccination information since :", first_day, "to :", last_day)

In [None]:
# Count the distinct countries and the distinct ISO codes present in the dataset.
for column, description in (
    ('country', "different countries in the dataset"),
    ('iso_code', "different iso_code in the dataset"),
):
    print("There are ", df[column].nunique(), description)

In [None]:
# Reader note: prints the reason `country` (rather than `iso_code`) is used as the grouping key below.
print ("since we have in the datasets countries with missing iso_code, country column will be used for data classification and grouping") 

## 5. Vaccine Types Used Per Country


In [None]:
# Narrow the data down to the country and the vaccine scheme it reports.
vaccine_type_columns = ["country", "vaccines"]
df_vaccine_type = df[vaccine_type_columns]

# Report how many entries are missing in each of the two columns.
for column, description in (
    ('country', "missing country values"),
    ('vaccines', "missing vaccines values"),
):
    print(" The dataset contains: ", df_vaccine_type[column].isnull().sum(), description)

In [None]:
# Display how many records mention each vaccine scheme.
vaccine_counts = df_vaccine_type["vaccines"].value_counts()
vaccine_counts.to_frame()

### Vaccine Types Used for each Country

In [None]:
# Show complete frames in the output instead of a truncated preview.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# One row per (country, vaccine scheme) pair found in the data.
pair_groups = df_vaccine_type.groupby(['country', 'vaccines'])
df_group = pair_groups.count().reset_index()
df_group

#### It can be observed that some countries tended to use more than one vaccine type, so we will split the countries list into two lists.

 ### 5.1 Countries in Which Multi-Vaccine Types were Used 

In [None]:
# A comma in the `vaccines` text means the country lists more than one vaccine.
lists_several_vaccines = df_group.vaccines.str.contains(",")
Multi_vaccine_country = df_group.loc[lists_several_vaccines].reset_index(drop=True)
Multi_vaccine_country

 ### 5.2 Countries in Which Single-Vaccine Type was Used 

In [None]:
# Keep the rows whose `vaccines` text has no comma, i.e. a single vaccine is listed.
lists_several_vaccines = df_group.vaccines.str.contains(",")
Single_vaccine_country = df_group.loc[~lists_several_vaccines].reset_index(drop=True)
Single_vaccine_country

#### Let's explore the most used vaccines

In [None]:
# Keep only the cumulative vaccination total and the vaccine scheme of each record.
# .copy() yields an independent frame, so cleaning it later does not operate on a
# slice of `df`.
top_used_vaccines = df[["total_vaccinations", "vaccines"]].copy()

# How many records have no total_vaccinations value?
# (The message previously said "country" although it counts total_vaccinations.)
print(" The dataset contains: ", top_used_vaccines['total_vaccinations'].isnull().sum(), "missing total_vaccinations values")

In [None]:
# `total_vaccinations` is a cumulative running total per country (see the column
# descriptions in the introduction). Adding up every dated row would therefore
# count the same doses again on each reporting day. Take each country's highest
# reported total instead, then add those per vaccine scheme.
# Assumes a country reports the same `vaccines` text on all of its rows - TODO confirm.
# NOTE(review): the figures quoted in the notebook's conclusion were produced by
# the earlier row-wise sum and should be re-checked after re-running this cell.
latest_totals = (
    df.dropna(subset=['total_vaccinations', 'vaccines'])
    .groupby(['country', 'vaccines'], as_index=False)['total_vaccinations']
    .max()
)

# Total doses per vaccine scheme across countries.
top_used_vaccines = latest_totals.groupby(['vaccines'])['total_vaccinations'].sum().reset_index()

# Order the schemes from most to least used and index them by scheme name.
top_used_vaccines_sorted = (
    top_used_vaccines
    .sort_values(by='total_vaccinations', ascending=False)
    .set_index('vaccines')
)

# Share of each scheme in the overall number of vaccinations, in percent.
top_used_vaccines_sorted['percent'] = (
    top_used_vaccines_sorted['total_vaccinations']
    / top_used_vaccines_sorted['total_vaccinations'].sum()
) * 100

# The five most used vaccine schemes.
top_vaccines = top_used_vaccines_sorted.nlargest(5, columns=['total_vaccinations']).reset_index()
top_vaccines

In [None]:

# Set the figure size before plotting: a size set after the bar plot is drawn
# does not apply to that figure. sns.set() also resets seaborn's style and
# context, so it has to come before set_style/set_context.
sns.set(rc={'figure.figsize': (8, 8)})
sns.set_style("ticks")
sns.set_context("poster")

# Share (in percent) of the five most used vaccine schemes.
sns.barplot(x='vaccines', y='percent', data=top_vaccines)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15, rotation=90)
plt.xlabel('Vaccines', fontsize=15)
plt.ylabel('Percent', fontsize=15)
# Trailing semicolon keeps the Text repr out of the cell output.
plt.title('Top Used Vaccine Types', fontsize=20);

## 6. Countries in Which More People Were Vaccinated  

In [None]:
# People vaccinated, people fully vaccinated and total vaccinations are
# analysed separately below; keep just those counts next to the country.
count_columns = ["country", "people_vaccinated", "people_fully_vaccinated", "total_vaccinations"]
df_fltrd = df[count_columns]

# Summary of the selected columns.
df_fltrd.info()

In [None]:
# Report the number of missing entries in each of the three count columns.
for column, description in (
    ("people_vaccinated", "missing values for people_vaccinated"),
    ("people_fully_vaccinated", "missing values for people_fully_vaccinated"),
    ("total_vaccinations", "missing values for total vaccinations"),
):
    print(" The dataset contains: ", df_fltrd[column].isnull().sum(), description)


 ### 6.1 Top Countries in Which People Were Partially Vaccinated

In [None]:
# Drop every record that lacks any of the three counts. The name is rebound
# (rather than mutating a slice of `df` in place) so the later cells of this
# section keep working on the cleaned frame.
df_fltrd = df_fltrd.dropna(axis=0).reset_index(drop=True)

# The counts are cumulative running totals (see the column descriptions in the
# introduction), so adding up the dated rows would count the same people again
# on each reporting day. Use each country's highest reported value instead.
df_1 = df_fltrd.groupby(['country'])['people_vaccinated'].max().reset_index()

# Order the countries by number of people vaccinated, highest first.
top_country_sorted = (
    df_1.sort_values(by='people_vaccinated', ascending=False)
    .reset_index(drop=True)
)

# Five countries with the most people vaccinated.
top_five_countries = top_country_sorted.nlargest(5, columns=['people_vaccinated']).reset_index(drop=True)
top_five_countries

In [None]:
# Set the figure size before plotting: a size set after the bar plot is drawn
# does not apply to that figure. sns.set() also resets seaborn's style and
# context, so it has to come before set_style/set_context.
sns.set(rc={'figure.figsize': (8, 8)})
sns.set_style("ticks")
sns.set_context("poster")

# Five countries with the most people vaccinated.
sns.barplot(x='country', y='people_vaccinated', data=top_five_countries)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15, rotation=90)
plt.xlabel('country', fontsize=15)
plt.ylabel('Number of Partially Vaccinated People', fontsize=15)
# Trailing semicolon keeps the Text repr out of the cell output.
plt.title('Top Countries by Partially Vaccinated People', fontsize=20);


 ### 6.2 Top Countries in Which People Were Fully Vaccinated

In [None]:
# `people_fully_vaccinated` is a cumulative running total (see the column
# descriptions in the introduction), so adding up the dated rows would count the
# same people again on each reporting day. Use each country's highest reported
# value instead.
df_2 = df_fltrd.groupby(['country'])['people_fully_vaccinated'].max().reset_index()

# Order the countries by number of fully vaccinated people, highest first.
top_fully_country_sorted = (
    df_2.sort_values(by='people_fully_vaccinated', ascending=False)
    .reset_index(drop=True)
)

# Five countries with the most fully vaccinated people.
top_five_fully_countries = top_fully_country_sorted.nlargest(5, columns=['people_fully_vaccinated']).reset_index(drop=True)
top_five_fully_countries

In [None]:
# Set the figure size before plotting: a size set after the bar plot is drawn
# does not apply to that figure. sns.set() also resets seaborn's style and
# context, so it has to come before set_style/set_context.
sns.set(rc={'figure.figsize': (8, 8)})
sns.set_style("ticks")
sns.set_context("poster")

# Five countries with the most fully vaccinated people.
sns.barplot(x='country', y='people_fully_vaccinated', data=top_five_fully_countries)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15, rotation=90)
plt.xlabel('country', fontsize=15)
plt.ylabel('Number of Fully Vaccinated People', fontsize=15)
# Trailing semicolon keeps the Text repr out of the cell output.
plt.title('Top Countries by Fully Vaccinated People', fontsize=20);

In [None]:
# `total_vaccinations` is a cumulative running total (see the column
# descriptions in the introduction), so adding up the dated rows would count the
# same doses again on each reporting day. Use each country's highest reported
# value instead.
df_1 = df_fltrd.groupby(['country'])['total_vaccinations'].max().reset_index()

# Order the countries by total vaccinations, highest first.
top_total_country_sorted = (
    df_1.sort_values(by='total_vaccinations', ascending=False)
    .reset_index(drop=True)
)

# Five countries with the highest total vaccinations.
top_five_total_countries = top_total_country_sorted.nlargest(5, columns=['total_vaccinations']).reset_index(drop=True)
top_five_total_countries

 ### 6.3 Top Countries with High Total Vaccinations

In [None]:
# Set the figure size before plotting: a size set after the bar plot is drawn
# does not apply to that figure. sns.set() also resets seaborn's style and
# context, so it has to come before set_style/set_context.
sns.set(rc={'figure.figsize': (8, 8)})
sns.set_style("ticks")
sns.set_context("poster")

# Five countries with the highest total vaccinations.
sns.barplot(x='country', y='total_vaccinations', data=top_five_total_countries)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15, rotation=90)
plt.xlabel('country', fontsize=15)
plt.ylabel('Total Vaccinations', fontsize=15)
# Trailing semicolon keeps the Text repr out of the cell output.
plt.title('Top Countries by Total Vaccinations', fontsize=20);

## 7. Daily Vaccinations vs Population

In [None]:
# Keep the country next to its daily vaccinations per million inhabitants.
daily_rate_columns = ["country", "daily_vaccinations_per_million"]
daily_vaccin = df[daily_rate_columns]

# Report how many records have no daily per-million value.
missing_daily_rates = daily_vaccin["daily_vaccinations_per_million"].isnull().sum()
print(" The dataset contains: ", missing_daily_rates, "missing values")

In [None]:
# Drop the records without a daily per-million value and renumber the rows.
# The name is rebound instead of mutating in place: `daily_vaccin` was selected
# from `df`, and in-place edits of such a selection raise SettingWithCopyWarning
# (hidden here by the warnings filter).
daily_vaccin = daily_vaccin.dropna(axis=0).reset_index(drop=True)

In [None]:
# Add up each country's daily per-million values over all recorded dates.
daily_vaccin = daily_vaccin.groupby(['country'])['daily_vaccinations_per_million'].sum().reset_index()

# Order the countries from the highest summed value to the lowest, indexed by country.
daily_vaccin_sorted = (
    daily_vaccin
    .sort_values(by='daily_vaccinations_per_million', ascending=False)
    .set_index('country')
)

# Report the leading country by name and value instead of dumping a Series repr.
if daily_vaccin_sorted.empty:
    print("No daily_vaccinations_per_million values are available")
else:
    top_country = daily_vaccin_sorted.index[0]
    top_rate = daily_vaccin_sorted['daily_vaccinations_per_million'].iloc[0]
    print("Country that has vaccinated a larger percent from its population is  :", top_country, "with", top_rate, "vaccinations per million")

# Show the leading row as the cell output (a mid-cell expression is not displayed).
daily_vaccin_sorted.head(1)

## 8. Conclusion 

* The dataset contains COVID-19 world vaccination progress from 2020-12-13 to 2021-02-01 for **67 countries**. In terms of vaccines used, **40 countries** used a single vaccine type, while **27 countries used two or more vaccines**. **Moderna and Pfizer/BioNTech** together were the most used vaccines, with **558100601** recorded vaccinations, accounting for nearly **49%** of all vaccinations given.

* The data analysis shows that the **United States** is the country where the most people were vaccinated, regardless of the stage of vaccination (partially or fully vaccinated), while Israel was the country in which the most people were vaccinated as a percentage of the entire population.