# COVID-19 World Vaccination Progress


**Updated to 25/04/2020**

## 1. Modules to Use

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Import Datasets

In [None]:
#Relative path to the country-level vaccination CSV file

path1 = '../input/covid-world-vaccination-progress/country_vaccinations.csv'

In [None]:
#Load the CSV at path1 into the DataFrame used by the rest of the analysis

datos = pd.read_csv(path1)

## 3. General Analysis

In [None]:
#Number of distinct values in the 'country' column

datos['country'].unique().size

There are 80 different countries.

In [None]:
#Show a random sample of 20 rows

datos.sample(20)

In [None]:
#Shape of the DataFrame as (rows, columns)

datos.shape

There are more records in total than countries, which means that some countries have more than one row.

In [None]:
#Column names as a plain Python list

datos.columns.tolist()

In [None]:
#Number of distinct values in each column (nunique skips NaN by default)

datos.nunique()

There is a difference between the number of countries (80) and the number of iso_code values (75).

In [None]:
#Rows whose iso_code is missing, tallied by country name

datos.loc[datos['iso_code'].isna(), 'country'].value_counts()

Not having the iso_code is a problem if I want to join with the geopandas base, therefore I am going to fill in the NaNs in this column

In [None]:
#First complete the iso_code for Northern Cyprus.
#All matching rows are assigned in one vectorized .loc call instead of
#writing them one at a time in a Python loop.

indices_northern_cyprus = list(datos.index[datos['country'] == 'Northern Cyprus']) #index labels of the Northern Cyprus rows

datos.loc[indices_northern_cyprus, 'iso_code'] = 'CYP'

The rest of the NaN values correspond to iso_code 'GBR'. We can fix this with simple imputation.

In [None]:
#Fill every iso_code that is still missing with 'GBR'.
#NOTE(review): this is a blanket fill -- it assumes every remaining NaN row
#belongs to the United Kingdom; confirm against the per-country null counts.
datos['iso_code'] = datos['iso_code'].fillna('GBR')

In [None]:
#Repeat the missing-iso_code tally per country; it should now be empty

datos.loc[datos['iso_code'].isna(), 'country'].value_counts()

With the NaN values in iso_code fixed, I now look at the NaN values in the rest of the columns.

In [None]:
#Number of missing values in each column
datos.isna().sum()

### 3.1 Total vaccinations per country


We analyze the column 'total_vaccinations': This is the absolute number of total immunizations in the country

In [None]:
#Top five countries by total vaccinations.
#The column is selected before aggregating, so only 'total_vaccinations' is
#reduced instead of taking the max of every column and discarding the rest.
#Each country's maximum is used as its total; countries whose value is NaN
#sort last.

total_vaccinations_contry = (
    datos.groupby("country")["total_vaccinations"]
    .max()
    .sort_values(ascending=False)
)
total_vaccinations_contry.head()

In [None]:
#Horizontal bar chart of the 20 countries with the highest total vaccinations,
#expressed in millions

fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(
    x=total_vaccinations_contry.head(20).to_numpy() / 1e6,
    y=total_vaccinations_contry.head(20).index,
    color='coral',
    ax=ax,
)
ax.set_xlabel("Total vaccinations [M]", fontsize=16)
ax.set_ylabel("Country", fontsize=16)
ax.set_title("Absolute number of total immunizations in the country - 20th", fontsize=20)

plt.show()

### 3.2 Total vaccinations in Argentina - Evolution

In [None]:
#Date and total vaccinations for Argentina, keeping only rows where both are present
total_vaccinations_argentina = (
    datos.loc[datos['country'] == 'Argentina', ['date', 'total_vaccinations']]
    .dropna()
)

In [None]:
#Line plot of Argentina's total vaccinations over time, in thousands

x = total_vaccinations_argentina['date']
y = total_vaccinations_argentina['total_vaccinations']

fig, ax = plt.subplots(figsize=(16, 10))
ax.plot(x, y / 1000, color='r', linestyle='--', marker='o')

#One tick per recorded date, rotated so the labels stay readable
ax.set_xticks(x)
ax.set_xticklabels(x, rotation=70, fontsize=8)

ax.set_xlabel("Day", fontsize=16)
ax.set_ylabel("Total vaccinations [K]", fontsize=16)
ax.set_title("Absolute number of total immunizations in Argentina", fontsize=20)

plt.show()

### 3.3 People vaccinated and people fully vaccinated per country


We analyze the columns: 

    Total number of vaccinations - this is the absolute number of total immunizations in the country
    
   
    Total number of people vaccinated - a person, depending on the immunization scheme, will receive one or more (typically 2) vaccines; at a certain moment, the number of vaccination might be larger than the number of people;

In [None]:
#Top five countries by people vaccinated.
#Only the two columns of interest are aggregated (per-country maximum)
#instead of taking the max of every column and discarding the rest.
#Rows are ranked by 'people_vaccinated', highest first.

people_vaccinated_contry = (
    datos.groupby("country")[['people_vaccinated', 'people_fully_vaccinated']]
    .max()
    .sort_values(by="people_vaccinated", ascending=False)
)
people_vaccinated_contry.head()

In [None]:
#Bar chart of people vaccinated / fully vaccinated for the 20 countries with
#the most people vaccinated.
#Labels and heights are all taken from people_vaccinated_contry so that each
#bar is labelled with the country its height belongs to. Taking the labels
#from a table ranked by a different column (total vaccinations) can pair the
#heights with the wrong country names.
labels = people_vaccinated_contry.head(20).index
people_vaccinated = people_vaccinated_contry['people_vaccinated'].head(20)
people_fully_vaccinated = people_vaccinated_contry['people_fully_vaccinated'].head(20)

x = np.arange(len(labels))  # tick positions, one per country


width = 0.35       # the width of the bars: can also be len(x) sequence

fig, ax = plt.subplots(figsize=(20,12))

#Both series are drawn at the same positions, so the second one is painted
#over the first
ax.bar(labels, people_vaccinated/1e6 , width, label='People Vaccinated')
ax.bar(labels, people_fully_vaccinated/1e6, width,label='People Fully Vaccinated')

ax.set_ylabel('Total [M]',fontsize=18)
ax.set_xlabel('Country',fontsize=18)
ax.set_title("People vaccinates and fully vaccinated - 20th",fontsize=22)
ax.set_xticks(x)
ax.set_xticklabels(labels,fontsize=14,rotation = 70)

ax.legend()

plt.show()

### 3.4 People vaccinated and people fully vaccinated in Argentina - Evolution

In [None]:
#Date, people vaccinated and people fully vaccinated for Argentina;
#rows missing any of the three values are discarded
people_vaccinated_argentina = (
    datos.loc[
        datos['country'] == 'Argentina',
        ['date', 'people_vaccinated', 'people_fully_vaccinated'],
    ]
    .dropna()
)

In [None]:
#Argentina over time: people vaccinated vs. people fully vaccinated, in thousands

x = people_vaccinated_argentina['date']
y = people_vaccinated_argentina['people_vaccinated']
z = people_vaccinated_argentina['people_fully_vaccinated']

fig, ax = plt.subplots(figsize=(16, 10))
ax.plot(x, y / 1000, label='People Vaccinated', color='r', linestyle='--', marker='o')
ax.plot(x, z / 1000, label='People Fully Vaccinated', color='b', linestyle='--', marker='o')

#One tick per recorded date, rotated so the labels stay readable
ax.set_xticks(x)
ax.set_xticklabels(x, rotation=70, fontsize=8)

ax.set_xlabel("Day", fontsize=16)
ax.set_ylabel("Total vaccinations [K]", fontsize=16)
ax.set_title('People vaccinates and fully vaccinated - Argentina', fontsize=20)

ax.legend()

plt.show()

### 3.5 Vaccinations per hundred per country


We analyze the columns: 

    Total vaccinations per hundred - ratio (in percent) between vaccination number and total population up to the date in the country;
    
    Total number of people vaccinated per hundred - ratio (in percent) between population immunized and total population up to the date in the country;
    
    Total number of people fully vaccinated per hundred - ratio (in percent) between population fully immunized and total population up to the date in the country;

In [None]:
#Top five countries by total vaccinations per hundred.
#Only the three per-hundred columns are aggregated (per-country maximum)
#instead of taking the max of every column and discarding the rest.
#Rows are ranked by 'total_vaccinations_per_hundred', highest first.

vaccinated_per_hundred = (
    datos.groupby("country")[
        [
            'total_vaccinations_per_hundred',
            'people_vaccinated_per_hundred',
            'people_fully_vaccinated_per_hundred',
        ]
    ]
    .max()
    .sort_values(by="total_vaccinations_per_hundred", ascending=False)
)
vaccinated_per_hundred.head()

In [None]:
#Overlaid bars for the first 20 rows of vaccinated_per_hundred
labels = vaccinated_per_hundred.index[:20]
total_vaccinations_per_hundred = vaccinated_per_hundred['total_vaccinations_per_hundred'].iloc[:20]
people_vaccinated_per_hundred = vaccinated_per_hundred['people_vaccinated_per_hundred'].iloc[:20]
people_fully_vaccinated_per_hundred = vaccinated_per_hundred['people_fully_vaccinated_per_hundred'].iloc[:20]

x = np.arange(labels.size)  # tick positions, one per country

width = 0.35       # the width of the bars: can also be len(x) sequence

fig, ax = plt.subplots(figsize=(20, 12))

#The three series share the same positions, so each one is painted over
#the previous one in the order given here
for heights, legend_text in (
    (total_vaccinations_per_hundred, 'Total vaccinations per hundred'),
    (people_vaccinated_per_hundred, 'People vaccinated per hundred'),
    (people_fully_vaccinated_per_hundred, 'People fully vaccinated per hundred'),
):
    ax.bar(labels, heights, width, label=legend_text)

ax.set_title("Vaccinations per hundred per country - 20th", fontsize=22)
ax.set_xlabel('Country', fontsize=18)
ax.set_ylabel('n / hundread', fontsize=18)
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=70, fontsize=14)

ax.legend()

plt.show()

### 3.6 Vaccinations per hundred in Argentina - Evolution

In [None]:
#Date plus the three per-hundred columns for Argentina;
#rows missing any of those values are discarded
vaccinated_per_hundread_argentina = (
    datos.loc[
        datos['country'] == 'Argentina',
        [
            'date',
            'total_vaccinations_per_hundred',
            'people_vaccinated_per_hundred',
            'people_fully_vaccinated_per_hundred',
        ],
    ]
    .dropna()
)
vaccinated_per_hundread_argentina

In [None]:
#Argentina over time: the three per-hundred series on one set of axes

a = vaccinated_per_hundread_argentina['date']
b = vaccinated_per_hundread_argentina['total_vaccinations_per_hundred']
c = vaccinated_per_hundread_argentina['people_vaccinated_per_hundred']
d = vaccinated_per_hundread_argentina['people_fully_vaccinated_per_hundred']

fig, ax = plt.subplots(figsize=(16, 10))
for series, colour, legend_text in (
    (b, 'r', 'Total vaccinations per hundred'),
    (c, 'b', 'People vaccinated per hundred'),
    (d, 'g', 'People fully vaccinated per hundred'),
):
    ax.plot(a, series, label=legend_text, color=colour, linestyle='--', marker='o')

#One tick per recorded date, rotated so the labels stay readable
ax.set_xticks(a)
ax.set_xticklabels(a, rotation=70, fontsize=8)

ax.set_xlabel("Day", fontsize=16)
ax.set_ylabel("n / hundread", fontsize=16)
ax.set_title('Vaccinations per hundred - Argentina', fontsize=20)

ax.legend()

plt.show()

### 3.7 Daily vaccinations per million


We analyze the column 'daily_vaccinations_per_million': Ratio (in ppm) between vaccination number and total population for the current date in the country

In [None]:
#Top five countries by their highest daily vaccinations per million.
#The column is selected before aggregating, so only
#'daily_vaccinations_per_million' is reduced instead of taking the max of
#every column and discarding the rest.

daily_vaccinations_per_million = (
    datos.groupby("country")["daily_vaccinations_per_million"]
    .max()
    .sort_values(ascending=False)
)
daily_vaccinations_per_million.head()

In [None]:
#Horizontal bar chart of the 20 countries with the highest daily vaccinations per million

fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(
    x=daily_vaccinations_per_million.head(20).to_numpy(),
    y=daily_vaccinations_per_million.head(20).index,
    color='coral',
    ax=ax,
)
ax.set_xlabel('Ratio', fontsize=16)
ax.set_ylabel("Country", fontsize=16)
ax.set_title("Daily vaccinations per million - 20th", fontsize=20)

plt.show()

### 3.8 Daily vaccinations per million in Argentina - Evolution

In [None]:
#Date and daily vaccinations per million for Argentina.
#Rows without a value are dropped, the same way the other Argentina series
#in this notebook are prepared; assigning the frame to itself had no effect
#and left those rows in.
daily_vaccinations_per_million_argentina = datos.loc[
    datos['country'] == 'Argentina',
    ['date', 'daily_vaccinations_per_million'],
]
daily_vaccinations_per_million_argentina = daily_vaccinations_per_million_argentina.dropna()
daily_vaccinations_per_million_argentina.head()

In [None]:
#Line plot of Argentina's daily vaccinations per million over time

x = daily_vaccinations_per_million_argentina['date']
y = daily_vaccinations_per_million_argentina['daily_vaccinations_per_million']

fig, ax = plt.subplots(figsize=(16, 10))
ax.plot(x, y, color='r', linestyle='--', marker='o')

#One tick per recorded date, rotated so the labels stay readable
ax.set_xticks(x)
ax.set_xticklabels(x, rotation=70, fontsize=8)

ax.set_xlabel("Day", fontsize=16)
ax.set_ylabel("Ratio", fontsize=16)
ax.set_title("Daily vaccinations per million in Argentina", fontsize=20)

plt.show()

## 4. Vaccines

We analyze the column 'vaccines': total number of vaccines used in the country (up to date);

In [None]:
#Vaccines used in every country.
#Only the 'vaccines' column is aggregated (per-country maximum of the text
#value) instead of taking the max of every column and discarding the rest.
#The result is ordered by that text, descending.

vaccines = (
    datos.groupby("country")["vaccines"]
    .max()
    .sort_values(ascending=False)
)
vaccines

As can be seen, the names of the vaccines in each record are separated by a comma and they are all in the same column.

### 4.1 Number of vaccines per country

In [None]:
#Number of comma-separated entries in each country's vaccine string,
#followed by the ten largest counts

count_vaccines = vaccines.str.split(',').map(len)
count_vaccines.sort_values(ascending=False).head(10)

### 4.2 List of vaccines

Because the names of the vaccines for a given row are all stored in the same column, I need to perform some cleaning operations to obtain the list of available vaccines.

In [None]:
#Remove white space in text

def remove_white_space(texto: str) -> str:
    """Return *texto* with its whitespace normalised.

    Leading and trailing whitespace is removed and every run of whitespace
    inside the text is collapsed to a single space.

    Args:
        texto: The text to clean.

    Returns:
        The cleaned text; an empty or all-whitespace input gives ``""``.
    """
    # split() with no argument cuts on runs of whitespace and discards the
    # empty pieces at both ends, so re-joining with one space both trims and
    # collapses -- no regex (or per-call import) needed.
    return " ".join(texto.split())

In [None]:
#Split each country's comma-separated vaccine string into one column per
#position. Rows with fewer vaccines than the widest row are padded with
#missing values.
df_vaccines = pd.DataFrame(list(vaccines.str.split(',')))

#Collect the distinct raw names across all columns, in first-seen order.
#The same vaccine can still appear more than once here because the pieces
#keep any blank that followed a comma. pd.isna() skips the padding whether
#it is stored as None or as NaN.
vaccine_list = []
for column in df_vaccines.columns:
    for raw_name in df_vaccines[column].unique():
        if pd.isna(raw_name) or raw_name in vaccine_list:
            continue
        vaccine_list.append(raw_name)

#Normalise the whitespace and drop the duplicates this exposes;
#dict.fromkeys keeps the first-seen order.
vaccine_list_final = list(
    dict.fromkeys(remove_white_space(raw_name) for raw_name in vaccine_list)
)

vaccine_list_final

As can be seen, 8 types of vaccines are available

### 4.3 Full dataset of vaccines


I am going to create a dataset that shows which vaccines each country uses. Each vaccine will be located in its own column.

I create a dataframe in which every element is NaN. I will fill it in later.

In [None]:
#All-NaN table: one row per country, one column per cleaned vaccine name
df_full_vaccines = pd.DataFrame(
    data=np.nan,
    columns=vaccine_list_final,
    index=vaccines.index,
)
df_full_vaccines

Previously I created a dataframe in which each column holds one of the vaccines that each country has, but the vaccines within a column are not unique.

In [None]:
#Same split-vaccine table, re-indexed with the index of `vaccines` (the
#country names) in place of its default positional index
df_vaccines_parcial = df_vaccines.set_index(vaccines.index)
df_vaccines_parcial

Taking the previous dataframe I am going to concatenate all the columns into one

In [None]:
#Stack the vaccine columns one after another into a single Series; each
#country label therefore appears once per column
pd_list = [df_vaccines_parcial[column] for column in df_vaccines_parcial.columns]

new_df = pd.concat(pd_list)

From the previous dataframe (concatenated) I am filtering by type of vaccine and placing in the corresponding column. I get a dataframe with 1 vaccine per column

In [None]:
#Fill one column per vaccine with the countries that use it.
#The stacked names still carry any blank that followed a comma in the raw
#text, while the column names were cleaned with remove_white_space. The
#names are therefore normalised with the same helper before comparing;
#otherwise entries with a leading blank never match and the vaccine is
#undercounted.
normalised_names = new_df.dropna().map(remove_white_space)

for i in vaccine_list_final:
    users = normalised_names[normalised_names == i]
    #A country listing the same vaccine twice would repeat its index label,
    #which the index-aligned column assignment cannot handle
    df_full_vaccines[i] = users[~users.index.duplicated()]

df_full_vaccines

### 4.4 How many countries use a certain vaccine

In [None]:
#Non-null entries per vaccine column (i.e. countries using it), largest first
vaccines_country = df_full_vaccines.notna().sum().sort_values(ascending=False)
vaccines_country

In [None]:
#Horizontal bar chart: number of countries using each vaccine

fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(
    x=vaccines_country.to_numpy(),
    y=vaccines_country.index,
    color='lightblue',
    ax=ax,
)
ax.set_xlabel("n", fontsize=16)
ax.set_ylabel("Vaccine", fontsize=16)
ax.set_title("How many countries use a certain vaccine", fontsize=20)

plt.show()