In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Load important libraries
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.patches as mpatches
%pip install bs4
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

In [None]:
data=pd.read_csv('../input/covid-world-vaccination-progress/country_vaccinations.csv')

In [None]:
data.head(100)

In [None]:
data.info()

In [None]:
data.shape

In [None]:
data.isnull().sum()

In [None]:
data.describe(include='all')

In [None]:
#Dropping source name and source website as they are not needed to answer the questions.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this cell be
# re-run without a KeyError once the columns are already gone.
data=data.drop(columns=['source_name','source_website'],errors='ignore')

In [None]:
data.head(10)

In [None]:
#Showing Country name without duplicates.
data['country'].unique()

Removing England, Wales, Northern Ireland and Scotland from the country column because they are all part of the United Kingdom (UK), which already has its own rows.

In [None]:
# Keep every row whose country is not one of the four UK constituent nations.
uk_nations=['England','Wales','Northern Ireland','Scotland']
data=data[~data['country'].isin(uk_nations)]

In [None]:
#Making a list to store the total occurrence of each country in our dataset.
# value_counts() is evaluated once and covers every country present, so the list
# always has the same length and order as data['country'].value_counts().index.
l=data['country'].value_counts().tolist()

**Making a pie chart in order to show the total occurrence of each unique country in our dataset.**

In [None]:
#United Kingdom has the highest occurrence (69) in our dataset and Saint Helena and Greenland have the lowest occurrence (1).
# Names and values come from the same value_counts() result, so the slices cannot
# get out of step with their labels.
country_counts=data['country'].value_counts()
fig = px.pie(names=country_counts.index, values=country_counts.values)
fig.update_traces(textposition='inside', textfont_size=14)
fig.show()

# Which country is using what vaccine?

In [None]:
#Each unique vaccine combination with the number of rows that use it.
# The original referenced `.unique` without calling it, which only shows the bound
# method; value_counts() itself is the table the comment describes.
data['vaccines'].value_counts()

The use of Moderna, Oxford/AstraZeneca, Pfizer/BioNTech is maximum (1013) and Johnson&Johnson is minimum (4).


In [None]:
# Horizontal count plot of the vaccine combinations, most frequent first.
vaccine_counts=data['vaccines'].value_counts()
plt.figure(figsize=(12,14))
sns.countplot(y=data['vaccines'],order=vaccine_counts.index)
plt.show()

In [None]:
#Zero null values are present in vaccines.
data['vaccines'].isnull().sum()

In [None]:
df1=data[['country','iso_code','vaccines']]

In [None]:
df1

In [None]:
# Map showing the use of different vaccines by each country.
# df1 holds one row per country per reporting day; de-duplicating keeps one row per
# (country, iso_code, vaccines) combination so the map is not fed repeated rows.
vaccines_by_country=df1.drop_duplicates()
# 'vaccines' is a text column, so it is coloured as discrete categories; the
# continuous colour scale previously passed here is therefore omitted.
fig = px.choropleth(vaccines_by_country, locations='iso_code',color='vaccines',
                    hover_name="country", # column to add to hover information
                    width=1500,height=700)
fig.show()

# Which country has vaccinated more people?

In [None]:
data.head(10)

In [None]:
#Total null values in all columns
data.isnull().sum()

In [None]:
df2=data[['country','daily_vaccinations']]

In [None]:
#Grouping the data on country column by sum of daily vaccinations in each country. 
df2a=df2.groupby('country').sum().sort_values('daily_vaccinations',ascending=False)

In [None]:
df2a=df2a.reset_index()

In [None]:
df2a

Top 10 countries with total of daily vaccinations

In [None]:
#United states has the highest number of vaccinations given so far.
fig = px.bar(df2a.head(10), y="daily_vaccinations",color='daily_vaccinations', x="country",title='Country vs Total daily vaccinations')
fig.show()

In [None]:
fig = go.Figure(data=go.Choropleth(
    locationmode = "country names",
    locations = df2a['country'],
    z = df2a['daily_vaccinations'],
    text = df2a['daily_vaccinations'],
   # colorscale=colors,
    autocolorscale=True,
    reversescale=False,
    colorbar_title = 'Total of daily vaccinations',
))

fig.update_layout(
    title_text='Total of daily vaccinations by each country',
    geo=dict(
        showcoastlines=True,
    ),
)

# Which country has vaccinated a larger percent from its population?

I used web scraping to obtain the population of each country in our dataset, in order to calculate the ratio of vaccinations to population.

In [None]:
# Download the population-by-country page. `URL` holds the HTTP response object.
# A timeout stops the notebook from hanging indefinitely if the site does not answer.
URL=requests.get('https://www.worldometers.info/world-population/population-by-country/',timeout=30)
# Stop here with a clear HTTP error instead of failing later while parsing an error page.
URL.raise_for_status()

In [None]:
URL.status_code

In [None]:
content=BeautifulSoup(URL.text,'html.parser')

In [None]:
table=content.find('table',class_="table table-striped table-bordered")


In [None]:
#Scraping the id, country and population columns from the website's table.
# The result lists are created once, before the loop: creating them inside the loop
# discarded the rows of every <tbody> but the last and left them undefined when the
# table had no <tbody> at all.
l1=[]  # text of the first <td> of each row
l2=[]  # text of the first <a> of each row (used as the country name)
l3=[]  # text of the bold <td> of each row (used as the population)
for i in table.find_all('tbody'):
    rows=i.find_all('tr')
    for j in rows:
        l1.append(j.find('td').text)
        l2.append(j.find('a').text)
        l3.append(j.find('td',style="font-weight: bold;").text)

In [None]:
#Making a new dataframe which will contain the scraped data.
population_df=pd.DataFrame({'Country':l2,'Population':l3})

In [None]:
#Total 235 contries are present in our scraped data.
population_df['Country'].shape

We have to choose those countries from our scraped data which are present in our dataset.(line-60 to line-66 )

In [None]:
# For every country in df2a, collect the matching row(s) of the scraped table as an array
# (an empty array when the scraped data has no country of that name).
# Iterating over the column itself replaces the hard-coded count of 94, so the list
# always has exactly one entry per row of df2a.
l=[population_df[population_df['Country']==country_name].values
   for country_name in df2a['country']]

In [None]:
# Replace every empty match (no scraped row for that country) with a 1x2 placeholder
# holding NaN in the country slot and 0 in the population slot.
for i in range(len(l)):
    if l[i].shape[0]==0:
        # Float placeholder values; later cells look for the population string '0.0'.
        to_fill=np.array([[np.nan],[0]])
        arr=l[i]
        # np.append flattens both arrays, so the result is reshaped back to one row of two values.
        l[i]=np.append(arr,to_fill).reshape(1,2)

In [None]:
# Population (second value of the first matched row) of every country, as a string.
sl=[str(match[0][1]) for match in l]

In [None]:
df2a=df2a.assign(population=sl)

In [None]:
df2a

In [None]:
#There are some countries whose population is still the placeholder '0.0'. This list contains those countries.
# A boolean mask on the named columns replaces the row loop with positional lookups
# (df2a.loc[i][2]), which pandas deprecates for label-indexed rows.
missing_countries_pop=df2a.loc[df2a['population']=='0.0','country'].tolist()

In [None]:
missing_countries_pop

In [None]:
#Adding population values of each contries with population=0
miss_con_pop=['10,600,000','97,857','1,207,359','63,155','44,543']

In [None]:
d=dict(zip(df2a['country'],df2a['population']))

In [None]:
# Countries that still carry the '0.0' placeholder, in the dictionary's iteration order.
unmatched=[country for country,pop in d.items() if pop=='0.0']
# miss_con_pop is paired with these countries purely by position, so a different count
# means the hard-coded values no longer line up with the countries they were written for.
if len(unmatched)!=len(miss_con_pop):
    raise ValueError(
        f"{len(unmatched)} countries lack a population but miss_con_pop holds "
        f"{len(miss_con_pop)} values: {unmatched}"
    )
# NOTE(review): the pairing also depends on the order of df2a; confirm the values in
# miss_con_pop still belong to these countries whenever the input data changes.
for country,pop in zip(unmatched,miss_con_pop):
    d[country]=pop

In [None]:
df2a['population']=df2a['country'].map(d)

In [None]:
df2a['daily_vaccinations']=df2a[['daily_vaccinations']].astype(int)

In [None]:
# Turn each population string (e.g. '1,234,567') into an integer by stripping the
# thousands separators and going through float so values such as '0.0' also parse.
la=[int(float(pop_text.replace(',',''))) for pop_text in df2a['population']]

In [None]:
df2b=df2a.drop('population',axis=1)

In [None]:
df2b=df2b.assign(population=la)

In [None]:
df2b

In [None]:
#Function to calculate the vaccination by population ratio.
def per_vacc(data_frame1):
    """Return vaccinations as a percentage of population for every row.

    Parameters
    ----------
    data_frame1 : pandas.DataFrame
        Frame whose second column (position 1) holds the vaccination total and
        whose third column (position 2) holds the population.

    Returns
    -------
    numpy.ndarray
        One value per row, ``vaccinations * 100 / population`` rounded to three
        decimals. Rows with a population of 0 yield NaN instead of raising.
    """
    # Whole columns are selected by position, which avoids the deprecated
    # integer lookup on a label-indexed row (data_frame1.iloc[i][1]).
    vaccinated=data_frame1.iloc[:,1].astype(float)
    population=data_frame1.iloc[:,2].astype(int)
    # Mask zero populations so the division produces NaN rather than an error or inf.
    population=population.where(population!=0)
    ratio=vaccinated*100/population
    return np.round(ratio.to_numpy(dtype=float),decimals=3)

In [None]:
#Provided df2b as an argument in our function.
per=per_vacc(df2b)

In [None]:
#Assigning new column 'percentage' to our df2b dataframe.
df2b=df2b.assign(percentage=per)

In [None]:

# df2b is still ordered by total vaccinations, so head(10) would show the ten largest
# totals. nlargest picks the ten highest percentages, which is what the title promises.
fig = px.bar(df2b.nlargest(10,'percentage'), y='percentage',color='percentage',x='country',hover_data=['population','daily_vaccinations'],
             title='Top 10 countries with vaccinations by population ratio',
            labels={'daily_vaccinations':'Total vaccinations'})
fig.show()

Israel has the highest percentage in terms of number of vaccinations provided given the population of Israel.

In [None]:
# Build the hover text from string views of the columns instead of converting df2b
# itself to strings: 'percentage' stays numeric for the colour scale and earlier cells
# that need numeric columns still work when the notebook is re-run.
df2b['text'] =df2b['country'].astype(str) + '<br>' + \
            ' Total vaccinations '  + df2b['daily_vaccinations'].astype(str) + '<br>' + \
            ' Population ' + df2b['population'].astype(str)
fig = go.Figure(data=go.Choropleth(
    locationmode = "country names",
    locations = df2b['country'],
    z =df2b['percentage'],
    text=df2b['text'],
    # NOTE(review): with autocolorscale=True plotly uses its default palette, so
    # colorscale='Reds' presumably has no effect; confirm which palette is intended.
    colorscale='Reds',
    autocolorscale=True,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title = 'percentage',
))

# update_layout returns the figure, so as the last expression it also renders the map.
fig.update_layout(
    title_text='Vaccinations by Population ratio distribution of each country.',
    geo=dict(
        showcoastlines=True
    )
)