In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import randint
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Read the country-level vaccination CSV from the Kaggle input directory
# and show the resulting frame as the cell output.
train = pd.read_csv(
    filepath_or_buffer='../input/covid-world-vaccination-progress/country_vaccinations.csv'
)
train

In [None]:
# Print a summary of the freshly loaded frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Convert the `date` column to pandas datetimes so it can be used on a time axis.
train["date"] = pd.to_datetime(train["date"])

 # **Examining features** 

In [None]:
# Descriptive statistics of the frame's columns (pandas' default `describe` output).
train.describe()

In [None]:
# Print the frame summary again, now that `date` has been converted with pd.to_datetime.
train.info()

# **Addressing questions related to vaccines used in each country**

- Which country used which vaccines?
- How many vaccines have been developed?
- What is the most commonly used vaccine?

In [None]:
# Distinct values of the `vaccines` column, in order of first appearance.
vaccines = pd.unique(train['vaccines'])
vaccines

Each entry can list several vaccines separated by commas, so we split the strings to identify the individual vaccines:

In [None]:
# Flatten every ', '-separated vaccine combination into one list of vaccine names
# (duplicates are kept here).
vaccineList = [
    name
    for combination in vaccines
    for name in combination.split(', ')
]

In [None]:
# Remove duplicate vaccine names while keeping the order of first appearance.
# dict.fromkeys is used instead of pd.unique because passing a plain Python list
# to pd.unique is deprecated in recent pandas releases; the resulting list is the same.
vaccineList = list(dict.fromkeys(vaccineList))
vaccineList

In [None]:
# Report how many distinct vaccines appear in the data.
vaccine_total = len(vaccineList)
print('The total number of vaccines used so far is', vaccine_total)

Group by each country to identify the vaccines used in each country:

In [None]:
# One `vaccines` string per country (the per-group maximum, i.e. the
# lexicographically largest string recorded for that country).
# The column is selected *before* aggregating so that only `vaccines` is reduced;
# taking the max of every column first does unnecessary work and depends on all
# other columns being orderable.
vaccines = train.groupby("country")["vaccines"].max()
vaccines

In [None]:
# Same data as `vaccines`, wrapped in a single-column DataFrame for display.
vaccines_country = vaccines.to_frame(name='vaccines')
vaccines_country

The corresponding vaccines for each country:

In [None]:
# Turn each country's ', '-separated vaccine string into a list of vaccine names.
count_vaccines = vaccines.str.split(pat=', ')
count_vaccines

Count the number of vaccines used in each country: 

In [None]:
# DataFrame with `country` as a regular column next to its list of vaccines.
country_vaccine = count_vaccines.to_frame().reset_index()
country_vaccine

Number of vaccines each country uses: 

In [None]:
# Add the length of each country's vaccine list, then order the frame so the
# countries using the most vaccines come first.
country_vaccine['no_of_vaccines'] = country_vaccine['vaccines'].apply(len)
country_vaccine.sort_values(by='no_of_vaccines', ascending=False, inplace=True)
country_vaccine

Number of countries for each count of vaccines used:

In [None]:
# For every distinct number of vaccines, count how many countries use that many.
x = country_vaccine.groupby('no_of_vaccines')[['country']].count()
x

In [None]:
 # First 20 rows of the frame (sorted by no_of_vaccines, descending, in an earlier cell).
 country_vaccine.iloc[:20]

Number of vaccines used in Egypt: 

In [None]:
# Keep only the row(s) whose country is Egypt.
is_egypt = country_vaccine['country'].eq('Egypt')
country_vaccine.loc[is_egypt]

Another representation of the vaccines used in each country:

In [None]:
# Empty indicator table: one column per vaccine plus a `country` column holding
# every distinct country, with a fresh 0..n-1 index.
unique_countries = train['country'].unique()
df = pd.DataFrame(columns=vaccineList)
df['country'] = unique_countries
df.reset_index(inplace=True, drop=True)
df

In [None]:
# Distinct country names from the indicator table, in row order.
countries = pd.unique(df['country'])

In [None]:
# Mark every vaccine a country uses with a 1 in that country's row of `df`.
#
# The vaccine list is looked up by country *label*: `count_vaccines` is indexed by
# country (it derives from a groupby on "country"), so a running positional counter
# would only line up with `countries` if both orderings happened to match, and
# integer-position access on a label-indexed Series is deprecated in recent pandas.
#
# `.loc` is used for the assignment because the row selector is a boolean mask;
# `.at` is meant for a single scalar row/column label pair.
for country_name in countries:
    for used_vaccine in count_vaccines.loc[country_name]:
        df.loc[df['country'] == country_name, used_vaccine] = 1

In [None]:
# Use the country name as the row index of the indicator table (modifies df in place).
df.set_index(keys='country', inplace=True)
df

In [None]:
# Non-null entries per vaccine column, i.e. how many countries were flagged for
# each vaccine, ordered from most to least used.
usage_per_vaccine = df.count()
x = usage_per_vaccine.sort_values(ascending=False)

Identifying the number of countries that use each vaccine:

In [None]:
# Bar chart: one bar per vaccine, height = number of countries using it
# (`x` holds those counts, indexed by vaccine name).
vaccine_names = x.index
country_counts = x.values

fig, ax = plt.subplots(figsize=(20, 10))
ax = sns.barplot(x=vaccine_names, y=country_counts, color='lightgreen')
ax.set_title("How many countries use a certain vaccine", fontsize=20)
ax.set_xlabel("Vaccine", fontsize=15)
ax.set_ylabel("n", fontsize=15)

plt.show()

We can deduce that Oxford/AstraZeneca is the most commonly used vaccine, i.e. the one used in the largest number of countries.

# **In which country is the vaccination more advanced?**

In [None]:
# The 20 countries with the highest maximum `total_vaccinations` value.
max_total_by_country = train.groupby("country")["total_vaccinations"].max()
max_total_by_country.sort_values(ascending=False).head(20)

In [None]:
# Horizontal bar plot of the 20 countries with the highest mean `total_vaccinations`.
mean_total_by_country = train.groupby("country")["total_vaccinations"].mean()
x = mean_total_by_country.sort_values(ascending=False).head(20)

sns.set_style("whitegrid")
plt.figure(figsize=(8, 8))
ax = sns.barplot(x=x.values, y=x.index)
ax.set_xlabel("Total vaccinations(mean)")
plt.show()

In [None]:
# The 20 countries with the highest average `daily_vaccinations` ...
mean_daily_by_country = train.groupby("country")["daily_vaccinations"].mean()
x = mean_daily_by_country.sort_values(ascending=False).head(20)

# ... drawn as a horizontal bar plot (value on the x axis, country on the y axis).
plt.figure(figsize=(8, 8))
ax = sns.barplot(x=x.values, y=x.index)
ax.set_xlabel("daily vaccinations(avg)")
plt.show()

We can deduce that China, the United States, India, the United Kingdom, and Brazil are making great efforts in vaccination.

Timeline showing the change in daily vaccination for each country: 

In [None]:
# Interactive line chart of daily vaccinations over time, one line per country.
fig = px.line(train, x='date', y='daily_vaccinations', color='country')

# Title placement is given in figure-relative coordinates (x=0.5, y=0.95).
title_settings = {
    'text': "Daily vaccination trend",
    'y': 0.95,
    'x': 0.5,
}
fig.update_layout(
    title=title_settings,
    xaxis_title="Date",
    yaxis_title="Daily Vaccinations",
)

fig.show()

Maximum recorded number of people vaccinated and fully vaccinated in the 5 countries with the most people vaccinated:

In [None]:
# Per-country maximum of `people_vaccinated` and `people_fully_vaccinated`,
# restricted to the five countries with the largest `people_vaccinated` value.
# The two columns are selected before aggregating so only they are reduced
# (the previous all-column max was discarded except for these two columns).
# `.copy()` gives an independent frame, so columns can be added to it later
# without touching an intermediate slice.
people_vaccinated_country = (
    train.groupby("country")[['people_vaccinated', 'people_fully_vaccinated']]
    .max()
    .sort_values(by="people_vaccinated", ascending=False)
    .head(5)
    .copy()
)
people_vaccinated_country

Percentage of people who are fully vaccinated: 

In [None]:
# Fully vaccinated people as a percentage of people vaccinated, per country.
# (A former `reset_index()` call was dropped here: its result was never assigned
# or displayed, so it had no effect.)
people_vaccinated_country['people_fully_vaccinated/%'] = (
    people_vaccinated_country['people_fully_vaccinated']
    / people_vaccinated_country['people_vaccinated']
) * 100

# Show the countries ordered from highest to lowest percentage.
people_vaccinated_country.sort_values(by="people_fully_vaccinated/%", ascending=False)

To measure the vaccination progress in a country, we will analyze **total vaccinations per hundred**, **people vaccinated per hundred**, **people fully vaccinated per hundred**, and **daily vaccinations per million**.

- **Total vaccinations per hundred**: ratio (in percent) between the number of vaccinations and the total population of the country, up to the given date.
- **People vaccinated per hundred**: ratio (in percent) between the immunized population and the total population of the country, up to the given date.
- **People fully vaccinated per hundred**: ratio (in percent) between the fully immunized population and the total population of the country, up to the given date.

In [None]:
# Per-country maximum of the three "per hundred" measures, ordered by
# total vaccinations per hundred (highest first); the first five are displayed.
# Only these three columns are aggregated: they are the only ones read from
# `vaccinated_per_hundred` afterwards, and reducing every other column first was
# wasted work.
per_hundred_columns = [
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
]
vaccinated_per_hundred = (
    train.groupby("country")[per_hundred_columns]
    .max()
    .sort_values(by="total_vaccinations_per_hundred", ascending=False)
)
vaccinated_per_hundred.head()

In [None]:
# Bar chart of the three "per hundred" measures for the first 20 countries of
# `vaccinated_per_hundred` (which is sorted by total vaccinations per hundred).
top_20 = vaccinated_per_hundred.head(20)
labels = top_20.index
total_vaccinations_per_hundred = top_20['total_vaccinations_per_hundred']
people_vaccinated_per_hundred = top_20['people_vaccinated_per_hundred']
people_fully_vaccinated_per_hundred = top_20['people_fully_vaccinated_per_hundred']

# Tick positions, one per country.
x = np.arange(len(labels))

width = 0.40
fig, ax = plt.subplots(figsize=(20, 12))

# All three series are drawn at the same x positions with the same width, so the
# bars are overlaid on top of each other rather than placed side by side.
ax.bar(labels, total_vaccinations_per_hundred, width, label='Total vaccinations per hundred')
ax.bar(labels, people_vaccinated_per_hundred, width, label='People vaccinated per hundred')
ax.bar(labels, people_fully_vaccinated_per_hundred, width, label='People fully vaccinated per hundred')

# Axis label typo ("hundread") and the unclear "- 20th" title suffix are corrected here.
ax.set_ylabel('n / hundred', fontsize=18)
ax.set_xlabel('Country', fontsize=18)
ax.set_title("Vaccinations per hundred per country - top 20", fontsize=22)
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=14, rotation=70)

ax.legend()

plt.show()

Measuring the gap between the number of people vaccinated and the people fully vaccinated.

In [None]:
# Top five countries by total vaccinations per hundred, with the share of
# vaccinated people who are fully vaccinated.
# Only the three "per hundred" columns are aggregated (instead of taking the max
# of every column and discarding the rest), and the frame is built in one chain
# ending in `.copy()` so the new column is added to an independent frame.
per_hundred_columns = [
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
]
vaccinated_per_hundred = (
    train.groupby("country")[per_hundred_columns]
    .max()
    .sort_values(by="total_vaccinations_per_hundred", ascending=False)
    .head()
    .copy()
)

# Fully vaccinated per hundred as a percentage of people vaccinated per hundred.
vaccinated_per_hundred['people_fully_vaccinated_from_total_vaccinated/%'] = (
    vaccinated_per_hundred['people_fully_vaccinated_per_hundred']
    / vaccinated_per_hundred['people_vaccinated_per_hundred']
) * 100

vaccinated_per_hundred