In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Inspiration**

Track COVID-19 vaccination in the World, answer instantly to your questions :

    Which country is using what vaccine?

    In which country the vaccination programme is more advanced?

    Where are vaccinated more people per day?

    Where are vaccinated more people per country?

**Content**

The data (country vaccinations) contains the following information:

    1. Country- this is the country for which the vaccination information is provided;
    2. Country ISO Code - ISO code for the country;
    Date - date for the data entry; for some of the dates we have only the daily vaccinations, for others, only the (cumulative) total;
    3. Total number of vaccinations - this is the absolute number of total immunizations in the country;
    4. Total number of people vaccinated - a person, depending on the immunization scheme, will receive one or more (typically 2) vaccines; at a certain moment, the number of vaccination might be larger than the number of people;
    5. Total number of people fully vaccinated - this is the number of people that received the entire set of immunization according to the immunization scheme (typically 2); at a certain moment in time, there might be a certain number of people that received one vaccine and another number (smaller) of people that received all vaccines in the scheme;
    6. Daily vaccinations (raw) - for a certain data entry, the number of vaccination for that date/country;
    7. Daily vaccinations - for a certain data entry, the number of vaccination for that date/country;
    8. Total vaccinations per hundred - ratio (in percent) between vaccination number and total population up to the date in the country;
    9. Total number of people vaccinated per hundred - ratio (in percent) between population immunized and total population up to the date in the country;
    10. Total number of people fully vaccinated per hundred - ratio (in percent) between population fully immunized and total population up to the date in the country;
    11. Number of vaccinations per day - number of daily vaccination for that day and country;
    Daily vaccinations per million - ratio (in ppm) between vaccination number and total population for the current date in the country;
    12. Vaccines used in the country - total number of vaccines used in the country (up to date);
    13.Source name - source of the information (national authority, international organization, local organization etc.);
    14. Source website - website of the source of information.
    
    
There is a second file added recently (country vaccinations by manufacturer), with the following columns:
    
    Location - country;

    Date - date;

    Vaccine - vaccine type;

    Total number of vaccinations - total number of vaccinations / current time and vaccine type.

**THE STEPS TO BE FOLLOWED ARE ----**

1. Data Sourcing and Understanding
2. Data Cleaning
3. Exploratory Data Analysis


Let's begin!!

**DATA SOURCING AND UNDERSTANDING**

In [None]:
#to avoid warnings
import warnings
warnings.filterwarnings('ignore')

#os
import os

#linear algebra libraries
import numpy as np, pandas as pd
import pandas_profiling as pp

#libraries for plotting graphs
import matplotlib.pyplot as plt, seaborn as sns, matplotlib
import plotly.express as px
import plotly.figure_factory as ff

#for hypothesis checking
from scipy.stats import mannwhitneyu

#for data preparation
from sklearn.preprocessing import scale, StandardScaler
from sklearn import linear_model
from sklearn import metrics
import statsmodels.api as sm

In [None]:
#loading the file
vaccine = pd.read_csv("/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv")
vaccine.head(10)

In [None]:
#getting a profile summary
pp.ProfileReport(vaccine)

In [None]:
vaccine.info()

In [None]:
vaccine.describe()

In [None]:
vaccine.columns

In [None]:
#country wrt vaccine details
cntry_vacc_df = pd.read_csv("/kaggle/input/covid-world-vaccination-progress/country_vaccinations_by_manufacturer.csv")
cntry_vacc_df.head(10)

In [None]:
#getting profile summary
pp.ProfileReport(cntry_vacc_df)

In [None]:
cntry_vacc_df.info()

In [None]:
cntry_vacc_df.describe()

**DATA CLEANING**

**Handling Datatypes**

In [None]:
#datatype analysis
vaccine.dtypes

In [None]:
# parse the 'date' strings of the country dataset into datetime values
vaccine['date'] = pd.to_datetime(vaccine['date'])

In [None]:
#checking
vaccine.dtypes

In [None]:
# parse the 'date' strings of the manufacturer dataset into datetime values
cntry_vacc_df['date'] = pd.to_datetime(cntry_vacc_df['date'])

In [None]:
#checking
cntry_vacc_df.dtypes

*Datatypes went well now!!!*

**Handling Duplicates and Null Values**

From the definition of the following columns -

    Total number of vaccinations - this is the absolute number of total immunizations in the country;

    Total number of people vaccinated - a person, depending on the immunization scheme, will receive one or more (typically 2) vaccines; at a certain moment, the number of vaccination might be larger than the number of people;

    Total number of people fully vaccinated - this is the number of people that received the entire set of immunization according to the immunization scheme (typically 2); at a certain moment in time, there might be a certain number of people that received one vaccine and another number (smaller) of people that received all vaccines in the scheme;

    Daily vaccinations (raw) - for a certain data entry, the number of vaccination for that date/country;

    Daily vaccinations - for a certain data entry, the number of vaccination for that date/country;

    Total vaccinations per hundred - ratio (in percent) between vaccination number and total population up to the date in the country;

    Total number of people vaccinated per hundred - ratio (in percent) between population immunized and total population up to the date in the country;

    Total number of people fully vaccinated per hundred - ratio (in percent) between population fully immunized and total population up to the date in the country;

    Number of vaccinations per day - number of daily vaccination for that day and country


Let's check each column more closely!!

DATASET -- COUNTRY VACCINATION MANUFACTURERS DETAILS

In [None]:
#columns of the manufacturer dataset that contain at least one missing value
true_null_1 = cntry_vacc_df.isnull().any().loc[lambda has_null: has_null]
true_null_1

DATASET -- COUNTRY VACCINATION DETAILS

In [None]:
#columns of the country dataset that contain at least one missing value
true_null = vaccine.isnull().any().loc[lambda has_null: has_null]
true_null

In [None]:
#checking for null values
vaccine.isnull().sum(axis=0)

In [None]:
#share of missing values per column, in percent (2 decimals)
nulldetails = vaccine.isnull().sum().div(len(vaccine.index)).mul(100).round(2)

#columns where at least half of the values are missing
nulldetails[nulldetails >= 50]

In [None]:
#share of missing values per column, in percent (2 decimals)
nulldetails1 = vaccine.isnull().sum().div(len(vaccine.index)).mul(100).round(2)

#columns where at most half of the values are missing
nulldetails1[nulldetails1 <= 50]

We have quite large cardinality and missing data. Let's go with the first important column - total_vaccinations AND people_vaccinated

In [None]:
# COLUMNS --- total_vaccinations, people_vaccinated
# keep only the rows in which both cumulative counts are present
vaccine = vaccine.dropna(subset=['total_vaccinations', 'people_vaccinated'])

# missing values still left in each column
vaccine.isna().sum()

We have other similar variables like people_vaccinated_per_hundred AND total_vaccinations_per_hundred, etc. Let's see how they are related with the above two features.

In [None]:
#checking correlation
#only numeric columns are correlated: the frame also holds text columns
#(country, iso_code, vaccines, ...) and DataFrame.corr() raises on those in pandas >= 2.0
corr = vaccine.select_dtypes(include='number').corr()

#mask the upper triangle (diagonal included) so every pair is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool))

#heatmap
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(9, 6))
    ax = sns.heatmap(corr, annot=True, mask=mask, vmax=.3, square=True)

We can say that - 

    1. people_vaccinated and people_vaccinated_per_hundred greatly correlates with total_vaccinations and total_vaccinations_per_hundred.

    2. Likewise, daily_vaccinations and daily_vaccinations_per_million greatly correlates with people_vaccinated and people_vaccinated_per_hundred.

    3. people_fully_vaccinated and people_fully_vaccinated_per_hundred greatly correlates with total_vaccinations and total_vaccinations_per_hundred.

    4. daily_vaccinations_raw greatly correlates with daily_vaccinations.

**HOW TO DEAL WITH IT?**

We can run a hypothesis test on each pair of correlated columns, look at the resulting p-value, and then decide whether to keep the column or delete it. Let's see which hypothesis tests can be applied to the column distributions ---

    1. Here we can try the Mann-Whitney U test.
    2. Two-sample Z test, can be used for checking two independent data groups and deciding whether sample mean of two group is equal or not.

Reference - https://machinelearningmastery.com/statistical-hypothesis-tests-in-python-cheat-sheet/

Let's go with the first one, as it needs more concise code.


**Mann-Whitney U Test**

Tests whether the distributions of two independent samples are equal or not.

Assumptions----
    
    Observations in each sample are independent and identically distributed (iid).

    Observations in each sample can be ranked.

Interpretation----

    H0: the distributions of both samples are equal.

    H1: the distributions of both samples are not equal.

Conclusion----

    if p_value < 0.05:

        print('Reject Null Hypothesis (Significant difference between two samples)')
    else:

        print('Do not Reject Null Hypothesis (No significant difference between two samples)')

In [None]:
#function for hypothesis check

def hypothesis_check(data1, data2) :
    """Run a two-sided Mann-Whitney U test on two samples and print the verdict.

    Parameters
    ----------
    data1, data2 : array-like (e.g. pandas Series)
        The two samples to compare. Missing values (NaN) are removed from
        each sample independently before the test is run.

    Returns
    -------
    None
        The statistic, the p-value and the conclusion are printed.
    """
    # NaNs cannot be ranked; depending on the SciPy version they either turn the
    # p-value into NaN or distort the ranks, so drop them up front
    sample1 = pd.Series(data1).dropna()
    sample2 = pd.Series(data2).dropna()

    # mannwhitneyu raises on an empty sample; report it instead of crashing the cell
    if sample1.empty or sample2.empty:
        print('Cannot run the test: at least one sample has no non-missing values')
        return

    # perform mann whitney test
    # 'two-sided' matches H1 ("the distributions are not equal") and does not
    # depend on the default of the installed SciPy version
    stat, p_value = mannwhitneyu(sample1, sample2, alternative='two-sided')
    print('Statistics = %.2f, \np-value = %.13f' % (stat, p_value))
    
    # Level of significance
    alpha = 0.05
    
    # conclusion
    if p_value < alpha:
        print('Reject Null Hypothesis (Significant difference between two samples)')
    else:
        print('Do not Reject Null Hypothesis (No significant difference between two samples)')

In [None]:
#function to use fillna in the column if we "Reject Null Hypothesis"

def fill_na(column):
    """Replace missing values with 0 in the given column(s) of the global `vaccine` frame.

    Parameters
    ----------
    column : str or iterable of str
        A single column name or a collection of column names.

    The frame is modified in place; nothing is returned.
    """
    # accept a single name as well as a list of names
    columns = [column] if isinstance(column, str) else list(column)
    if not columns:
        return

    # one vectorised fill for all requested columns
    # (the previous loop refilled the whole list once per element)
    vaccine[columns] = vaccine[columns].fillna(0)

In [None]:
#checking total_vaccinations AND people_vaccinated
hypothesis_check(vaccine['total_vaccinations'], vaccine['people_vaccinated'])

In [None]:
#checking total_vaccinations_per_hundred AND people_vaccinated_per_hundred
hypothesis_check(vaccine['total_vaccinations_per_hundred'], vaccine['people_vaccinated_per_hundred'])

In [None]:
#checking people_vaccinated AND daily_vaccinations
hypothesis_check(vaccine['people_vaccinated'], vaccine['daily_vaccinations'])

In [None]:
#checking people_vaccinated_per_hundred AND daily_vaccinations_per_million
hypothesis_check(vaccine['people_vaccinated_per_hundred'], vaccine['daily_vaccinations_per_million'])

In [None]:
#checking people_fully_vaccinated AND total_vaccinations
hypothesis_check(vaccine['people_fully_vaccinated'], vaccine['total_vaccinations'])

In [None]:
#checking people_fully_vaccinated_per_hundred AND total_vaccinations_per_hundred
hypothesis_check(vaccine['people_fully_vaccinated_per_hundred'], vaccine['total_vaccinations_per_hundred'])

In [None]:
#checking daily_vaccinations_raw AND daily_vaccinations
hypothesis_check(vaccine['daily_vaccinations_raw'], vaccine['daily_vaccinations'])

Hence, we can conclude that the p-values are much less than 0.05, which means we reject the null hypothesis that the paired columns share the same distribution. So, let's just fill the missing values with zeros.

In [None]:
vaccine.columns

In [None]:
#numeric columns whose missing values are replaced with 0
#(rows missing total_vaccinations or people_vaccinated were already dropped earlier,
# so for those two columns the fill is a no-op)
cols = ['total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred', 'daily_vaccinations_per_million']

#filling NA values (modifies the global `vaccine` frame in place)
fill_na(cols)

#remaining NA count per column; the columns listed above should now show 0
vaccine.isna().sum()

Everything went well so far!!! Moving Forward now!!

**Handling any replacement in column values**

iso_code : ISO code for the country.

In [None]:
#iso code for country
vaccine[vaccine.iso_code.isna()].country.unique()

In [None]:
#fill the missing iso_code of the UK constituent countries with their subdivision codes
uk_iso_codes = {
    'England': 'GB-ENG',
    'Northern Ireland': 'GB-NIR',
    'Scotland': 'GB-SCT',
    'Wales': 'GB-WLS',
}

#only iso_code is touched: a row-wide fillna would also write the code into every
#other empty column of those rows. The previous Northern Ireland line used `==`
#(a comparison) instead of `=`, so it never assigned anything.
vaccine['iso_code'] = vaccine['iso_code'].fillna(vaccine['country'].map(uk_iso_codes))

#anything still missing (in any column) is marked as 'NC'
vaccine = vaccine.fillna('NC')

In [None]:
vaccine['iso_code'].unique()

In [None]:
vaccine.shape

In [None]:
vaccine.info()

Now we have cleaned dataset to proceed further!!!!

Let's do some data preparation!!!!

**DATA PREPARATION**

In [None]:
# per-country summary: the maximum of every metric over all dates
# (for the cumulative columns this is the latest reported total).
# The value columns are selected with a list; selecting groupby columns with a
# bare tuple of names was deprecated and no longer works in pandas 2.x.
cntry_vaccine = vaccine.groupby(["country", "iso_code", "vaccines"])[['total_vaccinations',
                                                                        'total_vaccinations_per_hundred',
                                                                        'daily_vaccinations',
                                                                        'daily_vaccinations_per_million',
                                                                        'people_vaccinated',
                                                                        'people_vaccinated_per_hundred',
                                                                        'people_fully_vaccinated',
                                                                        'people_fully_vaccinated_per_hundred'
                                                                        ]].max().reset_index()

# display names, in the same order as the group keys followed by the value columns above
cntry_vaccine.columns = ["Country", "iso_code", "Vaccines", "Total vaccinations", "Percent", "Daily vaccinations", 
                           "Daily vaccinations per million", "People vaccinated", "People vaccinated per hundred",
                           'People fully vaccinated', 'People fully vaccinated percent']

In [None]:
cntry_vaccine.head(10)

In [None]:
cntry_vaccine.info()

In [None]:
# every distinct vaccine scheme, in order of first appearance
vaccines_grp = cntry_vaccine["Vaccines"].unique()

# print, for each scheme, the countries that use it
for v in vaccines_grp:
    uses_scheme = cntry_vaccine["Vaccines"].eq(v)
    countries_grp = cntry_vaccine.loc[uses_scheme, "Country"].values
    print(f"Vaccines: {v}: \nCountries: {list(countries_grp)}\n")

In [None]:
# per-date view of the metrics, with display-friendly column names
cntry_vaccine_time = (
    vaccine[[
        "country", "vaccines", "date",
        'total_vaccinations', 'total_vaccinations_per_hundred',
        'people_vaccinated', 'people_vaccinated_per_hundred',
        'daily_vaccinations', 'daily_vaccinations_per_million',
        'people_fully_vaccinated', 'people_fully_vaccinated_per_hundred',
    ]]
    .dropna()
    .rename(columns={
        "country": "Country",
        "vaccines": "Vaccines",
        "date": "Date",
        'total_vaccinations': 'Total vaccinations',
        'total_vaccinations_per_hundred': 'Percent',
        'people_vaccinated': 'People vaccinated',
        'people_vaccinated_per_hundred': 'People percent',
        'daily_vaccinations': "Daily vaccinations",
        'daily_vaccinations_per_million': "Daily vaccinations per million",
        'people_fully_vaccinated': 'People fully vaccinated',
        'people_fully_vaccinated_per_hundred': 'People fully vaccinated percent',
    })
)

In [None]:
# hand-picked list of country names
# NOTE(review): not referenced by any cell visible in this notebook - confirm whether it is still needed
countries = ['Austria', 'Belgium', 'Bulgaria','Croatia', 'Cyprus', 'Czechia', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany',
             'Greece', 'Hungary','India' , 'Ireland', 'Israel', 'Italy', 'Latvia','Lithuania', 'Luxembourg', 'Malta',
             'Netherlands', 'Norway','Poland', 'Portugal', 'Romania', 'Serbia', 'Slovakia', 'Spain', 'Sweden',
             'United Kingdom', 'United States', 'China']

Proceeding to the EDA part!!!

**EXPLORATORY DATA ANALYSIS**

In [None]:
cntry_vaccine.columns

**Univariate analysis**

In [None]:
def distplot_check(column):
    """Draw a 10-bin distribution plot of `column` and show it.

    Parameters
    ----------
    column : array-like (e.g. pandas Series)
        The values whose distribution is plotted.
    """
    sns.distplot(column, bins=10)
    # the title goes on the axes the distribution was just drawn on
    plt.title('Checking Outliers with distplot()')
    plt.show()

In [None]:
def treat_outlier(column):
    """Overwrite a column of the global `cntry_vaccine` frame with log(1 + x).

    Parameters
    ----------
    column : str
        Name of the column to transform.

    The frame is modified in place, so running this twice on the same column
    applies the transform twice, and every later cell sees the log-scaled values.
    """
    raw_values = cntry_vaccine[column]
    cntry_vaccine[column] = np.log1p(raw_values)

In [None]:
distplot_check(cntry_vaccine["Total vaccinations"])
treat_outlier("Total vaccinations")

In [None]:
distplot_check(cntry_vaccine["Daily vaccinations"])
treat_outlier("Daily vaccinations")

In [None]:
distplot_check(cntry_vaccine["Daily vaccinations per million"])
treat_outlier("Daily vaccinations per million")

In [None]:
distplot_check(cntry_vaccine["People vaccinated"])
treat_outlier("People vaccinated")

In [None]:
distplot_check(cntry_vaccine["People vaccinated per hundred"])
treat_outlier("People vaccinated per hundred")

In [None]:
distplot_check(cntry_vaccine["People fully vaccinated"])
treat_outlier("People fully vaccinated")

In [None]:
distplot_check(cntry_vaccine["People fully vaccinated percent"])
treat_outlier("People fully vaccinated percent")

Outliers are treated well compared to previous analysis.

**Bivariate Analysis**

In [None]:
cntry_vaccine.columns

**How vaccines are related to other parameters?**

In [None]:
def boxplot_check(title_name, col1, col2):
    """Show a box plot of `col2` grouped by `col1` from the global `cntry_vaccine` frame.

    Parameters
    ----------
    title_name : str
        Title of the figure.
    col1 : str
        Column used for the categories on the x axis.
    col2 : str
        Column whose distribution is shown on the y axis.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    # x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
    sns.boxplot(x=col1, y=col2, data=cntry_vaccine, ax=ax)
    ax.set_title(title_name)
    # rotate after drawing so the setting applies to the final category labels
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()

In [None]:
boxplot_check("Vaccines Vs Percent", "Vaccines", "Percent")

In [None]:
boxplot_check("Vaccines Vs Daily vaccinations per million", "Vaccines", "Daily vaccinations per million")

In [None]:
boxplot_check("Vaccines Vs People vaccinated per hundred", "Vaccines", "People vaccinated per hundred")

In [None]:
boxplot_check("Vaccines Vs People fully vaccinated percent", "Vaccines", "People fully vaccinated percent")

*Insights -

The top vaccines by fully vaccinated people, daily vaccinations and vaccination percentage are Oxford/AstraZeneca, Pfizer/BioNTech and Sinovac.*

**Multivariate Analysis**

In [None]:
cntry_vaccine.columns

**Which vaccination scheme is used most in a country-wise?**

In [None]:
def treemap_plot(value, tree_title):
    """Show a treemap of the global `cntry_vaccine` frame, nested as vaccine scheme -> country.

    Parameters
    ----------
    value : str
        Column that determines the size of each tile.
    tree_title : str
        Title of the figure.

    NOTE(review): columns already passed through treat_outlier() hold log1p values,
    so tile sizes then reflect the log scale rather than raw counts - confirm this is intended.
    """
    treemap_fig = px.treemap(
        cntry_vaccine,
        path=['Vaccines', 'Country'],
        values=value,
        title=tree_title,
    )
    treemap_fig.show()

In [None]:
val1 = 'Total vaccinations'
tit1 = "Total vaccinations per country, grouped by vaccine scheme"

treemap_plot(val1, tit1)

*Insights -

Under Oxford/AstraZeneca, the largest total vaccination counts are in Bangladesh, Nigeria, Myanmar, Sri Lanka, Ghana, etc.*

**Where are vaccinated more people per day?**

In [None]:
val2 = 'Daily vaccinations'
tit2 = "Daily vaccinations per country, grouped by vaccine scheme"

treemap_plot(val2, tit2)

*Insights -

Daily vaccinations are proceeding rapidly in countries like Bangladesh, Ghana, Bhutan, etc., with a large number of Oxford/AstraZeneca vaccinations.*

**Where are vaccinated more people per country?**

In [None]:
val3 = 'People vaccinated'
tit3 = "People vaccinated per country, grouped by vaccine scheme"

treemap_plot(val3, tit3)

*Insights -

Oxford/AstraZeneca is the vaccine used by many countries like Bangladesh, Nigeria, Sri Lanka, Bhutan, etc. where majority of the people are vaccinated.*

**Heatmap**

In [None]:
cntry_vaccine.shape

In [None]:
#checking correlation
#only numeric columns are correlated: Country, iso_code and Vaccines are text
#and DataFrame.corr() raises on those in pandas >= 2.0
corr1 = cntry_vaccine.select_dtypes(include='number').corr()

#mask the upper triangle (diagonal included) so every pair is shown only once
mask = np.triu(np.ones_like(corr1, dtype=bool))

#heatmap
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(9, 6))
    ax = sns.heatmap(corr1, annot=True, mask=mask, vmax=.3, square=True)

*Highly correlated variables are -

    1. Total Vaccinations and People Vaccinated
    2. Daily Vaccinations and Total Vaccinations
    3. People Vaccinated and Daily Vaccinations*

**---------------------------------------- by Sakshi Maharana -----------------------------------------------**

**THANK YOU FOR COMING SO FAR!!!**

PS. IF YOU LIKED THE KERNEL, DO GIVE ME AN UPVOTE!!