# Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
import matplotlib.pyplot as plt # data visualization 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Acquire

In [None]:
# Load the vaccination snapshot CSV into a DataFrame and preview its first rows
vacc_df = pd.read_csv(
    "/kaggle/input/vaccination-data/vaccination-data(26-08-2021).csv"
)
vacc_df.head()

In [None]:
# Number of observations (rows) and variables (columns), shown as a (rows, columns) tuple
vacc_df.shape

In [None]:
# Summary information about the dataset, as printed by DataFrame.info()
vacc_df.info()

In [None]:
# Count of missing values in each column
vacc_df.isnull().sum()

In [None]:
# How many rows are exact repeats of an earlier row?
vacc_df.duplicated(keep = "first").sum()

In [None]:
# Descriptive statistics of the numerical variables, one variable per row
vacc_df.describe().transpose()

# Prepare

In [None]:
# Liechtenstein is reported under WHO_REGION "OTHER"; move it to "EURO" so that it is
# grouped with the European region in the regional summaries.
# The counts are printed before the fix so the original distribution stays visible.
print(vacc_df["WHO_REGION"].value_counts())
# Select the row by label rather than by a hard-coded position (previously iloc[117, 2]):
# a positional index silently edits the wrong cell if the file is re-sorted or updated,
# whereas this mask is a no-op when there is nothing to correct.
# NOTE(review): assumes the COUNTRY value is spelled exactly "Liechtenstein" - confirm.
is_liechtenstein_other = (vacc_df["COUNTRY"] == "Liechtenstein") & (vacc_df["WHO_REGION"] == "OTHER")
vacc_df.loc[is_liechtenstein_other, "WHO_REGION"] = "EURO"

In [None]:
# Type conversions: label columns become categoricals, date columns become datetimes
for label_col in ("WHO_REGION", "DATA_SOURCE"):
    vacc_df[label_col] = vacc_df[label_col].astype("category")
for date_col in ("DATE_UPDATED", "FIRST_VACCINE_DATE"):
    vacc_df[date_col] = pd.to_datetime(vacc_df[date_col])

In [None]:
# Country population, back-calculated from the fully-vaccinated count and its per-100 rate
fully_vaccinated = vacc_df["PERSONS_FULLY_VACCINATED"]
fully_vaccinated_per100 = vacc_df["PERSONS_FULLY_VACCINATED_PER100"]
vacc_df["COUNTRY_POPULATION"] = fully_vaccinated.mul(100).div(fully_vaccinated_per100)

# Explore

### Countries using the largest number of different vaccine types

In [None]:
# Columns 0, 2, 3 plus the last four columns, ranked by number of vaccine types in use (top 10)
(
    vacc_df
    .iloc[:, [0, 2, 3, -4, -3, -2, -1]]
    .sort_values(by = "NUMBER_VACCINES_TYPES_USED", ascending = False)
    .head(10)
)

### Number of countries and vaccination averages by region

In [None]:
# Number of rows (countries) per WHO region, shown as a one-column table
vacc_df["WHO_REGION"].value_counts().to_frame()

In [None]:
# Mean of every numeric column per WHO region.
# numeric_only=True is needed on pandas >= 2.0, where averaging the text columns
# (e.g. COUNTRY) raises a TypeError instead of silently dropping them.
# observed=True lists only the regions that actually occur in the data and avoids the
# FutureWarning newer pandas emits for categorical group keys.
vacc_df.groupby("WHO_REGION", observed = True).mean(numeric_only = True)

### REPORTING vs OWID

In [None]:
# Number of rows (countries) per data source, shown as a one-column table
vacc_df["DATA_SOURCE"].value_counts().to_frame()

### Countries with the highest vaccination rates

In [None]:
# For each vaccination metric (columns 5..10 of vacc_df), list the countries with the highest values
TOP_N = 3  # how many countries to keep per metric
columns = vacc_df.columns[5:11]
top_records = []
for col in columns:
    # sort_values places NaN last by default, so missing values are only picked up
    # when a metric has fewer than TOP_N known values
    top_df = vacc_df.sort_values(col, ascending = False).head(TOP_N)
    # Iterate over the rows actually returned rather than a fixed range(3): this avoids an
    # IndexError on short frames and the deprecated positional row[0] / row[1] lookups
    for country, value in zip(top_df["COUNTRY"], top_df[col]):
        top_records.append({"FEATURE": col, "COUNTRY": country, "VALUE": value})
# Passing the column list explicitly keeps the expected layout even when there are no records
top_countries = pd.DataFrame(top_records, columns = ["FEATURE", "COUNTRY", "VALUE"]).set_index(["FEATURE", "COUNTRY"])
top_countries

### Earliest and latest known vaccination start dates

In [None]:
# Five countries with the earliest known first-vaccination date
(
    vacc_df
    .loc[vacc_df["FIRST_VACCINE_DATE"].notna(), ["COUNTRY", "FIRST_VACCINE_DATE"]]
    .sort_values(by = "FIRST_VACCINE_DATE")
    .head(5)
)

In [None]:
# Five countries with the latest known first-vaccination date
(
    vacc_df
    .loc[vacc_df["FIRST_VACCINE_DATE"].notna(), ["COUNTRY", "FIRST_VACCINE_DATE"]]
    .sort_values(by = "FIRST_VACCINE_DATE")
    .tail(5)
)

### How many countries use which vaccine?

In [None]:
# Number of countries using each vaccine
# VACCINES_USED holds a comma-separated list per row; rows without a value are skipped
vacc_used_df = vacc_df[vacc_df["VACCINES_USED"].notna()]
# One entry per (country, vaccine) pair, with surrounding whitespace removed
vaccine_names = vacc_used_df["VACCINES_USED"].str.split(",").explode().str.strip()
# Drop empty tokens left behind by stray or trailing commas so they are not counted as a vaccine
vaccine_names = vaccine_names[vaccine_names != ""]
# value_counts() already returns the counts in descending order
vaccine_counts = vaccine_names.value_counts()
# Plain-dict view of the same counts (vaccine name -> number of countries)
vacc_dict = vaccine_counts.to_dict()

vacc_used = vaccine_counts.rename_axis("VACCINE_TYPE").to_frame("NUM_OF_COUNTRIES_USING")
vacc_used.head(10)

# Visualize

### Correlation map 

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# DataFrame.corr() raises on text columns in pandas >= 2.0, so the frame is restricted to
# numeric dtypes explicitly; select_dtypes also works on older pandas versions.
numeric_corr = vacc_df.select_dtypes(include = "number").corr()
fig, ax = plt.subplots(figsize = (12,10))
# Draw on the axes created above instead of relying on the implicit current axes
sns.heatmap(numeric_corr, ax = ax)
plt.show()

### Distribution of countries by percentage of population fully vaccinated

In [None]:
# Histogram (with KDE) of fully vaccinated people per 100 inhabitants, in 10-point wide bins
tick_positions = list(range(0, 130, 10))  # 0, 10, ..., 120
fig, ax = plt.subplots(figsize = (12,9))
# binrange pins the 12 bins to [0, 120] so every bar spans exactly one tick interval;
# with bins=12 alone the bins are spread over the observed min-max range and do not line
# up with the ticks.
# NOTE(review): values outside 0-120 would fall outside the bins - confirm none exist.
g = sns.histplot(x= "PERSONS_FULLY_VACCINATED_PER100", data= vacc_df, kde=True, bins = 12, binrange = (0, 120), ax = ax)
g.set_xticks(tick_positions)
g.set_xticklabels([str(tick) for tick in tick_positions])
plt.show()


### Relationship between the number of people with at least one dose and the number fully vaccinated

In [None]:
# One regression panel per WHO region, wrapped at three panels per row
sns.lmplot(
    data = vacc_df,
    x = "PERSONS_VACCINATED_1PLUS_DOSE",
    y = "PERSONS_FULLY_VACCINATED",
    col = "WHO_REGION",
    col_wrap = 3,
)
plt.show()

### Distribution of at-least-one-dose vaccination rates (per 100 people) by WHO region

In [None]:
# Horizontal box plot: one box per WHO region for the at-least-one-dose rate per 100 people
fig, ax = plt.subplots(figsize = (12, 8))
sns.boxplot(
    data = vacc_df,
    x = "PERSONS_VACCINATED_1PLUS_DOSE_PER100",
    y = "WHO_REGION",
    palette = "pastel",
    ax = ax,
)
plt.show()

### Countries with the highest full dose vaccination rates

In [None]:
# Countries with at least 70 fully vaccinated people per 100, highest rate first
vacc_top_fully_per100 = (
    vacc_df
    .query("PERSONS_FULLY_VACCINATED_PER100 >= 70")
    .sort_values(by = "PERSONS_FULLY_VACCINATED_PER100", ascending = False)
)
fig, ax = plt.subplots(figsize = (12, 8))
sns.barplot(data = vacc_top_fully_per100, x = "PERSONS_FULLY_VACCINATED_PER100", y = "COUNTRY", ax = ax)
plt.show()

### Countries with a population of more than 10 million that have more than half of their population fully vaccinated

In [None]:
# Keep rows with COUNTRY_POPULATION of at least 10 million and at least 50 fully vaccinated per 100
is_half_vaccinated = vacc_df["PERSONS_FULLY_VACCINATED_PER100"].ge(50)
is_large_country = vacc_df["COUNTRY_POPULATION"].ge(10_000_000)
vacc_10m_50 = vacc_df[is_half_vaccinated & is_large_country]
# Highest vaccination rate at the top of the chart
vacc_10m_50_ranked = vacc_10m_50.sort_values(by = "PERSONS_FULLY_VACCINATED_PER100", ascending = False)
fig, ax = plt.subplots(figsize = (10, 6))
g = sns.barplot(data = vacc_10m_50_ranked, x = "PERSONS_FULLY_VACCINATED_PER100", y = "COUNTRY")
plt.show()

### Countries with the most total vaccinations

In [None]:
# Countries at or above the 0.96 quantile of TOTAL_VACCINATIONS, largest first
total_cutoff = vacc_df["TOTAL_VACCINATIONS"].quantile(0.96)
vacc_top_total = (
    vacc_df
    .loc[vacc_df["TOTAL_VACCINATIONS"] >= total_cutoff]
    .sort_values(by = "TOTAL_VACCINATIONS", ascending = False)
)
fig, ax = plt.subplots(figsize = (12, 8))
sns.barplot(data = vacc_top_total, x = "TOTAL_VACCINATIONS", y = "COUNTRY", ax = ax)
plt.show()

### Countries with the highest average number of vaccinations per day

In [None]:
# Average number of doses per day between each country's first vaccination and its last data update.
# Columns are selected by name (not position) and the slice is copied, so adding columns below
# neither depends on the column order nor triggers SettingWithCopyWarning.
vacc_ndays_df = vacc_df.loc[
    vacc_df["FIRST_VACCINE_DATE"].notna(),
    ["COUNTRY", "DATE_UPDATED", "TOTAL_VACCINATIONS", "FIRST_VACCINE_DATE"],
].copy()
# Whole days elapsed, read directly from the timedelta instead of parsing its "N days" text form
vacc_ndays_df["NUM_OF_DAYS_PASSED"] = (vacc_ndays_df["DATE_UPDATED"] - vacc_ndays_df["FIRST_VACCINE_DATE"]).dt.days
# A non-positive or missing day count, or a missing total, would yield inf/NaN rates that
# cannot be cast to int; such rows carry no usable daily rate and are left out
has_usable_rate = (vacc_ndays_df["NUM_OF_DAYS_PASSED"] > 0) & vacc_ndays_df["TOTAL_VACCINATIONS"].notna()
vacc_ndays_df = vacc_ndays_df[has_usable_rate].copy()
vacc_ndays_df["NUM_OF_DAYS_PASSED"] = vacc_ndays_df["NUM_OF_DAYS_PASSED"].astype("int")
vacc_ndays_df["NUM_OF_VACC_PER_DAY"] = (vacc_ndays_df["TOTAL_VACCINATIONS"] / vacc_ndays_df["NUM_OF_DAYS_PASSED"]).round().astype("int")
vacc_ndays_df = vacc_ndays_df.sort_values("NUM_OF_VACC_PER_DAY", ascending = False)
# Ten countries with the highest daily average
vacc_ndays_top = vacc_ndays_df.head(10)
fig, ax = plt.subplots(figsize = (12, 8))
sns.barplot(x = "NUM_OF_VACC_PER_DAY", y = "COUNTRY", data = vacc_ndays_top, palette = "Greens_r", ax = ax)
plt.show()

### Most used vaccines 

In [None]:
# Vaccines used by more than 20 countries, plotted as horizontal bars
used_by_many = vacc_used["NUM_OF_COUNTRIES_USING"].gt(20)
vacc_used_mt20 = vacc_used.loc[used_by_many]
fig, ax = plt.subplots(figsize = (12, 9))
sns.barplot(
    data = vacc_used_mt20,
    x = "NUM_OF_COUNTRIES_USING",
    y = vacc_used_mt20.index,
    palette = "Blues_r",
    ax = ax,
)
plt.show()

### OWID provides vaccination data for the European region

In [None]:
# Data source counts for the EURO region, followed by a per-region breakdown of each source
euro_sources = vacc_df.loc[vacc_df["WHO_REGION"] == "EURO", "DATA_SOURCE"]
print(euro_sources.value_counts())
fig, ax = plt.subplots(figsize = (9, 5))
sns.countplot(data = vacc_df, x = "DATA_SOURCE", hue = "WHO_REGION", ax = ax)
plt.show()

### Relationships between vaccination values per 100 people

In [None]:
# Pairwise scatter plots of the three per-100 vaccination measures, coloured by WHO region
vars_100 = [
    "TOTAL_VACCINATIONS_PER100",
    "PERSONS_VACCINATED_1PLUS_DOSE_PER100",
    "PERSONS_FULLY_VACCINATED_PER100",
]
sns.pairplot(data = vacc_df, vars = vars_100, hue = "WHO_REGION", height = 3.2)
plt.show()