# Track the progress of COVID-19 vaccination

### Task Details
Answer questions such as:

* What vaccines are used and in which countries?
* Which country has vaccinated the most people?
* Which country has vaccinated the largest percentage of its population?

In [None]:
# Import Dependencies
%matplotlib inline

# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old aliases, so use whichever name this installation provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

# Plotly Libraries
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#SQL
import sqlite3, csv
from sqlalchemy import create_engine

# Ignoring warning 
import warnings
warnings.filterwarnings('ignore')

print("Setup Complete")

In [None]:
# Relative locations of the two input CSV files
covid_filepath = '../input/covid-world-vaccination-progress/country_vaccinations.csv'
population_filepath = '../input/population-by-country-2020/population_by_country_2020.csv'

# Read each file into its own DataFrame (vaccination records first, then population)
covid_df = pd.read_csv(filepath_or_buffer=covid_filepath)
population_df = pd.read_csv(filepath_or_buffer=population_filepath)

print('Import complete')

# Data Description

The data contains the following information:

* Country - this is the country for which the vaccination information is provided;
* Country ISO Code - ISO code for the country;
* Date - date for the data entry; for some of the dates we have only the daily vaccinations, for others, only the (cumulative) total;
* Total number of vaccinations - this is the absolute number of total immunizations in the country;
* Total number of people vaccinated - a person, depending on the immunization scheme, will receive one or more (typically 2) vaccines; at a certain moment, the number of vaccination might be larger than the number of people;
* Total number of people fully vaccinated - this is the number of people that received the entire set of immunization according to the immunization scheme (typically 2); at a certain moment in time, there might be a certain number of people that received one vaccine and another number (smaller) of people that received all vaccines in the scheme;
* Daily vaccinations (raw) - for a certain data entry, the number of vaccination for that date/country;
* Daily vaccinations - for a certain data entry, the number of vaccination for that date/country;
* Total vaccinations per hundred - ratio (in percent) between vaccination number and total population up to the date in the country;
* Total number of people vaccinated per hundred - ratio (in percent) between population immunized and total population up to the date in the country;
* Total number of people fully vaccinated per hundred - ratio (in percent) between population fully immunized and total population up to the date in the country;
* Number of vaccinations per day - number of daily vaccination for that day and country;
* Daily vaccinations per million - ratio (in ppm) between vaccination number and total population for the current date in the country;
* Vaccines used in the country - the vaccines used in the country (up to date), given as a comma-separated list of vaccine names;
* Source name - source of the information (national authority, international organization, local organization etc.);
* Source website - website of the source of information;

In [None]:
# Visualise where values are missing across every column of the vaccination data
missingno.matrix(
    covid_df,
    figsize=(30, 10),
)

There is seemingly a large amount of missing information across the board.

In [None]:
# Count of missing values per column (isna is the pandas alias of isnull)
covid_df.isna().sum()

In [None]:
# Preview the last five rows of the vaccination data
covid_df.tail(5)

In [None]:
# Count distinct values in the 'date' column (missing values, if any, count as one value)
print("Number of unique dates is: {}".format(covid_df['date'].nunique(dropna=False)))

In [None]:
# Report the smallest and largest values found in the 'date' column
dates = covid_df['date']
print("Dates in the dataset range from {0} to {1}".format(dates.min(), dates.max()))

# Using SQL to manipulate dataframe

In [None]:
# Create a SQLite engine (no file path in the URL) and copy both DataFrames
# into it as tables via pandas.DataFrame.to_sql
engine = create_engine('sqlite://', echo=False)
population_df.to_sql(name='population_sql', con=engine)
covid_df.to_sql(name='covid_sql', con=engine)

In [None]:
# List the tables registered in the SQLite database.
# Engine.execute() was removed in SQLAlchemy 2.0, so the query is run through
# pandas (as the other cells do) and the rows are printed as plain tuples,
# which matches the format fetchall() used to print.
sql_query = '''SELECT name FROM sqlite_master WHERE type='table';'''
table_names = pd.read_sql_query(sql_query, con = engine)
print(list(table_names.itertuples(index=False, name=None)))

In [None]:
# Vaccines reported per country (one row per distinct country/vaccines pair)
sql_query = """
            SELECT DISTINCT country, vaccines
            FROM covid_sql
            ORDER BY country
            """
country_vaccines = pd.read_sql_query(sql_query, con = engine)


def _max_per_country(column, limit=None):
    """Return each country's maximum of `column` in covid_sql, largest first.

    Parameters
    ----------
    column : name of a covid_sql column; it is interpolated into the SQL text,
        so only pass trusted, hard-coded column names.
    limit : optional number of rows to keep; None returns every country.

    Returns
    -------
    DataFrame with the columns 'country' and `column`.
    """
    limit_clause = '' if limit is None else 'LIMIT {}'.format(int(limit))
    query = """
            SELECT DISTINCT country, MAX({column}) AS {column}
            FROM covid_sql
            GROUP BY country
            ORDER BY {column} DESC
            {limit_clause}
            """.format(column=column, limit_clause=limit_clause)
    return pd.read_sql_query(query, con = engine)


# Stats per country
# Total vaccinations
total = _max_per_country('total_vaccinations')
top10_total = _max_per_country('total_vaccinations', limit=10)

# People fully vaccinated
top10_people_full = _max_per_country('people_fully_vaccinated', limit=10)
people_full = _max_per_country('people_fully_vaccinated')

# People vaccinated
top10_people = _max_per_country('people_vaccinated', limit=10)

# Daily vaccinations
daily = _max_per_country('daily_vaccinations')
top10_daily = _max_per_country('daily_vaccinations', limit=10)

# Attach 'Population (2020)' from the population table to every vaccination row.
# Identifiers containing spaces are double-quoted (standard SQL); SQLite only
# tolerates single-quoted identifiers as a compatibility quirk.
# NOTE(review): the INNER JOIN drops countries whose names are spelled
# differently in the two datasets - confirm that this is acceptable.
sql_query = """
            SELECT covid_sql.*, population_sql."Population (2020)" AS population
            FROM covid_sql 
            INNER JOIN population_sql 
                ON population_sql."Country (or dependency)" = covid_sql.country
            """
vaccinations_population = pd.read_sql_query(sql_query, con = engine)

In [None]:
# Register the joined frame as a SQL table. if_exists='replace' keeps this cell
# re-runnable: the default ('fail') raises ValueError once the table exists.
vaccinations_population.to_sql('vaccinations_population', con = engine, if_exists = 'replace')

In [None]:
# Percentage of population vaccinated (15 highest countries).
# Multiplying by 1.0 forces real division even if SQLite stored the counts as integers.
sql_query = """
            SELECT DISTINCT country, (MAX(people_vaccinated) * 1.0 / MAX(population))*100 AS percentage_vaccinated
            FROM vaccinations_population
            GROUP BY country
            ORDER BY percentage_vaccinated DESC
            LIMIT 15;
            """
percent_vaccinated = pd.read_sql_query(sql_query, con = engine)

# Percentage of population fully vaccinated, highest first.
# The alias is double-quoted: in SQLite a single-quoted token in ORDER BY is a
# string literal, so the previous ORDER BY 'fully_vaccinated_(%)' sorted by a
# constant and left the rows unordered.
sql_query = """
            SELECT DISTINCT country, (MAX(people_fully_vaccinated) * 1.0 / MAX(population))*100 AS "fully_vaccinated_(%)"
            FROM vaccinations_population
            WHERE people_fully_vaccinated > 0
            GROUP BY country
            ORDER BY "fully_vaccinated_(%)" DESC
            """
percent_fully_vaccinated = pd.read_sql_query(sql_query, con = engine)

In [None]:
# Estimated number of days until 100% of the population is vaccinated:
#   (people still to fully vaccinate / peak daily vaccinations)
#   * (doses given per fully vaccinated person so far)
# Rows are ordered by the estimate itself, fewest days first. The previous
# ORDER BY 'fully_vaccinated_(%)' was a string literal naming a column that
# this query does not produce, so it had no effect.
# SQLite places NULL estimates first in ascending order.
sql_query = """
            SELECT DISTINCT country, ((MAX(population) - MAX(people_fully_vaccinated))/MAX(daily_vaccinations))*
               (MAX(total_vaccinations)/MAX(people_fully_vaccinated)) AS estimated_days_until_100_percent_vaccinations
            FROM vaccinations_population
            WHERE people_fully_vaccinated > 0
            GROUP BY country
            ORDER BY estimated_days_until_100_percent_vaccinations ASC
            """
estimated_days_until_100_percent_vaccinations = pd.read_sql_query(sql_query, con = engine)

# Vaccines Used Per Country

In [None]:
# Extract the first vaccine named in each comma-separated 'vaccines' string.
# Splitting once from the left always isolates the first name; the previous
# rsplit(',', 3)[0] kept several names whenever five or more vaccines were listed.
# The vectorised .str accessor also passes missing values through instead of raising.
country_vaccines['split'] = country_vaccines['vaccines'].str.split(',', n=1).str[0]
country_vaccines['split'].tail()

In [None]:
# Vaccines used in each country: display the country/vaccines frame,
# including the 'split' column added to it earlier in the notebook
country_vaccines

### Issue with code:
Need to split vaccine companies in 'vaccines'.

In [None]:
# Popularity of each 'vaccines' value: one bar per distinct string,
# counting the rows of country_vaccines that carry it
plt.figure(figsize=(10, 20))
sns.countplot(data=country_vaccines, y='vaccines')
plt.title('Number of Countries using Vaccine')
plt.ylabel('Vaccine Name')
plt.xlabel('Number of Countries');

# Data Visualisation

**Visualisations for:**
* Total vaccinations
* People Vaccinated
* Percentage of people vaccinated
* People fully vaccinated
* Percentage of people fully vaccinated
* Daily vaccinations
* Estimated days until 100% of population is vaccinated

## Total Vaccinations

In [None]:
# Treemap whose tiles are sized by each country's maximum total_vaccinations
fig = px.treemap(
    data_frame=total,
    path=['country'],
    values='total_vaccinations',
)
fig.update_layout(
    title='Total Vaccinations for Each Country',
    title_x=0.5,
)
fig.show()

In [None]:
# Bar chart of the ten countries with the highest total_vaccinations
plt.figure(figsize=(20, 8))
sns.barplot(data=top10_total, x="country", y="total_vaccinations")
plt.title('Top 10 Countries with Total Vaccinations')

## People Vaccinated

In [None]:
# Bubble chart: marker colour and marker size both follow people_vaccinated
bubble_trace = go.Scatter(
    x=top10_people['country'],
    y=top10_people['people_vaccinated'],
    mode='markers',
    marker=dict(
        color=top10_people['people_vaccinated'],
        # scale the raw head-count down to a usable marker size
        size=top10_people['people_vaccinated']*0.000005,
        showscale=True,
    ),
)
fig = go.Figure(data=[bubble_trace])

fig.update_layout(
    title='Top 10 Countries with People Vaccinated',
    xaxis_title="Country",
    yaxis_title="People Vaccinated",
    title_x=0.5,
)
fig.show()

In [None]:
# Funnel chart of the countries with the highest share of population vaccinated.
fig = go.Figure(go.Funnel(
    x=percent_vaccinated["percentage_vaccinated"],
    y=percent_vaccinated["country"] ))
fig.update_layout(
    # Take the country count from the frame itself so the title always matches
    # the number of rows plotted (the title used to say "Top 10" regardless).
    title='Top {} Countries with Highest Percent of Population Vaccinated (%)'.format(len(percent_vaccinated)),
    yaxis_title="Country",
    xaxis_title=" Percent of Population Vaccinated ",
    title_x=0.5,
)
fig.show()

In [None]:
# Creating function to get the name of the dataframe
def get_df_name(data):
    """Return the first global name that is bound to the very object `data`.

    The lookup compares by identity (`is`), not equality, and scans the
    module's globals in their insertion order.

    Raises
    ------
    IndexError
        If no global name refers to `data`.
    """
    for candidate, bound_object in globals().items():
        if bound_object is data:
            return candidate
    raise IndexError('list index out of range')

In [None]:
def auto_plot(target_column, data, title, figsize=(20,10)):
    """Draw a bar chart of `data[target_column]` for each entry of `data['country']`.

    Parameters
    ----------
    target_column : name of the column in `data` plotted on the y-axis.
    data : frame providing a 'country' column and `target_column`.
    title : text inserted into the 'Top 10 Countries for ...' chart title.
    figsize : size of the newly created matplotlib figure.
    """
    _, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=data['country'], y=data[target_column], ax=ax)
    ax.set_title('Top 10 Countries for {}'.format(title))
    ax.set_xlabel('Country')
    

## People Fully Vaccinated

In [None]:
# Bar chart of the ten countries with the most fully vaccinated people
auto_plot(
    target_column='people_fully_vaccinated',
    data=top10_people_full,
    title='People Fully Vaccinated',
)

In [None]:
# Treemap whose tiles are sized by each country's maximum daily_vaccinations
fig = px.treemap(
    data_frame=daily,
    path=['country'],
    values='daily_vaccinations',
)
fig.update_layout(
    title='Daily Vaccinations by Country',
    title_x=0.5,
)
fig.show()

In [None]:
# Keep the ten rows with the highest fully-vaccinated percentage
percent_fully_vaccinated_10 = (
    percent_fully_vaccinated
    .sort_values(by='fully_vaccinated_(%)', ascending=False)
    .head(10)
)

In [None]:
# Display the ten rows with the highest 'fully_vaccinated_(%)' values
percent_fully_vaccinated_10

In [None]:
# Bar chart of the ten highest fully-vaccinated percentages
auto_plot(
    target_column='fully_vaccinated_(%)',
    data=percent_fully_vaccinated_10,
    title='Percent of Population Fully Vaccinated',
)

In [None]:
# Re-order the full frame so the highest percentages come first
percent_fully_vaccinated = percent_fully_vaccinated.sort_values(
    by='fully_vaccinated_(%)',
    ascending=False,
)

In [None]:
# Treemap whose tiles are sized by each country's fully-vaccinated percentage
fig = px.treemap(
    data_frame=percent_fully_vaccinated,
    path=['country'],
    values='fully_vaccinated_(%)',
)
fig.update_layout(
    title='Percent of Population Fully Vaccinated',
    title_x=0.5,
)
fig.show()

## Daily Vaccinations

In [None]:
# Bar chart of the ten countries with the highest peak daily vaccinations
auto_plot(
    target_column='daily_vaccinations',
    data=top10_daily,
    title='Daily Vaccinations',
)

## Estimated Number of Days Until 100% Population Vaccinated

In [None]:
# Keep the ten rows with the smallest estimated number of days
estimated_days_until_100_percent_vaccinations_10 = (
    estimated_days_until_100_percent_vaccinations
    .sort_values(by='estimated_days_until_100_percent_vaccinations', ascending=True)
    .head(10)
)

In [None]:
# Bar chart of the ten smallest day estimates
auto_plot(
    target_column='estimated_days_until_100_percent_vaccinations',
    data=estimated_days_until_100_percent_vaccinations_10,
    title='Fewest Estimated Days Until 100% Population Vaccinated',
)

In [None]:
# Display the ten rows with the smallest estimated number of days
estimated_days_until_100_percent_vaccinations_10

For further visualisations on this dataset visit:

https://public.tableau.com/profile/joseph.awonusi#!/vizhome/COVID-19VaccinationProgress/Dashboard1