In [None]:
# linear algebra
import numpy as np
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb
import altair as alt

# Bar color passed to the Matplotlib bar chart further down.
color = '#5e8b7e'

# Apply seaborn's 'ticks' style to the Matplotlib/seaborn figures.
sb.set_style('ticks')
# Render Matplotlib figures inline in the notebook output.
%matplotlib inline


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Locations of the two CSV files used throughout the notebook.
manufacturer_csv = "../input/covid-world-vaccination-progress/country_vaccinations_by_manufacturer.csv"
vaccinations_csv = "../input/covid-world-vaccination-progress/country_vaccinations.csv"

# df_mnf: per-manufacturer records; df: per-country records.
df_mnf = pd.read_csv(manufacturer_csv)
df = pd.read_csv(vaccinations_csv)

In [None]:
# Preview the first three rows of the country vaccinations data.
df.head(3)

## Vaccine Manufacturer Dataset

First let's explore and visualize the `vaccine manufacturer dataset`.

In [None]:
# Preview the first ten rows of the manufacturer data.
df_mnf.head(10)

Next, let's inspect its column types, the locations and vaccines it covers, and whether any values are missing:

In [None]:
# Data type of every column in the manufacturer data.
df_mnf.dtypes

In [None]:
# Distinct locations present in the manufacturer data.
df_mnf['location'].unique()

In [None]:
# Distinct vaccine names present in the manufacturer data.
df_mnf['vaccine'].unique()

In [None]:
# Number of missing values in each column.
df_mnf.isna().sum()

We can see that this dataset does not have any missing information.

In [None]:
# Number of rows recorded for each vaccine.
df_mnf['vaccine'].value_counts()

### Vaccines used, by manufacturer:

In [None]:
# Mean of the reported total_vaccinations values for each vaccine
# (a mean over all rows of that vaccine, not a count of doses).
vaccine_count = (
    df_mnf
    .groupby('vaccine')['total_vaccinations']
    .mean()
    .reset_index()
)

# Display the vaccines ordered from highest to lowest mean.
vaccine_count.sort_values(by=['total_vaccinations'], ascending=False)

Plotting the vaccines used, by manufacturer, across the different countries using Matplotlib:

In [None]:
# A negative n makes head() return every row except the last five.
# NOTE(review): if only a short preview was intended, head() or tail(5) may have been meant - confirm.
df_mnf.head(-5)

In [None]:
# Bar chart of the mean total_vaccinations per vaccine, drawn on explicit axes.
fig, ax = plt.subplots(figsize=(10, 7))
ax.bar(vaccine_count['vaccine'], vaccine_count['total_vaccinations'], color=color)
ax.set_title("Number of different vaccines used")
# Tilt the vaccine names so the long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)

plt.show()

Now let's plot the same graph using Altair:

In [None]:
# Same data as the Matplotlib chart: the bars show mean values, not actual totals.
alt.Chart(vaccine_count).mark_bar().encode(
    x=alt.X('vaccine:N', sort='-y'),  # tallest bar first
    y='total_vaccinations:Q',
    color='vaccine:N',
    tooltip=['vaccine', 'total_vaccinations'],
).properties(width=600, height=450)

First notice that `altair` is more interactive (*try hovering your mouse pointer over the chart*) and is better looking when compared to `matplotlib`. We will be using altair in our next visualizations.

> From the figure above, we can clearly see that `Pfizer/BioNTech` is the most used vaccine followed by `Moderna`.

### Vaccinations administered in different countries:

In [None]:
# Preview the first ten rows of the manufacturer data again before grouping by location.
df_mnf.head(10)

In [None]:
# Mean of the reported total_vaccinations values for each location
# (taken over all vaccines and dates recorded for that location).
country = (
    df_mnf
    .groupby(['location'])['total_vaccinations']
    .mean()
    .reset_index()
)

country.head(8)

In [None]:
# Mean total_vaccinations per location, tallest bar first; the legend is
# hidden because the x axis already names every location.
alt.Chart(country).mark_bar().encode(
    x=alt.X('location:N', sort='-y'),
    y='total_vaccinations:Q',
    color=alt.Color('location:N', legend=None),
    tooltip=['location', 'total_vaccinations'],
).properties(width=700, height=400)

In [None]:
# Manufacturer rows whose location is the United States.
is_usa = df_mnf['location'] == "United States"
usa = df_mnf[is_usa]
usa.head()

In [None]:
# Average reported total_vaccinations in the United States, as a single
# horizontal bar split into one colored segment per vaccine.
chart = alt.Chart(usa).mark_bar().encode(
    alt.X('average(total_vaccinations)'),
    alt.Y('location:N'),
    alt.Color('vaccine:N'),
    # Use the same aggregate as the x channel so the hover value matches the
    # length of the segment (the tooltip previously showed the sum instead).
    alt.Tooltip(['vaccine', 'average(total_vaccinations)'])
).properties(
    width=700,
    height=50
)

chart

In [None]:
# Lift Altair's row limit before passing the whole manufacturer frame to a chart.
alt.data_transformers.disable_max_rows()

# One small bar chart per location: average total_vaccinations for each vaccine.
chart = alt.Chart(df_mnf).mark_bar().encode(
    x='vaccine:N',
    y='average(total_vaccinations)',
    column='location',
    tooltip=['location', 'vaccine', 'average(total_vaccinations)'],
)

chart

### Vaccines used in United States:

In [None]:
# Print a numbered list of the vaccines that appear in the US rows.
print("Vaccines used in United States:\n")
vacc_list = list(usa.vaccine.unique())

for position, vaccine_name in enumerate(vacc_list, start=1):
    print(f"{position}) {vaccine_name}")

In [None]:
# Line chart of total_vaccinations over time in the US, one line per vaccine.
# (Despite its name, the variable holds the chart for every vaccine in `usa`.)
pfizer_usa = alt.Chart(usa, title='Vaccinations over time in the US').mark_line().encode(
    x='date:T',
    y='total_vaccinations:Q',
    tooltip=['vaccine', 'total_vaccinations'],
    color='vaccine',
).properties(width=600, height=500)

pfizer_usa

In [None]:
# Small multiples: one panel per location showing total_vaccinations over
# time, with one line per vaccine.
locations = df_mnf['location'].unique()

# Size the grid from the data instead of hard-coding 8 x 4, which fails as
# soon as the dataset contains more than 32 locations.
n_cols = 4
n_rows = max(1, -(-len(locations) // n_cols))  # ceiling division

# 3.75 inches per row keeps the original panel size (30 inches / 8 rows).
fig = plt.figure(figsize=(30, 3.75 * n_rows))

for i, v in enumerate(locations):
    temp_df = df_mnf[df_mnf['location'] == v]
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sb.lineplot(data=temp_df, x='date', y='total_vaccinations', hue='vaccine', ax=ax)
    ax.set_title(v)
    # Thin out the date labels: label every 20th row, or every 40th row when
    # every-20th would still give more than 15 labels.
    step = 40 if len(temp_df['date'][::20]) > 15 else 20
    # plt.xticks targets the current axes, i.e. the panel just added.
    plt.xticks(temp_df['date'][::step], rotation=90)

plt.tight_layout()
plt.show()

In [None]:
# Switch to the country vaccinations data: preview its first rows.
df.head()

In [None]:
# Every distinct country name in the dataset.
df['country'].unique()

# Data on India

In [None]:
# Rows of the country data that belong to India.
is_india = df['country'] == 'India'
df_india = df[is_india]

df_india.head()

In [None]:
# (rows, columns) of the India subset
df_india.shape

In [None]:
# Count of missing values per column in the India subset.
df_india.isna().sum()

In [None]:
# Keep only the India rows that have a value in every column,
# then confirm that no missing values remain.
df_india = df_india.dropna(axis=0, how='any')
df_india.isna().sum()

In [None]:
# First three rows of the India subset.
df_india.head(3)

In [None]:
# Data type of each column in the India subset.
df_india.dtypes

We need to change the data type of the `date` column from `object` to `datetime`.

In [None]:
# Convert the date column to datetime.
# .assign() builds a new frame instead of writing into df_india through
# attribute assignment; df_india was derived from a filtered slice of df, so
# an in-place column write can trigger pandas' SettingWithCopyWarning.
df_india = df_india.assign(date=pd.to_datetime(df_india['date']))

# Confirm the new dtype.
df_india['date'].dtype

In [None]:
# Everything the four India panels share: data, line styling, the date axis,
# the color-by-vaccines legend and the panel size.
base = alt.Chart(df_india).mark_line(size=3, point=True, fill='lightgrey').encode(
    x=alt.X('date:T', axis=alt.Axis(title="Date")),
    color=alt.Color('vaccines:N', legend=alt.Legend(orient='right')),
).properties(width=500, height=400)

# total vaccinations in India
total = base.encode(
    y=alt.Y('total_vaccinations:Q', axis=alt.Axis(title="Total Vaccinations")),
    tooltip=['date', 'total_vaccinations'],
).properties(title='Total Vaccinations Over Time in India')

# people fully vaccinated
people_fully = base.encode(
    y=alt.Y('people_fully_vaccinated:Q', axis=alt.Axis(title="People Fully Vaccinated")),
    tooltip=['date', 'people_fully_vaccinated'],
).properties(title='People Fully Vaccinated Over Time in India')

# daily vaccinations
daily = base.encode(
    y=alt.Y('daily_vaccinations:Q', axis=alt.Axis(title="Daily Vaccinations")),
    tooltip=['date', 'daily_vaccinations'],
).properties(title='Daily Vaccinations Over Time in India')

# people fully vaccinated per hundred
per_hundred = base.encode(
    y=alt.Y('people_fully_vaccinated_per_hundred:Q', axis=alt.Axis(title="People Fully Vaccinated Per Hundred")),
    tooltip=['date', 'people_fully_vaccinated_per_hundred'],
).properties(title='People Fully Vaccinated per hundred in India')

# 2 x 2 layout: `|` places charts side by side, `&` stacks the rows.
top_row = total | people_fully
bottom_row = daily | per_hundred
chart = top_row & bottom_row

chart

# Data on United States

In [None]:
# Rows of the country data that belong to the United States.
is_united_states = df['country'] == 'United States'
df_usa = df[is_united_states]

df_usa.head()

In [None]:
# Count of missing values per column in the US subset.
df_usa.isna().sum()

In [None]:
# Keep only the US rows that have a value in every column,
# then confirm that no missing values remain.
df_usa = df_usa.dropna(axis=0, how='any')

df_usa.isna().sum()

In [None]:
# Current dtype of the date column.
df_usa['date'].dtype

In [None]:
# Convert the date column to datetime.
# .assign() builds a new frame instead of writing into df_usa through
# attribute assignment; df_usa was derived from a filtered slice of df, so
# an in-place column write can trigger pandas' SettingWithCopyWarning.
df_usa = df_usa.assign(date=pd.to_datetime(df_usa['date']))

# Confirm the new dtype.
df_usa['date'].dtype

In [None]:
# First three rows of the US subset.
df_usa.head(3)

In [None]:
# Everything the four US panels share: data, line styling, the date axis,
# the color-by-vaccines legend and the panel size.
base = alt.Chart(df_usa).mark_line(size=3, point=True, fill='lightgrey').encode(
    x=alt.X('date:T', axis=alt.Axis(title="Date")),
    color=alt.Color('vaccines:N', legend=alt.Legend(orient='right')),
).properties(width=500, height=400)

# total vaccinations in United States
total = base.encode(
    y=alt.Y('total_vaccinations:Q', axis=alt.Axis(title="Total Vaccinations")),
    tooltip=['date', 'total_vaccinations'],
).properties(title='Total Vaccinations Over Time in United States')

# people fully vaccinated
people_fully = base.encode(
    y=alt.Y('people_fully_vaccinated:Q', axis=alt.Axis(title="People Fully Vaccinated")),
    tooltip=['date', 'people_fully_vaccinated'],
).properties(title='People Fully Vaccinated Over Time in United States')

# daily vaccinations
daily = base.encode(
    y=alt.Y('daily_vaccinations:Q', axis=alt.Axis(title="Daily Vaccinations")),
    tooltip=['date', 'daily_vaccinations'],
).properties(title='Daily Vaccinations Over Time in United States')

# people fully vaccinated per hundred
per_hundred = base.encode(
    y=alt.Y('people_fully_vaccinated_per_hundred:Q', axis=alt.Axis(title="People Fully Vaccinated Per Hundred")),
    tooltip=['date', 'people_fully_vaccinated_per_hundred'],
).properties(title='People Fully Vaccinated per hundred in United States')

# 2 x 2 layout: `|` places charts side by side, `&` stacks the rows.
top_row = total | people_fully
bottom_row = daily | per_hundred
chart = top_row & bottom_row
chart

In [None]:
# Display the cleaned US subset (pandas shortens the rendering of long frames).
df_usa

In [None]:
# Last row (by position) of the US subset. Its people_vaccinated and
# people_fully_vaccinated values feed the pie charts further down.
df_usa.iloc[-1]

In [None]:
# people_fully_vaccinated value in the last row of the US subset.
df_usa.iloc[-1]['people_fully_vaccinated']

In [None]:
# Approximate US population used for the "remaining" figures below.
USA_POPULATION = 328000000

# Date of the last row in the US subset. Positional access (.iloc[-1]) is
# used because the row's index label changes whenever the CSV is refreshed;
# looking it up by a fixed label raises KeyError on any other data snapshot.
print(f"Data till date: {df_usa['date'].iloc[-1]}")

# Fully vaccinated people in the last row vs. the rest of the population.
people_fully_vaccinated = int(df_usa.iloc[-1]['people_fully_vaccinated'])
remaining_pop_1 = USA_POPULATION-people_fully_vaccinated
print(f"people_fully_vaccinated: {people_fully_vaccinated}, \
        Remaining Population: {remaining_pop_1}")

# People with at least one recorded vaccination vs. the rest of the population.
people_vaccinated = int(df_usa.iloc[-1]['people_vaccinated'])
remaining_pop_2 = USA_POPULATION-people_vaccinated
print(f"people vaccinated: {people_vaccinated}, \
        Remaining Population: {remaining_pop_2}")

print(f"Populatoin of USA (approx): {USA_POPULATION}")

In [None]:
# Two side-by-side pie charts: vaccinated vs. not, and fully vaccinated vs. not.
# Each entry is (title, slice values, slice labels).
panels = [
    (
        'People Vaccinated in USA (completely or partially)',
        [people_vaccinated, remaining_pop_2],
        ['People Vaccinated (USA)', 'People yet to be vaccinated'],
    ),
    (
        'People Fully Vaccinated in USA',
        [people_fully_vaccinated, remaining_pop_1],
        ['People Fully Vaccinated (USA)', 'People not vaccinated fully'],
    ),
]

fig = plt.figure(figsize=(18, 10))
# Pull the first slice (the vaccinated share) slightly out of each pie.
explode = (0.1, 0)

for position, (title, x, labels) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.pie(x, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=50)
    ax.set_title(title)

plt.show()