In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
from matplotlib import style

# Notebook-wide plot styling: dark theme, large bold sans-serif text, wide figures.
style.use('dark_background')

font = dict(family = 'sans-serif', weight = 'bold', size = 20)
plt.rc('font', **font)

# Same tick-label size on both axes.
for axis in ('xtick', 'ytick'):
    plt.rc(axis, labelsize=20)

plt.rcParams.update({'figure.figsize': (15, 8)})

In [None]:
# Load the country-level vaccination records; `parse_dates` asks pandas to parse the 'date' column as datetimes.
df = pd.read_csv('/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv', parse_dates = ['date'])

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Keep only India's rows. An explicit copy makes `ind` independent of `df`, so
# modifying `ind` afterwards neither triggers SettingWithCopyWarning nor risks
# acting on a temporary view of the full dataset.
ind = df[df['country'] == 'India'].copy()

# Missing-value count per column for India.
ind.isna().sum()

The dataset contains 15 variables, but I use only 5 of them. Three are the date, the number of people vaccinated, and the number of people vaccinated per hundred. As per the documentation (the description of the data), two further columns, namely the number of people fully vaccinated and the number of people fully vaccinated per hundred, count those who have received the complete course (i.e. taken the vaccine a couple of times). Thus, I include the latter 2 as well.

That said, it would be wise to eliminate or substitute the null values of the aforesaid variables. I do that later.

In [None]:
def GroupByCountryAndDaily(df):
    """Aggregate `df` per day, separately for every country.

    Splits `df` on its 'country' column and passes each country's rows to
    GroupByDay.

    Returns:
        dict mapping each country name to that country's daily frame.
    """
    return {country: GroupByDay(rows) for country, rows in df.groupby('country')}

def GroupByDay(df, function = np.mean):
    """Aggregate the vaccination columns of `df` per calendar date.

    Args:
        df: frame containing a 'date' column and the five vaccination columns
            selected below.
        function: aggregation passed to `groupby(...).aggregate`; defaults to
            the mean, so several records for the same date are averaged.

    Returns:
        A frame indexed by date with one row per date, holding the aggregated
        columns plus a 'date' column that mirrors the index.
    """
    selected = ['date', 'people_vaccinated_per_hundred', 'people_fully_vaccinated',
                'people_vaccinated', 'people_fully_vaccinated_per_hundred',
                'daily_vaccinations_raw']

    # Passing the NumPy function itself to groupby.aggregate is deprecated in
    # recent pandas (it emits a FutureWarning); the string name selects the
    # same built-in groupby mean without the warning.
    if function is np.mean:
        function = 'mean'

    daily = df[selected].groupby('date').aggregate(function)
    # Keep the date available as a regular column as well as the index.
    daily['date'] = daily.index
    return daily

Next, I group the data by country and then, for every date, take the mean of each selected column. I do this because I might be interested in analyzing various countries together. Also, there might be multiple records for a given day, which must be aggregated by taking their average.

In [None]:
# Replace missing values with 0.0 in the columns that are analysed later.
# Calling `.fillna(..., inplace=True)` on `ind.loc[:, col]` fills a temporary
# Series and is not guaranteed to write back into `ind`, so the filled frame is
# assigned explicitly instead.
# NOTE(review): the *_vaccinated columns look cumulative; a 0.0 fill in the
# middle of the series would show up as a dip - confirm this is intended.
ind = ind.fillna({
    'people_fully_vaccinated': 0.0,
    'people_fully_vaccinated_per_hundred': 0.0,
    'daily_vaccinations_raw': 0.0,
})

In [None]:
# Count the missing values per column of `ind`.
ind.isna().sum()

In [None]:
# Aggregate `ind` per country and per day; the result is a dict keyed by country name.
dailies = GroupByCountryAndDaily(ind)

In [None]:
# Take India's daily aggregate out of the dict as its own DataFrame.
ind_df = pd.DataFrame(dailies['India'])

In [None]:
# Drop every row (date) that still has a missing value in any column.
# `axis` is passed by keyword: DataFrame.dropna is keyword-only in pandas 2.0+,
# where the positional form `dropna(0, ...)` raises a TypeError.
ind_df.dropna(axis = 0, inplace = True)
ind_df.head()

In [None]:
# Express the absolute counts in thousands. The columns are selected by name
# rather than by position, so the rescaling does not depend on column order.
# NOTE: this rescales `ind_df` in place - run this cell only once per session.
THOUSAND = 1000
count_columns = ['people_fully_vaccinated', 'people_vaccinated', 'daily_vaccinations_raw']
ind_df[count_columns] = ind_df[count_columns] / THOUSAND

In [None]:
# Map the snake_case column names to space-separated labels (used as plot legends).
columns = dict([
    ('people_vaccinated_per_hundred', 'people vaccinated per hundred'),
    ('people_fully_vaccinated', 'people fully vaccinated'),
    ('people_vaccinated', 'people vaccinated'),
    ('people_fully_vaccinated_per_hundred', 'people fully vaccinated per hundred'),
    ('daily_vaccinations_raw', 'daily vaccinations raw'),
])

ind_df.rename(columns, axis = 'columns', inplace = True)
ind_df.head()

# Analysis of the number and the percentage of people vaccinated 

In [None]:
# Line plot of the two per-hundred columns of `ind_df`, with grid lines.
pct_columns = ['people vaccinated per hundred', 'people fully vaccinated per hundred']
ax = ind_df[pct_columns].plot(ylabel = '% vaccinated', linewidth = 3)
ax.grid()

In [None]:
# Line plot of the two count columns of `ind_df`, with grid lines.
count_columns = ['people vaccinated', 'people fully vaccinated']
ax = ind_df[count_columns].plot(ylabel = '(in 1000) vaccinated', linewidth = 3)
ax.grid()

# Analytic distribution of number of people vaccinated

In [None]:
def EvalCdf(sample, x):
    """Empirical CDF of `sample` evaluated at `x`.

    Returns the fraction of values in `sample` that are less than or equal
    to `x`.
    """
    at_or_below = sum(1 for value in sample if value <= x)
    return at_or_below / len(sample)

In [None]:
# Empirical CDF of the 'people vaccinated' values: sort them, evaluate the CDF
# at every observed value, and plot the resulting curve.
vaccinated = list(ind_df['people vaccinated'].values)
vaccinated.sort()
cdf = list(map(lambda x: EvalCdf(vaccinated, x), vaccinated))

plt.plot(vaccinated, cdf, linewidth = 3)
plt.grid()

The empirical CDF above resembles that of an exponential distribution, which suggests that the number of people vaccinated has been increasing exponentially.

# Analysis of the number of people being vaccinated daily

In [None]:
# Bar chart of the daily vaccination numbers (in thousands); the first row of
# `ind_df` is left out of the bars.
plt.bar(ind_df.index[1:], ind_df['daily vaccinations raw'][1:].values, label = 'daily vaccinations')
plt.xlabel('date')
plt.ylabel('(in 1000)daily vaccination numbers')
# The reference line is the median of the daily numbers, so it is labelled as
# the median (the legend previously called it 'population mean').
# NOTE(review): the median is taken over all rows, including the first row
# that the bars skip - confirm this is intended.
plt.axhline(y = np.median(ind_df['daily vaccinations raw'].values), color = 'maroon', ls = ':', linewidth = 10, label = 'median daily vaccinations')
plt.legend(loc = 9)
plt.xticks(rotation = 90)
plt.grid()