In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Covid World Vaccination Progress**

# **Importing Libraries**

In [None]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
import statsmodels.tsa.stattools as sts
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.graphics.tsaplots as sgt
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from itertools import product
# Notebook-wide settings: cap DataFrame reprs at 10 rows and apply seaborn's
# default styling to every matplotlib figure.
pd.set_option('display.max_rows', 10)
sns.set()

# **Importing Data**

In [None]:
# Load the country vaccination records from the Kaggle dataset.
vaccinations_csv = '/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv'
raw_data = pd.read_csv(vaccinations_csv)


# **Data Preprocessing**

In [None]:
# Work on a copy so raw_data stays untouched, then print column dtypes and
# non-null counts.
dfc = raw_data.copy(deep=True)
dfc.info()

In [None]:
# Count missing values per column before any cleaning.
dfc.isna().sum()

* Changing dates to datetime format
* Interpolating values for numerical data

In [None]:
# Parse the date strings (YYYY-MM-DD) into datetime values.
dfc['date'] = pd.to_datetime(dfc['date'], format='%Y-%m-%d')

# The numeric columns are picked by label slice, so this relies on the column
# order of the CSV.
numeric_cols = dfc.loc[:, 'total_vaccinations':'daily_vaccinations_per_million'].columns.tolist()

# Fill gaps country by country in a single grouped pass instead of re-building
# the boolean mask and the label slice four times per country:
#   1. interpolate linearly between known values (row order within a country is
#      used as-is),
#   2. set whatever is still missing (e.g. leading gaps) to 0.
dfc[numeric_cols] = (
    dfc.groupby('country')[numeric_cols]
    .transform(lambda values: values.interpolate().fillna(0))
)
    



* With NA values filled with 0 (done in the loop above), casting the count columns to integers and the rate columns to floats

In [None]:
# Cast the count columns to int and the rate columns to float.
# Assigning through `dfc.loc[:, ...] = ...` writes the values into the existing
# float columns, so the int cast would never change the dtype. Replacing the
# columns with `dfc[cols] = ...` makes the new dtype stick.
count_cols = dfc.loc[:, 'total_vaccinations':'daily_vaccinations'].columns.tolist()
rate_cols = dfc.loc[:, 'total_vaccinations_per_hundred':'daily_vaccinations_per_million'].columns.tolist()

dfc[count_cols] = dfc[count_cols].astype(int)      # truncates interpolated fractions
dfc[rate_cols] = dfc[rate_cols].astype(float)

* Assigning country codes

In [None]:
# Fill in an ISO code for the entities whose iso_code is missing.
# Only rows where iso_code is currently null are touched.
iso_code_fallbacks = {
    'England': 'GB-ENG',
    'Northern Cyprus': 'CYP',
    'Northern Ireland': 'GB-NIR',
    'Scotland': 'GB-SCT',
    'Wales': 'GB-WLS',
}
for country_name, fallback_code in iso_code_fallbacks.items():
    needs_code = (dfc.country == country_name) & dfc.iso_code.isna()
    dfc.loc[needs_code, 'iso_code'] = fallback_code

In [None]:
# Re-check missing values per column after cleaning.
dfc.isna().sum()

In [None]:
# Preview the first five cleaned rows.
dfc.head(n=5)

In [None]:
# Summarise the coverage of the dataset: date range, number of distinct
# countries, and number of distinct values in the `vaccines` column.
# (Fixes the "availabe" / "countires" typos in the printed messages.)
print('Data available from:' , dfc.date.min(),' to: ',dfc.date.max())
print('Total countries available in data:\t', dfc.country.unique().size)
print('Types of vaccines in data:\t',dfc.vaccines.unique().size)

# **Data Visualisation**

**Country Wise Total Vaccinations**

In [None]:
# Per-country column maxima, ordered from the highest total_vaccinations down.
total_vaccinations = (
    dfc.groupby('country')
    .max()
    .sort_values('total_vaccinations', ascending=False)
)

In [None]:
# Bar chart of each country's maximum total_vaccinations (one bar per country).
# The former `hue_order='total_vaccinations'` argument was dropped: hue_order
# only orders the levels of a `hue` variable, and no hue is used here.
plt.figure(figsize=(20,8))
sns.barplot(data=total_vaccinations,x=total_vaccinations.index,y='total_vaccinations')
plt.xlabel('country',fontsize=20)
plt.ylabel('total vaccinations',fontsize=20)
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.title('country wise total vaccinations',fontsize=30)
plt.show()

* As can be seen from the bar graph the leading country in terms of vaccination is the United states followed by China and the United Kingdom.

* There are many countries on the extreme right of the chart with very low total vaccinations. It can be noted that the population of these countries is low.

**Country wise Average Daily Vaccinations**

In [None]:
# Per-country mean of the numeric columns, ordered by average daily vaccinations.
# numeric_only=True is required: the frame also holds text columns (e.g.
# iso_code, vaccines), and pandas >= 2.0 raises a TypeError when asked to
# average them instead of silently dropping them.
mean_daily_vacc = (
    dfc.groupby(['country'])
    .mean(numeric_only=True)
    .sort_values(by=['daily_vaccinations'], ascending=False)
)

In [None]:
# Bar chart of each country's average daily vaccinations.
# The former `hue_order='daily_vaccinations'` argument was dropped: hue_order
# only orders the levels of a `hue` variable, and no hue is used here.
plt.figure(figsize=(20,8))
sns.barplot(data=mean_daily_vacc,x=mean_daily_vacc.index,y='daily_vaccinations')
plt.xlabel('country',fontsize=20)
plt.ylabel('daily vaccinations',fontsize=20)
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.title('country wise average daily vaccinations',fontsize=30)
plt.show()

* The chart shows the average number of daily vaccinations in each country. Average daily vaccinations in America are higher than in China and India, which are the most populated countries.

**Total Vaccinations Per Hundred**

In [None]:
# Per-country column maxima, ordered from the highest
# total_vaccinations_per_hundred down.
tot_vacc_hundred = (
    dfc.groupby('country')
    .max()
    .sort_values('total_vaccinations_per_hundred', ascending=False)
)

In [None]:
# Bar chart of each country's maximum total_vaccinations_per_hundred.
# Fixes: the y-axis label said "daily vaccinations" although the plotted column
# is total_vaccinations_per_hundred; the meaningless `hue_order` argument
# (no `hue` is used) was dropped.
plt.figure(figsize=(20,8))
sns.barplot(data=tot_vacc_hundred,x=tot_vacc_hundred.index,y='total_vaccinations_per_hundred')
plt.xlabel('country',fontsize=20)
plt.ylabel('total vaccinations per hundred',fontsize=20)
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.title('Total Vaccinations Per Hundred',fontsize=30)
plt.show()

* This is the ratio of total vaccinations to the total population of the country, expressed per hundred people.
* Some countries with a very low number of total vaccinations have vaccinated the largest share of their population.
* The United States, China and India, which have the highest total vaccinations, do not have the largest share of their population vaccinated.

**Country Wise Daily Vaccinations**

In [None]:
# One line per country showing daily vaccinations over time.
# Fix: the y variable was 'total_vaccinations', contradicting the section
# heading, the axis label and the title, which all describe daily vaccinations.
plt.figure(figsize=(20,8))
sns.lineplot(data=dfc,x='date',y='daily_vaccinations',hue='country')
plt.xlabel('Date',fontsize=20)
plt.ylabel('daily vaccinations',fontsize=20)
plt.title('Country wise Daily Vaccinations',fontsize=30)
# Anchor the (long) country legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05,0.6,0.5,0.4),loc=2,borderpad=0)
plt.show()

* Daily vaccinations done by countries

 **People Vaccinated vs Fully Vaccinated**

In [None]:
# Sum every numeric column across all countries for each date.
# numeric_only=True keeps the text columns (iso_code, vaccines, ...) out of the
# aggregation; since pandas 2.0 they are no longer dropped by default.
# NOTE(review): the data lists England, Scotland, Wales and Northern Ireland
# as separate "countries"; if the United Kingdom is also present these sums
# may double count -- confirm.
dfc_vac_vs_fullvac = dfc.groupby(['date']).sum(numeric_only=True)

In [None]:
# Compare the summed people_vaccinated and people_fully_vaccinated over time.
# Fixes:
#   * title and y-label were copied from the daily-vaccinations plot and did not
#     describe this chart;
#   * the legend was built from hand-made patches with hard-coded hex colours
#     that only match the lines while the palette stays at its default. Labelling
#     the lines directly keeps the legend and the lines in sync.
plt.figure(figsize=(20,8))
sns.lineplot(data=dfc_vac_vs_fullvac,x=dfc_vac_vs_fullvac.index,y=dfc_vac_vs_fullvac.people_vaccinated,label='People Vaccinated')
sns.lineplot(data=dfc_vac_vs_fullvac,x=dfc_vac_vs_fullvac.index,y=dfc_vac_vs_fullvac.people_fully_vaccinated,label='People fully Vaccinated')
plt.xlabel('Date',fontsize=20)
plt.ylabel('people',fontsize=20)
plt.title('People Vaccinated vs Fully Vaccinated',fontsize=30)
plt.legend(loc='upper left')
plt.show()

**Types of Vaccines**

In [None]:
# For every distinct value of `vaccines`, take the largest total_vaccinations
# seen in any row, ordered from largest to smallest.
# Result columns: vaccines, total_vaccinations.
dfc_vacc = (
    dfc.groupby('vaccines')['total_vaccinations']
    .max()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Bar chart of the largest total_vaccinations recorded per `vaccines` value.
# The former `hue_order='total_vaccinations'` argument was dropped: hue_order
# only orders the levels of a `hue` variable, and no hue is used here.
plt.figure(figsize=(20,8))
sns.barplot(data=dfc_vacc,x='vaccines',y='total_vaccinations')
plt.xlabel('Vaccines',fontsize=20)
plt.xticks(rotation=90)  # vaccine labels are long; rotate so they do not overlap
plt.ylabel('Total Vaccinations',fontsize=20)
plt.title('Types of Vaccines',fontsize=30)
plt.show()

* Moderna and Pfizer/BioNTech are the most used vaccines.

**Country Wise used Vaccines**

In [None]:
# Table of country, vaccines used and maximum total_vaccinations, ordered from
# the highest total down.
# The last reset_index now uses drop=True: the old positional index was being
# added to the table as a meaningless extra "index" column.
(
    dfc.groupby(['country'])
    .max()
    .reset_index()[['country', 'vaccines', 'total_vaccinations']]
    .sort_values(by='total_vaccinations', ascending=False)
    .reset_index(drop=True)
)

# **Forecasting**

In [None]:
# Time series for forecasting: for each date, the largest total_vaccinations
# value in any row, as integers, indexed by date.
dfc_tsa = (
    dfc.groupby('date')[['total_vaccinations']]
    .max()
    .astype(int)
)

In [None]:
# Plot the series at 20x8 inches.
# DataFrame.plot() opens its own figure, so a preceding plt.figure(figsize=...)
# only produced an empty extra figure and the chart kept the default size.
# Passing figsize to plot() sizes the figure that is actually drawn.
dfc_tsa.plot(figsize=(20,8))
plt.show()

Doing adfuller test on data

In [None]:
# Augmented Dickey-Fuller test on the raw series.
# The column is selected explicitly: adfuller expects one-dimensional input, and
# later cells add D1/D2/log1 columns to dfc_tsa, so passing the whole frame would
# fail whenever this cell is re-run after them.
sts.adfuller(dfc_tsa.total_vaccinations)

The p-value is greater than 0.05, so a transformation is needed to make the series stationary.

In [None]:
# First difference: change in total_vaccinations from the previous row.
dfc_tsa['D1'] = dfc_tsa['total_vaccinations'].diff(periods=1)

In [None]:
# ADF test on the first difference; the first row is skipped because diff(1)
# leaves it as NaN.
sts.adfuller(dfc_tsa['D1'].iloc[1:])

In [None]:
# Second-order difference (difference of the first difference).
# The previous `.diff(2)` computed x[t] - x[t-2], a lag-2 difference, which is
# not second-order differencing. The first two rows are NaN either way.
dfc_tsa['D2'] = dfc_tsa.total_vaccinations.diff().diff()

In [None]:
# ADF test on D2; the first two rows are skipped because they are NaN.
sts.adfuller(dfc_tsa['D2'].iloc[2:])

Doing log transformation to make data stationary

In [None]:
# Natural log of the series, stored alongside the original values.
dfc_tsa['log1'] = np.log(dfc_tsa['total_vaccinations'])

In [None]:
# Plot the log-transformed series on a 20x8 inch figure.
log_ax = plt.figure(figsize=(20, 8)).gca()
dfc_tsa['log1'].plot(ax=log_ax)
plt.show()

observing trend and seasonal decomposition

In [None]:
# Split the log series into trend, seasonal and residual components.
# period=1 made the decomposition degenerate (a one-step "season" cannot carry
# any seasonal pattern, so the trend is just the series itself). The
# observations are daily, so a weekly period of 7 is used instead.
ses_dec=seasonal_decompose(dfc_tsa.log1,period=7)
ses_dec.plot()
plt.show()

In [None]:
# ADF test on the log-transformed series.
sts.adfuller(dfc_tsa['log1'])

Plotting ACF and PACF

In [None]:
# Autocorrelation of the log series for lags 1-40 (lag 0 is hidden).
log_series = dfc_tsa['log1']
sgt.plot_acf(log_series, zero=False, lags=40)
plt.show()

In [None]:
# Partial autocorrelation of the log series for lags 1-30 (lag 0 is hidden),
# estimated with the OLS method.
log_series = dfc_tsa['log1']
sgt.plot_pacf(log_series, method='ols', zero=False, lags=30)
plt.show()

From the ACF plot we can say that lags up to the 4th are significant,
so we will choose parameters in the range 0-4.

In [None]:

# Exhaustive SARIMAX grid search on the log series from 2020-12-15 onwards,
# keeping the fit with the lowest AIC.
#
# Each candidate p is a tuple (ar, ma, seasonal_ar, seasonal_diff, seasonal_ma,
# seasonal_period):
#   order          = (p[0], 0, p[1])            -> AR and MA orders 0-4, no differencing
#   seasonal_order = (p[2], p[3], p[4], p[5])   -> P 0-4, D 0-1, Q 0-4, period 0-2
#
# Outputs left in the notebook namespace:
#   results       - list of [params, aic], one entry each time the best AIC improved
#   best_aic      - lowest AIC found (math.inf if no candidate could be fitted)
#   best_param    - parameter tuple of the best fit (None if no fit succeeded)
#   optimal_model - fitted results object of the best fit (None if no fit succeeded)
train_series = dfc_tsa.log1['2020-12-15':]   # hoisted: identical for every candidate
param_grid = product(range(5), range(5), range(5), range(2), range(5), range(3))

results = []
best_aic = math.inf
best_param = None
optimal_model = None

for p in param_grid:
    try:
        model = SARIMAX(train_series, order=(p[0], 0, p[1]), seasonal_order=(p[2], p[3], p[4], p[5]))
        result = model.fit(disp=False)
    except Exception:
        # Skip combinations that SARIMAX rejects or that fail to fit.
        # `except Exception` (rather than a bare except) still lets
        # KeyboardInterrupt stop this long-running search.
        continue

    aic = result.aic
    if aic < best_aic:
        optimal_model = result
        best_aic = aic
        best_param = p
        results.append([best_param, best_aic])
        

In [None]:
# Show the recorded [parameters, AIC] pairs; the search loop appends an entry
# only when a fit improves on the best AIC seen so far.
results

In [None]:
# Report the parameter tuple and AIC of the best model found by the grid search.
print('best parameter is:',best_param,' with AIC value:',best_aic)

Model Summary

In [None]:
# Display the summary of the lowest-AIC fitted model.
optimal_model.summary()


Checking residual lag

In [None]:
# Autocorrelation of the best model's residuals for lags 1-40 (lag 0 is hidden).
residuals = optimal_model.resid
sgt.plot_acf(residuals, zero=False, lags=40)
plt.show()

 Comparing our model with real data

In [None]:

# Overlay the model's in-sample fit (red) on the observed series. The model was
# fitted on log values, so the fitted values are exponentiated back to the
# original scale first.
fitted_original_scale = np.exp(optimal_model.fittedvalues)

plt.figure(figsize=(20, 8))
dfc_tsa['total_vaccinations'].plot()
fitted_original_scale.plot(color='r')
plt.show()

Making predictions

In [None]:
# Plot the observed series together with the model's prediction for the last
# 5 fitted observations plus the steps that follow, on the original scale.
#
# Integer start/end passed to predict() are positions within the series the
# model was fitted on. That series is the slice from 2020-12-15 onwards, so the
# positions must be derived from its length, not from the length of the full
# dfc_tsa series (which shifts the window whenever data exists before that date).
n_fitted = len(optimal_model.fittedvalues)
predicted_log = optimal_model.predict(start=n_fitted-5,end=n_fitted+10,dynamic=False)

plt.figure(figsize=(20,8))
dfc_tsa.total_vaccinations.plot()
np.exp(predicted_log).plot()   # undo the log transform used for fitting
plt.show()


# **Thank you**