### Loading Libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Importing the dataset

In [None]:
# Load the two Kaggle CSVs.
# `parse_dates=True` only asks pandas to parse the *index*; no index column is
# set here, so it left `date` as plain strings. Naming the column parses it.
data1 = pd.read_csv("../input/covid-world-vaccination-progress/country_vaccinations.csv", parse_dates=["date"])
data2 = pd.read_csv("../input/covid-world-vaccination-progress/country_vaccinations_by_manufacturer.csv", parse_dates=["date"])

### Descriptive Statistics

In [None]:
# Preview the first five rows of data1.
data1.head(n=5)

In [None]:
# Preview the last five rows of data1.
data1.tail(n=5)

In [None]:
# Convert data1's `date` column to datetime64; unparseable values raise.
data1["date"] = pd.to_datetime(data1["date"], errors="raise")

In [None]:
# Preview the first five rows of data2.
data2.head(n=5)

In [None]:
# Convert data2's `date` column to datetime64; unparseable values raise.
data2["date"] = pd.to_datetime(data2["date"], errors="raise")

In [None]:
# Report (rows, columns) of both tables on one line.
print("data 1", data1.shape, "data2", data2.shape, sep=" ")

In [None]:
# Print data1's column names, non-null counts and dtypes.
data1.info()

In [None]:
# Number of missing values in each column of data1.
data1.isnull().sum(axis=0)

### Dealing With Null Values

1. By deleting the null values

In [None]:
# Copy of data1 keeping only the rows that have no missing value in any column.
data1_del_na = data1.dropna(axis=0, how="any")

In [None]:
# Print the structure of the table that remains after dropping incomplete rows.
data1_del_na.info()
# Author's observation from the output: very little data is left
# (less than half of the original rows).

2. By imputing the missing values with each column's mean

In [None]:
# Replace missing values in each listed numeric column with that column's
# mean, computed over the whole table (all countries and dates together).
# NOTE(review): `people_fully_vaccinated` is not in this list although it is
# plotted later - confirm whether that omission is intentional.
for column in (
    'total_vaccinations',
    'people_vaccinated',
    'daily_vaccinations_raw',
    'daily_vaccinations',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'daily_vaccinations_per_million',
):
    data1[column] = data1[column].fillna(data1[column].mean())

In [None]:
# Print data2's column names, non-null counts and dtypes.
data2.info()
# Author's observation from the output: the vaccinations-by-manufacturer table
# has no missing data.

In [None]:
# Summary statistics for data1 (computed after the mean imputation above).
data1.describe()

In [None]:
# Summary statistics for data2.
data2.describe()

### Some Visualization

In [None]:
# Wide-form line plot of data1 on a 16x10 inch figure, drawn explicitly on
# the axes of the figure just created.
plt.figure(figsize=(16, 10))
sns.lineplot(data=data1, ax=plt.gca())

In [None]:
# For every country keep the single row whose total_vaccinations is largest:
# idxmax gives that row's label within the group and .loc fetches the row.
data_apply = data1.groupby("country").apply(lambda df : df.loc[df.total_vaccinations.idxmax()])
data_apply
# One row per country: the record at which total_vaccinations peaks.

In [None]:
# Derive calendar year and month columns from `date`, building the
# DatetimeIndex once and reading both components from it.
date_index = pd.DatetimeIndex(data1['date'])
data1['year'] = date_index.year
data1['month'] = date_index.month

In [None]:
# Number of rows recorded per date, plotted as a time series.
# The points must be ordered by date (the index), not by count: sorting by
# value made the line jump back and forth across the time axis.
data1['date'].value_counts().sort_index().plot.line()

In [None]:
# Rows recorded per calendar month: value_counts() counts rows for each date
# and resample('M').sum() adds those counts up month by month. This measures
# how many records exist, not how many doses were given.
data1['date'].value_counts().resample('M').sum().plot.line()
# Author's reading of the plot: the peak is around March.

In [None]:
from pandas.plotting import lag_plot

# Lag-1 scatter of the `date` values in the last 500 rows of data1, drawn on
# a 10x10 inch figure.
plt.figure(figsize=(10, 10))
lag_plot(data1['date'].iloc[-500:], lag=1, ax=plt.gca())

In [None]:
# Rank countries by the number of rows that carry a non-null `vaccines` value
# and keep the 20 largest. count() tallies records, not distinct vaccines.
x = data1.groupby(['country']).count()
x = x.sort_values(by='vaccines', ascending=False)
x = x.iloc[0:20].reset_index()

# Horizontal bar chart of those counts.
# x/y are given by keyword together with `data=`: seaborn deprecated
# positional data vectors in 0.11 and rejects them from 0.12 onwards.
plt.figure(figsize=(8, 4))
ax = sns.barplot(x='vaccines', y='country', data=x, alpha=0.8)
plt.title("Country V/s Vaccine ")
plt.ylabel('Country', fontsize=12)
plt.xlabel('Vaccine', fontsize=12)
plt.show()

In [None]:
# Vaccines with their value counts and country, min and max to they are vaccinated.
# Per vaccine combination: number of rows (`len`) and the alphabetically
# first / last country (`min` / `max`) that uses it.
# String aggregation names replace the builtins len/min/max, which pandas 2.1+
# warns about when passed to agg; the keyword names keep the same columns.
data1.groupby("vaccines").country.agg(len="size", min="min", max="max")
# Author's observation from the output: "Moderna, Oxford/AstraZeneca,
# Pfizer/BioNTech" is the most frequently used combination.

In [None]:
# Vaccines with their value counts and number of min and max people vaccinated.
# Per vaccine combination: number of rows (`len`) and the smallest / largest
# people_vaccinated value (`min` / `max`).
# String aggregation names replace the builtins len/min/max, which pandas 2.1+
# warns about when passed to agg; the keyword names keep the same columns.
data1.groupby("vaccines").people_vaccinated.agg(len="size", min="min", max="max")

In [None]:
# Sum daily_vaccinations over all rows sharing a date, then plot the series.
ts = data1.groupby(["date"])["daily_vaccinations"].sum()
# astype returns a new Series; the original call discarded the result, so
# rebind it to make the conversion take effect.
ts = ts.astype('float')
plt.figure(figsize=(16,8))
plt.title('Daily Vaccination')
plt.xlabel('Date')
plt.ylabel('Total Vaccinated')
plt.plot(ts);

In [None]:
# The 15 most frequent values of the `vaccines` column with their row counts.
sort_vaccine = data1["vaccines"].value_counts().iloc[:15]
sort_vaccine

In [None]:
# Column-wise maximum inside each `vaccines` group, then turn the group key
# back into an ordinary column.
vaccine_gp = data1.groupby(["vaccines"])
vac_gp_max = vaccine_gp.max(numeric_only=False)
max_vaccine = vac_gp_max.reset_index(drop=False)
max_vaccine

In [None]:
# Overlay kernel density estimates of three count columns on one
# 10x10 inch figure, one labelled curve per column.
plt.figure(figsize=(10, 10))
for column, label in (
    ('total_vaccinations', 'total vaccine'),
    ('people_vaccinated', 'vaccinated'),
    ('people_fully_vaccinated', 'fully vaccinated'),
):
    sns.kdeplot(data=data1[column], label=label)
plt.legend()

In [None]:
# Horizontal bar chart: people_vaccinated for each vaccine combination.
# Labels and title share one red, size-20 text style.
plt.figure(figsize=(12, 12))
sns.barplot(data=data1, x="people_vaccinated", y="vaccines", ax=plt.gca())
plt.xlabel("People Vaccinated", color="Red", fontsize=20)
plt.ylabel("Vaccines", color="Red", fontsize=20)
plt.title(" Vaccines V/S People Vaccinated", color="Red", fontsize=20)
plt.show()

In [None]:
# Horizontal bar chart: total_vaccinations for each vaccine combination.
# Labels and title share one red, size-20 text style.
plt.figure(figsize=(12, 12))
sns.barplot(data=data1, x="total_vaccinations", y="vaccines", ax=plt.gca())
plt.xlabel("Total Vaccination", color="Red", fontsize=20)
plt.ylabel("Vaccines", color="Red", fontsize=20)
plt.title(" Vaccines V/S Total Vaccination", color="Red", fontsize=20)
plt.show()

In [None]:
# Column-wise maximum of the numeric columns for each country.
country_gp = data1.groupby(['country'])
# The first positional parameter of GroupBy.max is `numeric_only`, not a
# column name: the string 'people_vaccinated' was only acting as a truthy
# flag. Pass the flag explicitly so the call says what it does.
country_gp_max = country_gp.max(numeric_only=True)
max_country = country_gp_max.reset_index()
max_country

In [None]:
# Pairwise scatter/histogram grid of data1.
# pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an extra empty figure and
# never sized the grid. To resize, pass `height=` (inches per facet) and
# `aspect=` to pairplot itself.
sns.pairplot(data=data1)

In [None]:
# Pairwise correlation of the numeric columns.
# data1 also holds text and datetime columns; since pandas 2.0
# DataFrame.corr() no longer drops those silently and raises instead, so
# select the numeric columns explicitly (works on older pandas too).
correlation = data1.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix on a 15x10 inch figure,
# drawn explicitly on the axes of the figure just created.
plt.figure(figsize=(15, 10))
sns.heatmap(correlation, annot=True, ax=plt.gca())

In [None]:
# Scatter plot with fitted regression line: daily_vaccinations_per_million
# against total_vaccinations, on a 10x5 inch figure.
plt.figure(figsize=(10, 5))
sns.regplot(data=data1, x="total_vaccinations", y="daily_vaccinations_per_million", ax=plt.gca())

In [None]:
# Regression plot of people_vaccinated against total_vaccinations.
# lmplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an extra empty figure and
# never sized the plot. To resize, pass `height=` and `aspect=` to lmplot.
sns.lmplot(x='total_vaccinations',y='people_vaccinated',data=data1)