In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the per-country progress file and the per-manufacturer file.
country_vacc, vacc_manu = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv',
        '/kaggle/input/covid-world-vaccination-progress/country_vaccinations_by_manufacturer.csv',
    )
)

In [None]:
# Summary statistics of the per-manufacturer data (describe() defaults)
vacc_manu.describe()

In [None]:
# Preview the first rows of the per-manufacturer data
vacc_manu.head()

In [None]:
# Count the missing values in each column of the per-manufacturer data
vacc_manu.isna().sum()

In [None]:
# Keep only the vaccine name and its vaccination count; this frame is reused
# by a later cell.
vacc = vacc_manu.drop(columns=['date', 'location'])
print(vacc.head())

# total_vaccinations is a running (cumulative) count for each location/vaccine
# pair, so adding up every dated row would count the same doses once per
# reported day. Take the highest value reached in each location first, then
# add the locations together to get one total per vaccine.
peak_by_location = vacc_manu.groupby(['location', 'vaccine'], sort=False)['total_vaccinations'].max()
new_vacc = peak_by_location.groupby(level='vaccine', sort=False).sum()
print(new_vacc.head())

In [None]:
# Per-vaccine totals built with pivot tables.
# total_vaccinations is a running (cumulative) count for each location/vaccine
# pair, so the first pivot keeps the highest value reached in each location and
# the second pivot adds the locations together. Summing every dated row
# directly would count the same doses once per reported day.
latest_pivot = pd.pivot_table(
    vacc_manu,
    index=['location', 'vaccine'],
    values=['total_vaccinations'],
    aggfunc='max',
).reset_index()
# The aggregation is named by string ('sum') because passing np.sum to pandas
# aggregations is deprecated.
df1 = pd.pivot_table(
    latest_pivot,
    index=['vaccine'],
    values=['total_vaccinations'],
    aggfunc='sum',
)
print(df1.head())
# info() writes its report itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
df1.info()

In [None]:
# Bar chart of the total vaccinations recorded for each vaccine.
# A bar plot is used because histplot with both x and y draws a 2-D histogram
# (heatmap) rather than one bar per vaccine.
# df1 keeps 'vaccine' in its index, so it is moved back to a column for plotting.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=df1.reset_index(), x='vaccine', y='total_vaccinations', ax=ax)
ax.set_title('Total vaccinations by vaccine')
ax.set_xlabel('Vaccine')
ax.set_ylabel('Total vaccinations')
# Vaccine names are long; rotate them so they do not overlap.
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
# Vaccinations per vaccine in each country.

# Frame without the date column; it is reused by a later cell.
vacc_per_country = vacc_manu.drop(columns=['date'])

# total_vaccinations is a running (cumulative) count, so the figure for each
# location/vaccine pair is the highest value reached, not the sum of all dated
# rows. The aggregation is named by string because passing np.sum to pandas
# aggregations is deprecated.
df2 = pd.pivot_table(vacc_manu, index=['location', 'vaccine'], values=['total_vaccinations'], aggfunc='max')
df2.head()

In [None]:
# Bar chart of the number of vaccinations per vaccine in each country.
# DataFrame.plot creates its own figure, so the size is passed to it directly;
# a preceding plt.figure(figsize=...) would only open a separate empty figure.
df2.plot.bar(figsize=(12, 10))
plt.show()

In [None]:
# Group the data by location and vaccine and count how many rows (reported
# days) each vaccine has in each country.
df3 = vacc_per_country.drop('total_vaccinations', axis=1)
df = df3.groupby(['location', 'vaccine']).size().reset_index(name='Freq')

font = {'family': 'calibri',
        'color':  'darkblue',
        'weight': 'bold',
        'size': 12,
        }

# Size the grid from the data instead of hard-coding 5x3, which fails as soon
# as there are more than 15 locations.
locations = df['location'].unique()
n_cols = 3
n_rows = max(1, -(-len(locations) // n_cols))  # ceiling division
# All panels are created in one call so no stray full-figure axes is left
# behind them. The height keeps the original 8 inches for five rows and grows
# with the number of rows.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, max(8, 1.6 * n_rows)), squeeze=False)

for ax, loc in zip(axes.ravel(), locations):
    sns.barplot(data=df[df['location'] == loc], x='vaccine', y='Freq', ax=ax)
    # Label through the axes object rather than pyplot's "current axes" state.
    ax.set_title(loc)
    ax.set_xlabel('Vaccines Types', fontdict=font)
    ax.set_ylabel('days #', fontdict=font)

# Hide the grid cells that have no country to show.
for ax in axes.ravel()[len(locations):]:
    ax.set_visible(False)

fig.tight_layout()
# plt.show() works with the inline notebook backend; fig.show() only emits a
# "non-GUI backend" warning there.
plt.show()

**The graph above shows that Pfizer/BioNTech is the most preferred vaccine in most countries, followed by Moderna.**

In [None]:
# Preview the first rows of the per-country vaccination dataset
country_vacc.head()

In [None]:
# Count the missing values in each column of the per-country dataset
country_vacc.isna().sum()

In [None]:
# Summary statistics of the per-country data (describe() defaults)
country_vacc.describe()

In [None]:
# Column names, non-null counts and dtypes of the per-country data
country_vacc.info()

In [None]:
# The source columns, the vaccine list and the country name are not used in the
# analysis below, so they are removed; every remaining NaN becomes 0.0.
country_vacc_new = (
    country_vacc
    .drop(columns=['source_name', 'source_website', 'vaccines', 'country'])
    .replace(np.nan, 0.0)
)
country_vacc_new.head()

In [None]:
# Split the date string on '-' into year, month and day columns
# (the date is assumed to be formatted as YYYY-MM-DD).
date_parts = country_vacc_new['date'].str.split('-', expand=True)
country_vacc_new[['year', 'month', 'day']] = date_parts

In [None]:
# The raw date and the day of month are not needed from here on.
country_vacc_new = country_vacc_new.drop(columns=['date', 'day'])
country_vacc_new.head()

In [None]:
# Aggregate the daily rows into one row per country and calendar month.
# Columns named total_* / people_* hold running (cumulative) figures, so their
# monthly value is the highest one reached in the month; summing them would
# count the same people once per reported day. The remaining columns (daily
# counts) are added up.
group_keys = ['iso_code', 'month', 'year']
monthly_agg = {
    col: 'max' if col.startswith(('total_', 'people_')) else 'sum'
    for col in country_vacc_new.columns
    if col not in group_keys
}
df4 = country_vacc_new.groupby(group_keys).agg(monthly_agg).reset_index()
df4.head()

In [None]:
# Visualise the people vaccinated in each country, one panel per calendar month.
font = {'family': 'calibri',
        'color':  'darkblue',
        'weight': 'bold',
        'size': 12,
        }

# One panel per (year, month) pair in chronological order. Pairing the month
# with its year keeps the same month of different years apart and gives a
# plain-string title (concatenating with the array returned by unique() would
# render as an array).
periods = list(
    df4[['year', 'month']]
    .drop_duplicates()
    .sort_values(['year', 'month'])
    .itertuples(index=False, name=None)
)

# Six panels per row as before; extra rows are added when there are more than
# six periods instead of failing on the seventh subplot.
n_cols = 6
n_rows = max(1, -(-len(periods) // n_cols))  # ceiling division
# All panels are created in one call so no stray full-figure axes is left
# behind them.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 35 * n_rows), squeeze=False)

for ax, (year, month) in zip(axes.ravel(), periods):
    in_period = (df4['year'] == year) & (df4['month'] == month)
    sns.barplot(data=df4[in_period], y='iso_code', x='people_vaccinated', ax=ax)
    ax.set_title(f'{month}/{year}')
    # Bars are horizontal: counts run along x, country codes along y.
    ax.set_xlabel('No.of people vaccinated', fontdict=font)
    ax.set_ylabel('Country codes', fontdict=font)

# Hide the grid cells that have no period to show.
for ax in axes.ravel()[len(periods):]:
    ax.set_visible(False)

fig.tight_layout()
# plt.show() works with the inline notebook backend; fig.show() only emits a
# "non-GUI backend" warning there.
plt.show()
    

**The diagram above shows that the USA had already started vaccinating in December 2020 and is going the strongest among all countries, followed by India and then Israel.**

In [None]:
# Summarise the whole period per country and see which country has vaccinated most.

# 1. Drop the calendar columns; they carry no meaning in a per-country summary
#    (and 'year' holds strings, which must not be aggregated as numbers).
country_values = country_vacc_new.drop(columns=['month', 'year'])

# 2. Group the data by ISO code. Columns named total_* / people_* hold running
#    (cumulative) figures, so the per-country value is the highest one reached;
#    summing them would count the same people once per reported day. The
#    remaining columns (daily counts) are added up.
country_agg = {
    col: 'max' if col.startswith(('total_', 'people_')) else 'sum'
    for col in country_values.columns
    if col != 'iso_code'
}
df5 = country_values.groupby('iso_code').agg(country_agg).reset_index()

# 3. Bar chart of the number of fully vaccinated people per country.
fig, ax = plt.subplots(figsize=(15, 45))
sns.barplot(data=df5, y='iso_code', x='people_fully_vaccinated', ax=ax)
ax.set_xlabel('People fully vaccinated')
ax.set_ylabel('Country code')
plt.show()


**Again we see that the USA is leading with the maximum number of people vaccinated, followed by India.**

In [None]:
# Bar chart of the fully vaccinated people per hundred inhabitants, by country.
# people_fully_vaccinated_per_hundred is a running figure reported per day, so
# the value shown for a country is the highest one reached; a sum over the
# daily rows can exceed 100 and has no meaning.
fully_vacc_pct = (
    country_vacc_new
    .groupby('iso_code', as_index=False)['people_fully_vaccinated_per_hundred']
    .max()
)
fig, ax = plt.subplots(figsize=(15, 45))
sns.barplot(data=fully_vacc_pct, y='iso_code', x='people_fully_vaccinated_per_hundred', ax=ax)
ax.set_xlabel('People fully vaccinated per hundred')
ax.set_ylabel('Country code')
plt.show()

**Here we see that Israel is doing really well at vaccinating people relative to its population, followed by Gibraltar, Chile and Bahrain.**
**The US is doing OK, but India is going very slowly.**