In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the country vaccination CSV and preview its first rows.
vaccination_csv = '../input/covid-world-vaccination-progress/country_vaccinations.csv'
df = pd.read_csv(vaccination_csv)
df.head()

In [None]:
# Convert the `date` column to datetime, then replace every missing value in
# the whole frame with 0.
# NOTE: because of the frame-wide fillna(0), later `.dropna()` calls on
# columns of `df` have nothing left to drop.
parsed_dates = pd.to_datetime(df['date'], format = "%Y-%m-%d")
df = df.assign(date = parsed_dates).fillna(0)
df.info()

There are a few things that I want to analyze in this data:
1. What is the most popular type of vaccine used/distributed?
2. Which countries have administered the most vaccinations?
3. Which countries have the most vaccinated people?
4. The progress of vaccination over time

In [None]:
# import necessary packages
import matplotlib.pyplot as plt
import seaborn as sns

**1. What is the most popular type of vaccine used/distributed?**

In [None]:
# Show the distinct values stored in the `vaccines` column.
df.loc[:, 'vaccines'].unique()

In [None]:
# Count how many countries list each vaccine.
# Every (country, vaccines) pair is kept once, and the ', '-separated vaccine
# string is expanded into one 0/1 indicator column per vaccine name.
country_vaccines = (
    df[['country', 'vaccines']]
    .drop_duplicates()
    .set_index('country')['vaccines']
    .str.get_dummies(sep=', ')
)

# NOTE: the 'country' column below holds the vaccine names (the indicator
# column labels), not country names; the name is kept because the plotting
# cell reads `countries_each_vac.country`.
vaccine_names = country_vaccines.columns
countries_each_vac = (
    pd.DataFrame({
        'country': vaccine_names,
        'num_vacs': [sum(country_vaccines[name]) for name in vaccine_names],
    })
    .sort_values(by='num_vacs', ascending=False)
    .reset_index()
)

In [None]:
# Bar chart of how many countries use each vaccine.
# NOTE: `countries_each_vac['country']` holds vaccine names despite the
# column name, so the x-axis is labelled accordingly.

plt.style.use('ggplot')
plt.bar(countries_each_vac['country'], height = countries_each_vac['num_vacs'])
plt.xticks(rotation = 90)
# A figure should stand on its own: give it a title and axis labels.
plt.title('Number of Countries Using Each Vaccine')
plt.xlabel('Vaccine')
plt.ylabel('Number of Countries')
plt.show()

    As we can see, some countries use multiple vaccines, so this bar plot only represents the distribution of vaccine types. Note that there can be multiple countries that use multiple vaccines, so a single country may be counted under several bars.

    From this plot we can also see that 'Pfizer/BioNTech' is the vaccine that most countries have used or distributed.

**2. What countries have done the most vaccination?**

In [None]:
# For each country take the largest reported `total_vaccinations` value and
# rank the countries from highest to lowest.

max_total_by_country = df.groupby('country')['total_vaccinations'].max().dropna()
total_vac = (
    max_total_by_country
    .reset_index()
    .sort_values(by = 'total_vaccinations', ascending = False)
    .reset_index(drop = True)
)

# Bar chart of the 15 highest-ranked countries.

top_total = total_vac.head(15)
plt.bar(top_total['country'], height = top_total['total_vaccinations'])
# Ticks every 10**7 (10 million), labelled in millions to match the y-label.
tick_positions = [step*(10**7) for step in np.arange(7)]
plt.yticks(tick_positions, ['0', '10', '20', '30', '40','50','60'])
plt.xticks(rotation = 90)
plt.title('Countries With most Vaccination')
plt.ylabel('Number of Vaccinations (in million)')
plt.xlabel('Country Name')
plt.show()

**3. What countries have the most vaccinated people?**

In [None]:
# For each country take the largest reported `people_vaccinated` value and
# rank the countries from highest to lowest.

total_ppl_vac = df.groupby('country')['people_vaccinated'].max().dropna().reset_index().sort_values(by = 'people_vaccinated',ascending = False).reset_index(drop = True)

# Bar chart of the 15 highest-ranked countries.

top_people = total_ppl_vac.head(15)
plt.bar(top_people['country'], height = top_people['people_vaccinated'])
# Ticks sit every 10**7 = 10 million. The labels are derived from the tick
# values (in millions) so they cannot drift out of sync with the positions;
# the previous hard-coded '1M'..'4M' labels understated the axis by 10x.
tick_positions = [int(step) * 10**7 for step in np.arange(5)]
tick_labels = [str(position // 10**6) for position in tick_positions]
plt.yticks(tick_positions, tick_labels)
plt.xticks(rotation = 90)
plt.title('Countries With most Vaccinated people ')
plt.ylabel('Number of People (in million)')
plt.xlabel('Country Name')
plt.show()

**4. The progress of vaccination over time**

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# DAILY VACCINATION

# Sum `daily_vaccinations_raw` over all rows sharing the same date.
daily_vac = df.groupby('date')['daily_vaccinations_raw'].sum()

# Fit a straight-line trend. The regressor is the ordinal position of each
# date (0, 1, 2, ...) obtained by factorizing the date index.
reg = LinearRegression()
X = daily_vac.index.factorize()[0].reshape(-1,1)
reg.fit(X, daily_vac.values)
y = reg.predict(X)

# Plot the values already scaled to millions. Overwriting the tick labels
# with a hard-coded list (set_yticklabels without fixed tick positions)
# attaches the labels to whatever ticks matplotlib happens to pick, so the
# axis can silently show wrong numbers.
units_per_million = 10**6
fig, ax = plt.subplots()
ax.plot(daily_vac.index, daily_vac.values / units_per_million, label='Daily vaccinations')
ax.plot(daily_vac.index, y / units_per_million, label='Linear trend')
ax.set_title("Daily Vaccination in the world\n({} - {})".format(daily_vac.index[0].date(),daily_vac.index[-1].date()))
ax.set_ylabel('Vaccination (million)')
ax.set_xlabel('Date')
ax.legend()
fig.autofmt_xdate()
plt.show()

    As we can see, it seems that many countries vaccinate every couple of days or once per week rather than every single day, since the graph peaks at regular intervals. I will therefore try plotting the data on a weekly basis.

In [None]:
# WEEKLY VACCINATION

# Sum `daily_vaccinations_raw` into weekly bins. Only rows dated before
# 2021-02-22 are used; pandas' default weekly frequency ends bins on Sunday
# and 2021-02-21 is a Sunday, so the last bin is a complete week.
weekly_vac = df[df['date']< "2021-02-22"].groupby(pd.Grouper(key= 'date', freq= '1W'))['daily_vaccinations_raw'].sum()

# Fit a straight-line trend against the ordinal position of each week.
reg2 = LinearRegression()
X2 = weekly_vac.index.factorize()[0].reshape(-1,1)
reg2.fit(X2, weekly_vac.values)
y2 = reg2.predict(X2)

# Plot the values already scaled to millions instead of relabelling the
# auto-generated ticks with a guessed list, which only matches the data by
# coincidence and breaks as soon as matplotlib picks different ticks.
units_per_million = 10**6
fig, ax = plt.subplots()
ax.plot(weekly_vac.index, weekly_vac.values / units_per_million, label='Weekly vaccinations')
ax.plot(weekly_vac.index, y2 / units_per_million, label='Linear trend')
ax.set_title("Weekly Vaccination in the world\n({} - {})".format(weekly_vac.index[0].date(),weekly_vac.index[-1].date()))
ax.set_ylabel('Vaccination (million)')
ax.set_xlabel('Week ending')
ax.legend()
fig.autofmt_xdate()
plt.show()

This is my first upload; I hope it will be useful for me and for anyone who sees it. I am new to this field and have probably made some mistakes, so I am very open to suggestions or corrections. I would be very happy to receive them.

Thank you