In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the per-country vaccination progress CSV from the attached Kaggle dataset.
VACCINATIONS_CSV = '/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv'
covid_data = pd.read_csv(VACCINATIONS_CSV)

# Summary statistics are computed here, but a notebook cell only renders its
# last expression, so only the 15-row preview below is displayed.
covid_data.describe()
covid_data.head(15)

In [None]:
# Dataset dimensions as (rows, columns); evaluated but not rendered because it
# is not the last expression of the cell.
covid_data.shape

# Number of rows recorded for each country.
rows_per_country = covid_data['country'].value_counts()
rows_per_country

In [None]:
# Count the missing (NaN) entries in every column.
missing_values = covid_data.isna().sum()

# Show the counts for the first 15 columns (positional slice).
missing_values.iloc[:15]

In [None]:
# Percentage of all cells in the dataset that are missing.
# np.prod replaces np.product, a deprecated alias that was removed in NumPy 2.0.
total_cells = np.prod(covid_data.shape)  # rows * columns
total_missing = missing_values.sum()     # missing cells summed over all columns
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)

First, I would like to build lists of the countries using each of the vaccines. To do that, I create a new DataFrame by dropping most of the time-series data (keeping a single row per country) as well as some of the columns that are not needed.

In [None]:
# Columns that change from day to day; they are removed to get a per-country view.
TIMESERIES_COLUMNS = [
    'date',
    'total_vaccinations',
    'people_vaccinated',
    'people_fully_vaccinated',
    'daily_vaccinations_raw',
    'daily_vaccinations',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'daily_vaccinations_per_million',
]

# Drop the time-series columns, then keep only the first row seen for each country.
non_timeseries = (
    covid_data
    .drop(columns=TIMESERIES_COLUMNS)
    .drop_duplicates(subset=['country'])
)

# Only the last expression of the cell is rendered, i.e. the random 20-row sample.
non_timeseries.head(15)
non_timeseries.sample(20)

In [None]:
# Creating lists of countries which use each particular vaccine


def _countries_using(vaccine_name):
    """Return the countries in `non_timeseries` whose `vaccines` text contains `vaccine_name`.

    The name is matched as a literal substring (regex=False). Rows with a
    missing `vaccines` value count as non-matches (na=False), so the boolean
    mask never contains NaN and can always be used for row selection.
    """
    uses_vaccine = non_timeseries['vaccines'].str.contains(vaccine_name, regex=False, na=False)
    return list(non_timeseries.loc[uses_vaccine, 'country'])


cnbg = _countries_using('CNBG')
print('\nCountries using CNBG: ', cnbg)

covaxin = _countries_using('Covaxin')
print('\nCountries using Covaxin: ', covaxin)

moderna = _countries_using('Moderna')
print('\nCountries using Moderna: ', moderna)

oxford_astra = _countries_using('Oxford/AstraZeneca')
print('\nCountries using Oxford/AstraZeneca: ', oxford_astra)

pfizer_bio = _countries_using('Pfizer/BioNTech')
print('\nCountries using Pfizer/BioNTech: ', pfizer_bio)

sinopharm = _countries_using('Sinopharm')
# Fixed: this line previously printed the CNBG list under the Sinopharm label.
print('\nCountries using Sinopharm: ', sinopharm)

sinovac = _countries_using('Sinovac')
print('\nCountries using Sinovac: ', sinovac)

sputnik = _countries_using('Sputnik')
print('\nCountries using Sputnik V: ', sputnik)


Now that I have clean lists of which countries use which vaccines (satisfying my personal curiosity), I can start focusing on the time series. I will begin by making changes to the data, such as dropping some of the columns I don't plan on using, like the ISO code and the source of the vaccination information.

In [None]:

# Drop identifier/provenance columns that the time-series analysis does not use.
# Note: re-running this cell on the already-trimmed frame raises KeyError,
# because the columns are gone after the first run.
covid_data = covid_data.drop(columns=['iso_code', 'source_name', 'source_website'])
covid_data.head(15)

# Hand-entered total population per country, keyed by the dataset's country names.
# Fixed: Panama was entered as 42460000 (an extra zero); its population is about 4.2 million.
population = {
    'Algeria': 43050000,
    'Argentina': 44940000,
    'Austria': 8859000,
    'Bahrain': 1641000,
    'Belgium': 11460000,
    'Bermuda': 63918,
    'Brazil': 211000000,
    'Bulgaria': 7000000,
    'Canada': 37590000,
    'Chile': 18950000,
    'China': 1398000000,
    'Costa Rica': 5048000,
    'Croatia': 4076000,
    'Cyprus': 875899,
    'Czechia': 10650000,
    'Denmark': 5806000,
    'Ecuador': 17370000,
    'England': 55980000,
    'Estonia': 1325000,
    'Finland': 5518000,
    'France': 67060000,
    'Germany': 83020000,
    'Gibraltar': 33701,
    'Greece': 10720000,
    'Hungary': 9773000,
    'Iceland': 356991,
    'India': 1366000000,
    'Indonesia': 270600000,
    'Ireland': 4904000,
    'Isle of Man': 84584,
    'Israel': 9053000,
    'Italy': 60360000,
    'Kuwait': 4207000,
    'Latvia': 1920000,
    'Lithuania': 2794000,
    'Luxembourg': 613894,
    'Malta': 502653,
    'Mexico': 127600000,
    'Morocco': 36470000,
    'Myanmar': 54050000,
    'Netherlands': 17280000,
    'Northern Cyprus': 326000,
    'Northern Ireland': 1885000,
    'Norway': 5328000,
    'Oman': 4975000,
    'Palau': 18008,
    'Panama': 4246000,
    'Poland': 37970000,
    'Portugal': 10280000,
    'Romania': 19410000,
    'Russia': 144400000,
    'Saudi Arabia': 34270000,
    'Scotland': 5454000,
    'Serbia': 6945000,
    'Seychelles': 97625,
    'Singapore': 5704000,
    'Slovakia': 5450000,
    'Slovenia': 2081000,
    'Spain': 46940000,
    'Sri Lanka': 21800000,
    'Sweden': 10230000,
    'Switzerland': 8545000,
    'Turkey': 82000000,
    'United Arab Emirates': 9771000,
    'United Kingdom': 66650000,
    'United States': 328200000,
    'Wales': 3136000,
}

# Print every country/population pair, one per (blank-line separated) row.
for key, value in population.items():
    print('\n', key, ' : ', value)

In [None]:
# Attach each row's country population as a new `total_population` column.
# Countries missing from the `population` dict get NaN.
country_population = covid_data['country'].map(population)
covid_data['total_population'] = country_population

# Spot-check 20 random rows.
covid_data.sample(20)

In [None]:
# Zero-initialised counter keyed by the same countries as `population`.
# (The earlier `pop_test = population` assignment was dead code: it was
# overwritten on the very next line.)
pop_test = dict.fromkeys(population, 0)

# Per-country count of fully vaccinated people.
# `people_fully_vaccinated` is reported as a cumulative running total in this
# dataset, so the latest figure is the per-country maximum. Summing the daily
# rows (the previous behaviour) counts the same people once per reporting day.
z = covid_data.groupby('country')['people_fully_vaccinated'].max()


In [None]:
# Trim the per-country table: remove provenance/identifier columns, then move
# the old row labels into an 'index' column and renumber rows from 0.
non_timeseries = (
    non_timeseries
    .drop(columns=['source_name', 'source_website', 'iso_code'])
    .reset_index()
)

# Add population, the per-country aggregate held in `z`, and their ratio as a percentage.
# TODO: separate partially/fully vaccinated columns are not derived yet.
non_timeseries = non_timeseries.assign(
    total_population=lambda df: df['country'].map(population),
    total_vaccinated=lambda df: df['country'].map(z),
    percent_vaccinated=lambda df: (df['total_vaccinated'] / df['total_population']) * 100,
)
non_timeseries.head(30)


In [None]:
# NOTE(review): `opsd_daily` is not defined in any visible cell of this notebook, so this
# line would raise NameError on a fresh kernel unless it is created elsewhere — confirm
# whether this cell belongs here or is a leftover from another notebook.
# Re-index the frame by its 'Date' column (set_index returns a new frame, rebound to the same name).
opsd_daily = opsd_daily.set_index('Date')