##Import relevant libraries

In [44]:
import numpy as np
import pandas as pd

##Observing the dataset

In [45]:
#Using the locations.csv file, we will create a dataframe and assign it to the variable vaccines.
vaccines = pd.read_csv("https://raw.githubusercontent.com/siglimumuni/Datasets/master/Covid/locations.csv")

In [46]:
#Next, we check the first 5 rows of our dataframe for a quick look at the structure.
vaccines.head()

Unnamed: 0,location,iso_code,vaccines,last_observation_date,source_name,source_website
0,Afghanistan,AFG,"BBIBP-CorV, Oxford/AstraZeneca, Pfizer/BioNTech",2021-06-14,World Health Organization,https://covid19.who.int/
1,Albania,ALB,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, ...",2021-06-14,Ministry of Health,https://shendetesia.gov.al/vaksinimi-anticovid...
2,Algeria,DZA,"Oxford/AstraZeneca, Sputnik V",2021-06-06,Ministry of Health,https://english.aawsat.com/home/article/301347...
3,Andorra,AND,"Oxford/AstraZeneca, Pfizer/BioNTech",2021-06-07,Government of Andorra,https://www.govern.ad/covid19_newsletter/
4,Angola,AGO,Oxford/AstraZeneca,2021-06-15,World Health Organization,https://covid19.who.int/


In [47]:
#Using the .info() method, we can look up more specific information about the dataset. 
vaccines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   location               216 non-null    object
 1   iso_code               216 non-null    object
 2   vaccines               216 non-null    object
 3   last_observation_date  216 non-null    object
 4   source_name            216 non-null    object
 5   source_website         216 non-null    object
dtypes: object(6)
memory usage: 10.2+ KB


##Cleaning the dataset

In [48]:
#We can begin by dropping the columns we don't need.
vaccines.drop(['last_observation_date','source_name','source_website'],axis=1,inplace=True)

In [49]:
vaccines.head()


Unnamed: 0,location,iso_code,vaccines
0,Afghanistan,AFG,"BBIBP-CorV, Oxford/AstraZeneca, Pfizer/BioNTech"
1,Albania,ALB,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, ..."
2,Algeria,DZA,"Oxford/AstraZeneca, Sputnik V"
3,Andorra,AND,"Oxford/AstraZeneca, Pfizer/BioNTech"
4,Angola,AGO,Oxford/AstraZeneca


####The vaccine list per country is bundled in one column, making it hard to analyze. We can create dummy variables by assigning a column to each vaccine. For each row (country), we can then assign a 1 or 0 under each vaccine, depending on whether or not that vaccine has been in use.

In [50]:
#First we extract a complete list of vaccines. For simplicity sake, we will combine the Sinopharm/Beijing,Sinopharm/Wuhan and Sinopharm/HayatVax vaccines
# under 'Sinopharm'
vaccines_list = ['BBIBP-CorV','Oxford/AstraZeneca','Sinovac','SputnikV','Sinopharm','Pfizer/BioNTech',
                 'Johnson&Johnson','Moderna','Covaxin','CanSino','Abdala','Soberana02','QazVac','EpiVacCorona','RBD-Dimer']


In [51]:
#We create a custom function. This will be used to add the relevant columns to our dataset.
def createcolumns(columnlist):
  for vac in columnlist:
    vaccines[vac] = 0

In [52]:
#Apply the function to our dataset
createcolumns(vaccines_list)

In [53]:
#All relevant columns have been added with initial variables set to 0.
vaccines.head()

Unnamed: 0,location,iso_code,vaccines,BBIBP-CorV,Oxford/AstraZeneca,Sinovac,SputnikV,Sinopharm,Pfizer/BioNTech,Johnson&Johnson,Moderna,Covaxin,CanSino,Abdala,Soberana02,QazVac,EpiVacCorona,RBD-Dimer
0,Afghanistan,AFG,"BBIBP-CorV, Oxford/AstraZeneca, Pfizer/BioNTech",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Albania,ALB,"Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Algeria,DZA,"Oxford/AstraZeneca, Sputnik V",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Andorra,AND,"Oxford/AstraZeneca, Pfizer/BioNTech",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Angola,AGO,Oxford/AstraZeneca,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


####Next, for each country, we would need to assign a 1 to every vaccine which is in use in that country. This would involve a few steps.

In [54]:
#First, we need to remove all white spaces under the vaccines column.
vaccines['vaccines'] = vaccines['vaccines'].str.replace(' ','')

In [55]:
#Then we need to combine all Sinopharm vaccines under the common umbrella 'Sinopharm'.
vaccines['vaccines'] = vaccines['vaccines'].str.replace('Sinopharm/Beijing','Sinopharm')
vaccines['vaccines'] = vaccines['vaccines'].str.replace('Sinopharm/Wuhan','Sinopharm')
vaccines['vaccines'] = vaccines['vaccines'].str.replace('Sinopharm/HayatVax','Sinopharm')

In [56]:
#Finally, using a nested for loop, we assign a 1 for all relevant countries per vaccine in the dataset.
for i in range(len(vaccines)):
  for vac in vaccines['vaccines'].iloc[i].split(','):
    vaccines[vac].iloc[i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [57]:
#All relevant 1s have been assigned.
vaccines.head()

Unnamed: 0,location,iso_code,vaccines,BBIBP-CorV,Oxford/AstraZeneca,Sinovac,SputnikV,Sinopharm,Pfizer/BioNTech,Johnson&Johnson,Moderna,Covaxin,CanSino,Abdala,Soberana02,QazVac,EpiVacCorona,RBD-Dimer
0,Afghanistan,AFG,"BBIBP-CorV,Oxford/AstraZeneca,Pfizer/BioNTech",1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,Albania,ALB,"Oxford/AstraZeneca,Pfizer/BioNTech,Sinovac,Spu...",0,1,1,1,0,1,0,0,0,0,0,0,0,0,0
2,Algeria,DZA,"Oxford/AstraZeneca,SputnikV",0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
3,Andorra,AND,"Oxford/AstraZeneca,Pfizer/BioNTech",0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
4,Angola,AGO,Oxford/AstraZeneca,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [58]:
#We can drop the 'vaccines' and 'iso_code' columns as we won't be needing those.
vaccines.drop(['iso_code','vaccines'],axis=1,inplace=True)

In [59]:
#The dataset is ready for use now.
vaccines.head()

Unnamed: 0,location,BBIBP-CorV,Oxford/AstraZeneca,Sinovac,SputnikV,Sinopharm,Pfizer/BioNTech,Johnson&Johnson,Moderna,Covaxin,CanSino,Abdala,Soberana02,QazVac,EpiVacCorona,RBD-Dimer
0,Afghanistan,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,Albania,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0
2,Algeria,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
3,Andorra,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
4,Angola,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


##Vaccinations dataset

In [60]:
#Read in the dataset and assign it to 'vaccinations'.
vaccinations = pd.read_csv("https://raw.githubusercontent.com/siglimumuni/Datasets/master/Covid/vaccinations.csv")

In [61]:
#View the first 5 rows of the dataset
vaccinations.head()

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,35.0
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,35.0
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,35.0
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,35.0


In [62]:
#check the structure of the dataset
vaccinations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26938 entries, 0 to 26937
Data columns (total 12 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   location                             26938 non-null  object 
 1   iso_code                             26938 non-null  object 
 2   date                                 26938 non-null  object 
 3   total_vaccinations                   15997 non-null  float64
 4   people_vaccinated                    15216 non-null  float64
 5   people_fully_vaccinated              12264 non-null  float64
 6   daily_vaccinations_raw               13545 non-null  float64
 7   daily_vaccinations                   26686 non-null  float64
 8   total_vaccinations_per_hundred       15997 non-null  float64
 9   people_vaccinated_per_hundred        15216 non-null  float64
 10  people_fully_vaccinated_per_hundred  12264 non-null  float64
 11  daily_vaccinations_per_milli

####To make our dataset usable, we will need to extract only the latest date for each country as it contains an accumulation of all relevant data.

In [63]:
#First, we extract world data with all date fields for use in building a time series chart.
world = vaccinations[vaccinations['location'] == 'World']

In [64]:
world.head()

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
26542,World,OWID_WRL,2020-12-02,0.0,0.0,,,,0.0,0.0,,
26543,World,OWID_WRL,2020-12-03,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0
26544,World,OWID_WRL,2020-12-04,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0
26545,World,OWID_WRL,2020-12-05,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0
26546,World,OWID_WRL,2020-12-06,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0


In [65]:
#To extract only the latest date for the rest of our location fielsd, we begin by making a list of all unique locations in the dataset.
country_list = vaccinations['location'].unique()

In [66]:
country_list

array(['Afghanistan', 'Africa', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Asia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire Sint Eustatius and Saba',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire",
       'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czechia',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'England',
       'Equatorial Guinea', 'Estonia', 'Eswatini', 'Ethiopia', 'Europe',
 

In [67]:
vaccinations.columns

Index(['location', 'iso_code', 'date', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred',
       'daily_vaccinations_per_million'],
      dtype='object')

In [68]:
#Next we create a function that will extract information from the row with the latest date per country.

def extractlatest(country):
  return list(vaccinations[vaccinations['location']== country].sort_values('date',ascending=False).iloc[0])

In [69]:
#We use a for loop to apply function to each country in our country_list. We then append the information retrieved to a list.
latest_dates = []
for country in country_list:
  latest_dates.append(extractlatest(country))

In [70]:
#Next, we create a dataframe from our list. 
vaccinations = pd.DataFrame(latest_dates,columns=vaccinations.columns)

In [71]:
vaccinations.head()

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,Afghanistan,AFG,2021-06-14,662003.0,484737.0,177266.0,,3272.0,1.7,1.25,0.46,84.0
1,Africa,OWID_AFR,2021-06-15,42911552.0,30254871.0,11835115.0,443728.0,508611.0,3.2,2.26,0.88,379.0
2,Albania,ALB,2021-06-14,842535.0,507764.0,334771.0,6105.0,5550.0,29.28,17.64,11.63,1929.0
3,Algeria,DZA,2021-06-06,2500000.0,2500000.0,,,22664.0,5.7,5.7,,517.0
4,Andorra,AND,2021-06-07,41473.0,30535.0,10938.0,,794.0,53.68,39.52,14.16,10276.0


###Finally the location column contains regions and income classes mixed with countries. To facilitate our analysis, we will seperate entries belonging to regions and income groupings into seperate dataframes.


In [72]:
#Create a list of all entries in the locations column that are regions.
regions_list = ['Africa', 'Asia', 'Europe','European Union','North America','South America','United Kingdom','Oceania','World']
#Create a list of all entries in the locations column that are income groupings.
income_list = ['High income','Low income', 'Lower middle income','Upper middle income']


In [73]:
#We create a custom function to extract rows with regional entries, and then delete them.
def extractregion(region):
  return list(vaccinations[vaccinations['location']== region].iloc[0])
  
  

In [74]:
#We use a for loop to apply function to each region in our regions_list. We then append the information retrieved to a list and delete the region
#from the vaccinations dataset
regions_data = []
for reg in regions_list:
  regions_data.append(extractregion(reg))
  vaccinations.drop(vaccinations[vaccinations['location']==reg].index,inplace=True)

In [75]:
#We no longer have any regions in the locations column.
vaccinations['location']

0            Afghanistan
2                Albania
3                Algeria
4                Andorra
5                 Angola
             ...        
222                Wales
223    Wallis and Futuna
225                Yemen
226               Zambia
227             Zimbabwe
Name: location, Length: 219, dtype: object

In [76]:
#We can now create a regions dataset from our regions list
vaccinations_by_region = pd.DataFrame(regions_data,columns=vaccinations.columns)

In [77]:
vaccinations_by_region.head()

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,Africa,OWID_AFR,2021-06-15,42911550.0,30254871.0,11835115.0,443728.0,508611.0,3.2,2.26,0.88,379.0
1,Asia,OWID_ASI,2021-06-15,1429771000.0,984473259.0,345046218.0,27677214.0,24079401.0,30.82,21.22,7.44,5190.0
2,Europe,OWID_EUR,2021-06-15,429177500.0,272268496.0,162451602.0,1437290.0,4138187.0,57.32,36.37,21.7,5527.0
3,European Union,OWID_EUN,2021-06-15,303691600.0,199160141.0,110066905.0,1301977.0,3288005.0,68.26,44.76,24.74,7390.0
4,North America,OWID_NAM,2021-06-15,398318600.0,237897165.0,171731970.0,1805480.0,2271300.0,67.28,40.18,29.01,3836.0


In [78]:
#Now we extract all rows in the locations column that are income groupings and create a new dataframe.
def extractincome(incgroup):
  return list(vaccinations[vaccinations['location']== incgroup].iloc[0])

In [79]:
#We use a for loop to apply function to each income group in our income_list. We then append the information retrieved to a list and delete the region
#from the vaccinations dataset
income_group_data = []
for group in income_list:
  income_group_data.append(extractincome(group))
  vaccinations.drop(vaccinations[vaccinations['location']==group].index,inplace=True)

In [80]:
income_groups = pd.DataFrame(income_group_data,columns=vaccinations.columns)

In [81]:
income_groups

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,High income,OWID_HIC,2021-06-15,859555200.0,521539167.0,332256679.0,4976681.0,7699904.0,68.05,41.29,26.31,6096.0
1,Low income,OWID_LIC,2021-06-15,7273403.0,6522357.0,701819.0,34384.0,63424.0,0.94,0.84,0.09,82.0
2,Lower middle income,OWID_LMC,2021-06-15,349026400.0,272196746.0,75230408.0,3864075.0,4423423.0,11.27,8.79,2.43,1428.0
3,Upper middle income,OWID_UMC,2021-06-15,1238189000.0,832206789.0,329377832.0,24329421.0,21026968.0,46.64,31.35,12.41,7920.0


In [82]:
#Our 4 dataframes are now ready for export
vaccines.head()
vaccinations.head()
vaccinations_by_region.head()
income_groups.head()

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,High income,OWID_HIC,2021-06-15,859555200.0,521539167.0,332256679.0,4976681.0,7699904.0,68.05,41.29,26.31,6096.0
1,Low income,OWID_LIC,2021-06-15,7273403.0,6522357.0,701819.0,34384.0,63424.0,0.94,0.84,0.09,82.0
2,Lower middle income,OWID_LMC,2021-06-15,349026400.0,272196746.0,75230408.0,3864075.0,4423423.0,11.27,8.79,2.43,1428.0
3,Upper middle income,OWID_UMC,2021-06-15,1238189000.0,832206789.0,329377832.0,24329421.0,21026968.0,46.64,31.35,12.41,7920.0


In [83]:
#from google.colab import files
#vaccines.to_csv('vaccines.csv') 
#files.download('vaccines.csv')

In [84]:
#from google.colab import files
#vaccinations.to_csv('vaccinations.csv') 
#files.download('vaccinations.csv')

In [85]:
#from google.colab import files
#vaccinations_by_region.to_csv('vaccinations_by_region.csv') 
#files.download('vaccinations_by_region.csv')

In [86]:
#from google.colab import files
#income_groups.to_csv('income_groups.csv') 
#files.download('income_groups.csv')

In [87]:
#from google.colab import files
#world.to_csv('world.csv')
#files.download('world.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>