# Project Milestone 3

In [2]:
import re
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Loading Dataset

In [3]:
# Download the Wikipedia page that holds the vaccine distribution table.
wikiurl = "https://en.wikipedia.org/wiki/Deployment_of_COVID-19_vaccines"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(wikiurl, timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error page further down.
response.raise_for_status()

In [4]:
# Parse the downloaded page and take the first table carrying the "wikitable" class.
soup = BeautifulSoup(response.text, "html.parser")
vaccine_table = soup.find("table", {"class": "wikitable"})
# find() returns None when nothing matches; fail here with a clear message
# instead of handing the string "None" to the HTML table reader later on.
if vaccine_table is None:
    raise ValueError(f"No table with class 'wikitable' found at {response.url}")
vaccine_table

<table class="wikitable sortable sortunder tpl-blanktable plainrowheaders plainrowheadersbg sticky-col2" style="text-align: right; margin-top: 0;">
<caption><a class="mw-selflink selflink">COVID-19 vaccine distribution</a> by country<sup class="reference" id="cite_ref-Template:COVID-19_datab_18-0"><a href="#cite_note-Template:COVID-19_datab-18">[17]</a></sup>
</caption>
<tbody><tr class="sticky-row">
<th class="unsortable" style="padding-bottom: 1em;">
</th>
<th scope="col" style="padding-bottom: 1em;">Location
</th>
<th scope="col" style="padding-bottom: 1em;">Vaccinated<sup class="reference" id="cite_ref-19"><a href="#cite_note-19">[a]</a></sup>
</th>
<th scope="col" style="padding-bottom: 1em;">Percent<sup class="reference" id="cite_ref-20"><a href="#cite_note-20">[b]</a></sup>
</th></tr>
<tr class="sorttop static-row-header">
<td data-sort-value="World" style="text-align: center;"><img alt="" data-file-height="20" data-file-width="20" decoding="async" height="16" src="//upload.wiki

In [5]:
# Convert the table's HTML into a DataFrame.
# read_html returns a list of DataFrames (one per table found); we passed a single
# table, so the first element is the one we want.
# The markup is wrapped in StringIO because passing literal HTML as a plain string
# is deprecated in recent pandas releases.
vaccines = pd.read_html(StringIO(str(vaccine_table)))[0]
vaccines

Unnamed: 0.1,Unnamed: 0,Location,Vaccinated[a],Percent[b]
0,,World[c][d],5441863836,68.24%
1,,China[e],1303832000,91.44%
2,,India,1026977435,72.47%
3,,European Union[f],337933911,75.07%
4,,United States[g],267032055,80.43%
...,...,...,...,...
218,,Niue,1650,102.23%
219,,Tokelau,1494,78.92%
220,,Pitcairn Islands,47,100.00%
221,,North Korea,0,0.00%


In [6]:
# Inspect the last rows: in this run the final row (index 222) holds page styling text, not data.
vaccines.tail()

Unnamed: 0.1,Unnamed: 0,Location,Vaccinated[a],Percent[b]
218,,Niue,1650,102.23%
219,,Tokelau,1494,78.92%
220,,Pitcairn Islands,47,100.00%
221,,North Korea,0,0.00%
222,.mw-parser-output .reflist{font-size:90%;margi...,.mw-parser-output .reflist{font-size:90%;margi...,.mw-parser-output .reflist{font-size:90%;margi...,.mw-parser-output .reflist{font-size:90%;margi...


## Transformation 1- Drop Unneeded Column

The leftmost column on Wikipedia is a graphic of the countries' flags. This is unneeded in our analysis so it will be dropped.

In [7]:
# List the column labels to find the name given to the unlabeled leftmost column.
vaccines.columns

Index(['Unnamed: 0', 'Location', 'Vaccinated[a]', 'Percent[b]'], dtype='object')

In [8]:
# Drop the flag-image column, which carries the placeholder label "Unnamed: 0".
# errors="ignore" lets this cell be re-run without a KeyError once the column is gone.
vaccines = vaccines.drop(columns="Unnamed: 0", errors="ignore")

In [9]:
# view table: only Location, Vaccinated[a] and Percent[b] should remain
vaccines

Unnamed: 0,Location,Vaccinated[a],Percent[b]
0,World[c][d],5441863836,68.24%
1,China[e],1303832000,91.44%
2,India,1026977435,72.47%
3,European Union[f],337933911,75.07%
4,United States[g],267032055,80.43%
...,...,...,...
218,Niue,1650,102.23%
219,Tokelau,1494,78.92%
220,Pitcairn Islands,47,100.00%
221,North Korea,0,0.00%


## Transformation 2- Remove Bad Row

The final row in the dataset is not data: it holds CSS text from the page's reference list (it begins with `.mw-parser-output .reflist`) that was picked up along with the table. We will remove it.

In [10]:
# Remove the stray footer row by its content rather than by position, so that
# re-running this cell cannot delete a real location from the end of the table.
# The stray row's cells hold stylesheet text that starts with ".mw-parser-output".
is_footer_row = vaccines["Location"].astype(str).str.startswith(".mw-parser-output")
# .copy() yields an independent frame so later column assignments do not trigger
# chained-assignment warnings.
vaccines = vaccines.loc[~is_footer_row].copy()

In [11]:
# view table: the last row should now be a real location
vaccines

Unnamed: 0,Location,Vaccinated[a],Percent[b]
0,World[c][d],5441863836,68.24%
1,China[e],1303832000,91.44%
2,India,1026977435,72.47%
3,European Union[f],337933911,75.07%
4,United States[g],267032055,80.43%
...,...,...,...
217,Montserrat,2104,47.63%
218,Niue,1650,102.23%
219,Tokelau,1494,78.92%
220,Pitcairn Islands,47,100.00%


## Transformation 3- Remove Footnotes from Columns

Some of the locations and two of the column headers have footnote markers. To make the data more readable, we will remove all such instances. In this transformation, we will focus on the column headers.

In [12]:
# Strip the footnote markers "[a]" and "[b]" from the two column headers.
header_fixes = {"Vaccinated[a]": "Vaccinated", "Percent[b]": "Percent"}
vaccines = vaccines.rename(columns=header_fixes)
# Show the table with its cleaned headers.
vaccines

Unnamed: 0,Location,Vaccinated,Percent
0,World[c][d],5441863836,68.24%
1,China[e],1303832000,91.44%
2,India,1026977435,72.47%
3,European Union[f],337933911,75.07%
4,United States[g],267032055,80.43%
...,...,...,...
217,Montserrat,2104,47.63%
218,Niue,1650,102.23%
219,Tokelau,1494,78.92%
220,Pitcairn Islands,47,100.00%


## Transformation 4- Remove Footnotes from Rows

Similar to the last transformation, I will be removing the footnote markers, this time from the values in the Location column. This will make the final data more readable and help when merging the other datasets together.

In [13]:
# Remove footnote markers such as "[c]" or "[e]" from the location names.
# The pattern is a raw string so that "\[" reaches the regex engine as written
# instead of being treated by Python as an (invalid) string escape.
vaccines["Location"] = vaccines["Location"].str.replace(r"\[[a-z]+\]", "", regex=True)

In [14]:
# view table: location names should no longer carry bracketed footnote markers
vaccines

Unnamed: 0,Location,Vaccinated,Percent
0,World,5441863836,68.24%
1,China,1303832000,91.44%
2,India,1026977435,72.47%
3,European Union,337933911,75.07%
4,United States,267032055,80.43%
...,...,...,...
217,Montserrat,2104,47.63%
218,Niue,1650,102.23%
219,Tokelau,1494,78.92%
220,Pitcairn Islands,47,100.00%


## Transformation 5- Fix Column Types

The Vaccinated and Percent columns are both of type object. However, since they hold numbers, we should convert them to a numeric type (float). This will involve removing the percent sign as well.

In [15]:
# Remove the "%" character so the Percent values can be parsed as numbers.
percent_text = vaccines["Percent"].str.replace("%", "", regex=True)
vaccines["Percent"] = percent_text

In [16]:
# Cast both measure columns to float; values are first normalised to str so the
# float conversion parses every cell from text.
numeric_columns = ["Percent", "Vaccinated"]
vaccines[numeric_columns] = vaccines[numeric_columns].astype(str).astype(float)

In [17]:
# view table: Vaccinated and Percent should now display as floats
vaccines

Unnamed: 0,Location,Vaccinated,Percent
0,World,5.441864e+09,68.24
1,China,1.303832e+09,91.44
2,India,1.026977e+09,72.47
3,European Union,3.379339e+08,75.07
4,United States,2.670321e+08,80.43
...,...,...,...
217,Montserrat,2.104000e+03,47.63
218,Niue,1.650000e+03,102.23
219,Tokelau,1.494000e+03,78.92
220,Pitcairn Islands,4.700000e+01,100.00


In [18]:
# Rescale Vaccinated from a raw head count to millions of people for readability.
# NOTE: this overwrites the column, so running the cell twice divides twice.
people_in_millions = vaccines["Vaccinated"] / 1000000
vaccines["Vaccinated"] = people_in_millions

## Final Dataset


In [19]:
# view table: final cleaned dataset, with Vaccinated expressed in millions
vaccines

Unnamed: 0,Location,Vaccinated,Percent
0,World,5441.863836,68.24
1,China,1303.832000,91.44
2,India,1026.977435,72.47
3,European Union,337.933911,75.07
4,United States,267.032055,80.43
...,...,...,...
217,Montserrat,0.002104,47.63
218,Niue,0.001650,102.23
219,Tokelau,0.001494,78.92
220,Pitcairn Islands,0.000047,100.00


In [20]:
# Write the cleaned table relative to the current working directory, so the export
# works on any machine or checkout rather than only where one user's
# C:\Users\...\dsc540 folder exists.
vaccines.to_csv("vaccines_cleaned.csv", index=False)

## Ethical Implications of Data Wrangling

Compared to the previous milestone, the data cleaning steps in this milestone do not come with many ethical implications. In fact, the only real ethical implication arose when I removed the footnotes from the table. Those footnotes include important information about the data itself. For instance, they qualify what counts as being vaccinated and, for some locations, specify which territories are included in the count. This information, while burdensome from a data analysis perspective, is still necessary for a holistic understanding of the data. Someone trying to find information on Vatican City vaccination rates would be surprised not to find it in this table if it weren't for the footnote marking it as part of Italy. Depending on the depth of one's analysis, it would raise ethical concerns to omit this information if it changes how one views the conclusions. Of all the ethical problems that can arise from data wrangling, however, this is a minor one.


Looking forward though, I can see that I am definitely going to have to make some strong choices about which countries to include when merging all the datasets. Since some territories are not included in one or the other, I will have to be careful in thinking through which to keep and which to discard.