In [1]:
require("httr")
require("rvest")

Loading required package: httr

Loading required package: rvest



In [2]:
library(httr)
library(rvest)

In [8]:
# Request the Wikipedia "COVID-19 testing by country" template page.
#
# Takes no arguments. Issues an HTTP GET against the MediaWiki index.php
# endpoint, passing the page title as a query parameter, and returns the
# httr response object unchanged (no status check is performed here).
get.data <- function() {
    GET(
        url = "https://en.wikipedia.org/w/index.php",
        query = list(title = "Template:COVID-19_testing_by_country")
    )
}

In [9]:
# Call get.data() and display the HTTP response it returns
get.data()

Response [https://en.wikipedia.org/w/index.php?title=Template%3ACOVID-19_testing_by_country]
  Date: 2021-11-25 19:19
  Status: 200
  Content-Type: text/html; charset=UTF-8
  Size: 409 kB
<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Template:COVID-19 testing by country - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames...
"CS1 German-language sources (de)","CS1 Azerbaijani-language sources (az)","C...
"CS1 uses Japanese-language script (ja)","CS1 Japanese-language sources (ja)"...
"COVID-19 pandemic templates"],"wgPageContentLanguage":"en","wgPageContentMod...
"Q87325019","wgGENewcomerTasksGuidanceEnabled":true,"wgGEAskQuestionEnabled":...
...

In [10]:
# Full URL of the Wikipedia COVID-19 testing-by-country template page: the
# same endpoint and page title that get.data() sends as a query parameter.
url <- "https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country"

Get the root HTML node from the HTTP response obtained in Task 1.

In [11]:
# Parse the page at `url` into an HTML document and show its top-level nodes.
# NOTE(review): this reads the page by URL rather than reusing the response
# returned by get.data().
html.node <- read_html(url)
print(html.node)

{html_document}
<html class="client-nojs" lang="en" dir="ltr">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-10 ns-subject ...


In [13]:
# Parse the page at `url` into an HTML document; root.node is the document
# that table.node is extracted from.
root.node <- read_html(url)
print(root.node)

{html_document}
<html class="client-nojs" lang="en" dir="ltr">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-10 ns-subject ...


In [14]:
# Select a single <table> element from the document. html_node() returns only
# the first match for the selector, so any later tables are ignored.
table.node <- html_node(root.node, "table")
print(table.node)

{html_node}
<table class="wikitable plainrowheaders sortable collapsible autocollapse">
[1] <caption>\n<style data-mw-deduplicate="TemplateStyles:r1054937957">.mw-pa ...
[2] <tbody>\n<tr>\n<th>Country or region\n</th>\n<th>Date<sup id="cite_ref-1" ...


In [16]:
# Convert the HTML table node into a data frame and preview its first 20 rows.
df <- html_table(table.node)
print(head(df, 20))

[90m# A tibble: 20 × 9[39m
   `Country or region` `Date[a]`   Tested      `Units[b]` `Confirmed(cases)`
   [3m[90m<chr>[39m[23m               [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m      [3m[90m<chr>[39m[23m             
[90m 1[39m Afghanistan         17 Dec 2020 154,767     samples    49,621            
[90m 2[39m Albania             18 Feb 2021 428,654     samples    96,838            
[90m 3[39m Algeria             2 Nov 2020  230,553     samples    58,574            
[90m 4[39m Andorra             15 Nov 2021 225,568     samples    15,907            
[90m 5[39m Angola              12 Mar 2021 399,228     samples    20,981            
[90m 6[39m Antigua and Barbuda 6 Mar 2021  15,268      samples    832               
[90m 7[39m Argentina           22 Nov 2021 25,918,391  samples    5,315,989         
[90m 8[39m Armenia             22 Nov 2021 2,311,186   samples    334,347           
[90m 9[39m Australia    

In [18]:
# Per-column summary of the raw table, used to inspect column types before
# any cleaning is applied.
summary(df)

 Country or region    Date[a]             Tested            Units[b]        
 Length:173         Length:173         Length:173         Length:173        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
 Confirmed(cases)   Confirmed /tested,% Tested /population,%
 Length:173         Length:173          Length:173          
 Class :character   Class :character    Class :character    
 Mode  :character   Mode  :character    Mode  :character    
 Confirmed /population,%     Ref.          
 Length:173              Length:173        
 Class :character        Class :character  
 Mode  :character        Mode  :character  

In [22]:
# Clean the raw COVID-19 testing table scraped from Wikipedia.
#
# Args:
#   df: data frame (or tibble) with the scraped columns `Country or region`,
#       `Date[a]`, `Tested`, `Units[b]`, `Confirmed(cases)`, the three ratio
#       columns and `Ref.`. Once `Units[b]` and `Ref.` are removed, the seven
#       remaining columns must be in the order of the new names assigned
#       below.
#
# Returns:
#   The data frame without the "World" row and without its final row, with
#   the Units and Ref columns removed, the columns renamed, Country and Date
#   stored as factors and the five count/ratio columns stored as numeric.
preprocess.df <- function(df) {
    # Remove the World row. %in% never yields NA, so a missing country name
    # keeps its row instead of turning it into an all-NA row.
    df <- df[!(df[["Country or region"]] %in% "World"), ]

    # Remove the last row (presumably a non-data trailer row; confirm against
    # the scraped table). Counting from the end instead of slicing a fixed
    # 1:172 keeps this correct when the table gains or loses rows: a fixed
    # range would cut real rows or pad the result with all-NA rows.
    df <- head(df, -1L)

    # Remove the Units and Ref columns.
    df <- df[!(names(df) %in% c("Ref.", "Units[b]"))]

    # Rename the remaining columns positionally.
    names(df) <- c(
        "Country", "Date", "Tested", "Confirmed",
        "Confirmed.Test.Ratio", "Tested.Population.Ratio",
        "Confirmed.Population.Ratio"
    )

    # Convert column data types.
    df$Country <- as.factor(df$Country)
    df$Date <- as.factor(df$Date)

    # The numeric columns arrive as text with thousands separators
    # (e.g. "25,918,391"); strip the commas before converting.
    numeric.cols <- c(
        "Tested", "Confirmed", "Confirmed.Test.Ratio",
        "Tested.Population.Ratio", "Confirmed.Population.Ratio"
    )
    for (col in numeric.cols) {
        df[[col]] <- as.numeric(gsub(",", "", df[[col]], fixed = TRUE))
    }

    df
}

In [23]:
# Replace the raw table with its cleaned version and preview the first 20 rows.
df <- preprocess.df(df)
print(head(df, 20))

[90m# A tibble: 20 × 7[39m
   Country             Date   Tested Confirmed Confirmed.Test.… Tested.Populati…
   [3m[90m<fct>[39m[23m               [3m[90m<fct>[39m[23m   [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m            [3m[90m<dbl>[39m[23m            [3m[90m<dbl>[39m[23m
[90m 1[39m Afghanistan         17 De… 1.55[90me[39m5     [4m4[24m[4m9[24m621            32.1              0.4 
[90m 2[39m Albania             18 Fe… 4.29[90me[39m5     [4m9[24m[4m6[24m838            22.6             15   
[90m 3[39m Algeria             2 Nov… 2.31[90me[39m5     [4m5[24m[4m8[24m574            25.4              0.53
[90m 4[39m Andorra             15 No… 2.26[90me[39m5     [4m1[24m[4m5[24m907             7.1            291   
[90m 5[39m Angola              12 Ma… 3.99[90me[39m5     [4m2[24m[4m0[24m981             5.3              1.3 
[90m 6[39m Antigua and Barbuda 6 Mar… 1.53[90me[39m4       832             5.4             15.9 