<a href="https://colab.research.google.com/github/rachel0201/data-science/blob/master/R_webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [None]:
require("httr")
require("rvest")

In [None]:
library(httr)
library(rvest)

## TASK 1: Get a `COVID-19 pandemic` Wiki page using an HTTP request

In [None]:
#' Download and parse the Wikipedia "COVID-19 testing by country" template.
#'
#' @return The parsed HTML document produced by `read_html()`.
get_wiki_covid19_page <- function() {
  # Template page that hosts the testing-by-country table
  page_url <- "https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country"
  # Fetch and parse in one step; the parsed document is the function's value
  read_html(page_url)
}

In [None]:
# Download the wiki page once and keep the parsed document for the later tasks
covid19_page <- get_wiki_covid19_page()
# Echo the parsed document to confirm that the page was retrieved
covid19_page

{html_document}
<html class="client-nojs" lang="en" dir="ltr">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-10 ns-subject ...

## TASK 2: Extract COVID-19 testing data table from the wiki HTML page

In [None]:
# Select the first <table> element on the page
root_node <- html_node(covid19_page, css = "table")
# Echo the node to check that the expected table was picked up
root_node

{html_node}
<table class="wikitable plainrowheaders sortable collapsible autocollapse">
[1] <caption>\n<style data-mw-deduplicate="TemplateStyles:r1054937957">.mw-pa ...
[2] <tbody>\n<tr>\n<th>Country or region\n</th>\n<th>Date<sup id="cite_ref-1" ...

In [None]:
# Convert the <table> node into a data frame
df_table <- html_table(x = root_node)
# Show the raw table, followed by a per-column overview
df_table
summary(object = df_table)

Country or region,Date[a],Tested,Units[b],Confirmed(cases),"Confirmed /tested,%","Tested /population,%","Confirmed /population,%",Ref.
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Afghanistan,17 Dec 2020,154767,samples,49621,32.1,0.40,0.13,[1]
Albania,18 Feb 2021,428654,samples,96838,22.6,15.0,3.4,[2]
Algeria,2 Nov 2020,230553,samples,58574,25.4,0.53,0.13,[3][4]
Andorra,15 Nov 2021,225568,samples,15907,7.1,291,20.5,[5]
Angola,12 Mar 2021,399228,samples,20981,5.3,1.3,0.067,[6]
Antigua and Barbuda,6 Mar 2021,15268,samples,832,5.4,15.9,0.86,[7]
Argentina,19 Nov 2021,25854639,samples,5313607,20.6,57.0,11.7,[8]
Armenia,19 Nov 2021,2289360,samples,332713,14.5,77.6,11.3,[9]
Australia,19 Nov 2021,46318354,samples,195618,0.42,185,0.78,[10]
Austria,19 Nov 2021,105325185,samples,1026931,0.98,1183,11.5,[11]


 Country or region    Date[a]             Tested            Units[b]        
 Length:173         Length:173         Length:173         Length:173        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
 Confirmed(cases)   Confirmed /tested,% Tested /population,%
 Length:173         Length:173          Length:173          
 Class :character   Class :character    Class :character    
 Mode  :character   Mode  :character    Mode  :character    
 Confirmed /population,%     Ref.          
 Length:173              Length:173        
 Class :character        Class :character  
 Mode  :character        Mode  :character  

## TASK 3: Pre-process and export the extracted data frame

In [None]:
#' Clean the scraped COVID-19 testing table.
#'
#' Drops the aggregate "World" row and the trailing (last) row, removes the
#' `Units[b]` and `Ref.` columns, renames the remaining seven columns and
#' converts them to analysis-friendly types.
#'
#' @param data_frame Data frame with the columns `Country or region`,
#'   `Date[a]`, `Tested`, `Units[b]`, `Confirmed(cases)`,
#'   `Confirmed /tested,%`, `Tested /population,%`,
#'   `Confirmed /population,%` and `Ref.`, in that order.
#' @return A data frame with the columns `country` and `date` (factors) and
#'   `tested`, `confirmed`, `confirmed.tested.ratio`,
#'   `tested.population.ratio`, `confirmed.population.ratio` (numeric).
preprocess_covid_data_frame <- function(data_frame) {
  # Remove the World row. %in% never yields NA, so a row whose country name
  # is missing is kept as-is instead of being turned into an all-NA row.
  is_world <- data_frame[["Country or region"]] %in% "World"
  data_frame <- data_frame[!is_world, ]

  # Remove the last row, whatever the current size of the table is. A fixed
  # 1:172 slice pads short tables with NA rows and truncates longer ones.
  data_frame <- utils::head(data_frame, -1L)

  # We don't need the Units and Ref columns, so they can be removed
  data_frame["Ref."] <- NULL
  data_frame["Units[b]"] <- NULL

  # Rename the seven remaining columns (positional, so the order matters)
  names(data_frame) <- c(
    "country", "date", "tested", "confirmed",
    "confirmed.tested.ratio", "tested.population.ratio",
    "confirmed.population.ratio"
  )

  # Numbers arrive as text and may contain thousands separators ("1,000")
  to_number <- function(x) {
    as.numeric(gsub(",", "", x, fixed = TRUE))
  }

  # Convert column data types
  data_frame$country <- as.factor(data_frame$country)
  data_frame$date <- as.factor(data_frame$date)
  numeric_columns <- c(
    "tested", "confirmed", "confirmed.tested.ratio",
    "tested.population.ratio", "confirmed.population.ratio"
  )
  for (column in numeric_columns) {
    data_frame[[column]] <- to_number(data_frame[[column]])
  }

  data_frame
}

In [None]:
# Clean the scraped table; the result is what gets exported in the next step
preprocess_data <- preprocess_covid_data_frame(data_frame = df_table)

In [None]:
# Get working directory
wd <- getwd()
# Build the export path inside the working directory
file_path <- file.path(wd, "covid.csv")
print(file_path)
# Export the data frame to a csv file. row.names = FALSE keeps the row index
# out of the file, so reading it back does not produce an extra unnamed column.
write.csv(preprocess_data, file_path, row.names = FALSE)
# Check for the file only after writing, so the result reflects this export
file.exists(file_path)

[1] "/content/covid.csv"


## TASK 4: Get a subset of the extracted data frame

In [None]:
# Read the exported data back from covid.csv in the working directory, which
# is where the export step writes it. A hard-coded "/content/..." path only
# resolves when the working directory happens to be /content.
csv_file <- read.csv(file.path(getwd(), "covid.csv"))

In [None]:
# Rows 5 through 10, restricted to the "country" and "confirmed" columns
columns <- c("country", "confirmed")
csv_file[seq(5, 10), columns]

Unnamed: 0_level_0,country,confirmed
Unnamed: 0_level_1,<chr>,<int>
5,Angola,20981
6,Antigua and Barbuda,832
7,Argentina,5313607
8,Armenia,332713
9,Australia,195618
10,Austria,1026931


## Task 5: Calculate worldwide COVID testing positive ratio

In [None]:
# Only use rows where both counts are present: a single NA would otherwise
# turn the sums (and the ratio) into NA, and dropping NAs per column would
# make the two totals cover different sets of countries.
has_counts <- complete.cases(csv_file[, c("confirmed", "tested")])
# calculate the total confirmed cases
total_confirmed <- sum(csv_file[has_counts, "confirmed"])
# calculate the total tests
total_test <- sum(csv_file[has_counts, "tested"])
print("worldwide COVID confirmed cases")
total_confirmed
print("worldwide COVID tested cases")
total_test
# calculate the testing positive ratio
ratio <- total_confirmed / total_test
print("worldwide COVID testing positive ratio")
ratio


[1] "worldwide COVID confirmed cases"


[1] "worldwide COVID tested cases"


[1] "worldwide COVID testing positive ratio"


## TASK 6: Get a list of the countries that reported their testing data

In [None]:
# Pull the country column out of the data frame as a plain vector
country_list <- csv_file[["country"]]

In [None]:
# Country names as strings in ascending order (sort() ascends by default)
new_country_in <- sort(as.character(country_list))
# The same names in descending order; this is the list that is displayed
new_country <- sort(as.character(country_list), decreasing = TRUE)
new_country

## TASK 7: Identify countries names with a specific pattern

In [None]:
# Return the names (not the positions) that contain "United" followed by at
# least one more character
grep(pattern = "United.+", x = country_list, value = TRUE)

## TASK 8: Pick two countries you are interested in, and then review their testing data

In [None]:
# Columns needed to compare countries by confirmed cases relative to population
compare_co <- c("country", "confirmed", "confirmed.population.ratio")
# The two countries under review are the first two rows of the data set
compare_country <- csv_file[seq_len(2), compare_co]
compare_country

Unnamed: 0_level_0,country,confirmed,confirmed.population.ratio
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,Afghanistan,49621,0.13
2,Albania,96838,3.4


## TASK 9: Compare which one of the selected countries has a larger ratio of confirmed cases to population

In [None]:
# Print the country with the larger confirmed-to-population ratio; when the
# two ratios are equal the second country is printed.
if (compare_country[1, "confirmed.population.ratio"] >
      compare_country[2, "confirmed.population.ratio"]) {
  print(compare_country[1, "country"])
} else {
  print(compare_country[2, "country"])
}
# Message text fixed: it previously read "COVI-19"
print(" has higher COVID-19 infection risk")


[1] "Albania"
[1] " has higher COVI-19 infection risk"


## TASK 10: Find countries with a confirmed-to-population ratio less than a threshold

In [None]:
country_10 <- csv_file[, compare_co]
# NOTE(review): the source column is headed "Confirmed /population,%", so
# 0.01 here means 0.01 percent - confirm that this is the intended threshold.
threshold <- 0.01
# Compare every row against the threshold in one vectorized step. The earlier
# loop iterated over range(nrow(...)), which in R is c(n, n), and always
# inspected row 1, so it printed the first country twice. The comparison is
# "<" because the task asks for ratios less than the threshold.
# which() drops rows whose ratio is NA instead of selecting them.
below_threshold <- which(country_10[["confirmed.population.ratio"]] < threshold)
print(country_10[below_threshold, "country"])


[1] "Afghanistan"
[1] "Afghanistan"
