# Downloading

In [1]:
## Download every csv_*.zip archive listed in the ACS 2013 3-year PUMS
## directory into the current working directory.
baseurl <- "http://www2.census.gov/acs2013_3yr/pums/"
## scan() splits the raw HTML on whitespace, giving one token per element
a <- scan(baseurl, "character")
## remove junk: keep only the tokens that mention a csv archive
a <- a[grep("csv", a)]
## extract the file names by pattern rather than by fixed character
## positions, so a change in the surrounding markup cannot shift them;
## unique() guards against the same archive being linked twice
a <- unique(regmatches(a, regexpr("csv_[[:alnum:]]+\\.zip", a)))
## get the links to the zip files
aurl <- paste0(baseurl, a)
## Download the files automatically
for (i in seq_along(a)) {
    print(paste0("Downloading ", a[i]))
    ## mode = "wb": zip archives are binary and must not be translated.
    ## A failed download is reported and skipped so that one bad file
    ## does not abort the rest of the batch.
    tryCatch(
        download.file(aurl[i], a[i], mode = "wb"),
        error = function(e) {
            warning("Failed to download ", a[i], ": ",
                    conditionMessage(e), call. = FALSE)
        }
    )
    ## To avoid overwhelming the server,
    ## add pauses between downloads to mimic human clicking
    Sys.sleep(10)
}

[1] "Downloading csv_hak.zip"
[1] "Downloading csv_hal.zip"


# Regular Expressions

In [2]:
## Toy DNA sequences used by the regular-expression examples
myDNA <- c(
    "ATCG",
    "CCATCG",
    "ATGCG",
    "ATTTCCG",
    "ATTTTTTCGGGG",
    "ATTCGGGATCG"
)

## Match

In [3]:
################################################################
## Match: find which strings contain the wanted pattern
################################################################
## An unanchored pattern hits every string that merely CONTAINS "ATCG",
## so this is not an exact match
grep("ATCG", myDNA, value = TRUE)

## Anchoring with ^ and $ forces the whole string to be exactly "ATCG"
grep("^ATCG$", myDNA, value = TRUE)

## `?` makes the second T optional: matches ATCG or ATTCG
grep("ATT?CG", myDNA, value = TRUE)

## `*` allows zero or more extra T's: ATCG, ATTCG, ATTTCG, ...
grep("ATT*CG", myDNA, value = TRUE)

## `+` means one or more. ATT+CG needs at least two T's, so it covers the
## `*` cases except ATCG; AT+CG needs at least one T, so ATCG matches again
grep("ATT+CG", myDNA, value = TRUE)
grep("AT+CG", myDNA, value = TRUE)


## Quantifiers are greedy by default, but for a plain hit/no-hit search the
## surrounding .* adds nothing: both calls select the same strings
grep(".*A.*", myDNA, value = TRUE)
grep("A", myDNA, value = TRUE)

## .+ on both sides demands at least one character before and after an A
grep(".+A.+", myDNA, value = TRUE)
## starts with a run of A/T and ends with a run of G
grep("^[AT]+.*[G]+$", myDNA, value = TRUE)

## Replace

In [4]:
################################################################
## Replace: substitute the text matched by a pattern
################################################################

## sub() rewrites only the FIRST match in each string; here ATCG or ATTCG
sub(pattern = "ATT?CG", replacement = "haha", x = myDNA)

## same idea, with one or more T's allowed between A and CG
sub(pattern = "AT+CG", replacement = "haha", x = myDNA)

## gsub() rewrites EVERY match in each string
gsub(pattern = "AT+CG", replacement = "haha", x = myDNA)

## \\1 refers back to the text captured by the parenthesised group,
## so the matched text itself can be repeated in the replacement
sub(pattern = "(AT+CG)", replacement = "\\1_\\1_haha", x = myDNA)

## Split

In [5]:
################################################################
## Split: cut each string at every match of AT+CG
## (the matched text itself is dropped from the pieces)
################################################################
strsplit(x = myDNA, split = "AT+CG")

## Match Details

In [6]:
################################################################
## Matching details: where each match starts and how long it is
################################################################
## regexpr(): start of the FIRST match per string (-1 when there is none),
## with the match lengths in the "match.length" attribute
regexpr(pattern = "AT+CG", text = myDNA)

## gregexpr(): the same information for ALL matches, one list element
## per input string
gregexpr(pattern = "AT+CG", text = myDNA)


## Special Conventions

In [8]:
################################################################
##  R special convention: a backslash must itself be escaped
################################################################
## An unescaped "." is the regex wildcard, so EVERY character is treated
## as a separator and only empty strings are left
strsplit(x = "myDNA.txt", split = ".")
## "\." is not a valid escape inside an R string literal, so the next
## line would fail to parse if it were uncommented:
# strsplit("myDNA.txt", "\.")
## "\\." is how R spells the regex \. , i.e. a literal dot
strsplit(x = "myDNA.txt", split = "\\.")

## More Complex Examples

In [9]:
################################################################
## A More Complex Example
################################################################
## Pull several pieces out of a URL in one pass by using capture groups
## (the parenthesised parts of the pattern):
##   1: scheme plus "://" (optional)   2: scheme alone
##   3: host                           4: first path segment
##   5: remainder of the path
x <- "http://en.wikipedia.org/wiki/Data_science"
m <- regexec(pattern = "^(([^:]+)://)?([^:/]+)(/[^/]+)(/.*)", text = x)
## match positions and lengths: whole match first, then each group
m
## the matched substrings themselves
regmatches(x, m)

In [10]:
## Same URL-splitting pattern applied to a second address; the capture
## groups are scheme + "://", scheme, host, first path segment, and the
## rest of the path
x <- "https://sph.uth.edu/dept/bads/"
m <- regexec(pattern = "^(([^:]+)://)?([^:/]+)(/[^/]+)(/.*)", text = x)
## match positions and lengths: whole match first, then each group
m
## the matched substrings themselves
regmatches(x, m)

# Web Scraping

In [12]:
library(RCurl)
## Fetch the page source as a single string
theurl <- "https://en.wikipedia.org/wiki/COVID-19"
a <- getURL(theurl)
## Re-read that string through a text connection so that `a` becomes one
## element per line, which is easier to read and to search
tc <- textConnection(a)
a <- readLines(tc)
close(tc)
## Show the line(s) that mention the IFR table caption
grep("IFR estimate per age group", a, value = TRUE)

In [16]:
## readHTMLTable() comes from the XML package
library(XML)
## Parse the HTML held in `a` (the page lines read in the previous cell).
## NOTE(review): presumably returns a list with one entry per <table> in
## the page -- confirm against the XML package documentation.
tbl <- readHTMLTable(a)
## Single brackets keep the list wrapper, so this shows a one-element list
## holding the second table (use tbl[[2]] for the table itself)
tbl[2]   

V1,V2
<chr>,<chr>
Age group,IFR
0–34,0.004%
35–44,0.068%
45–54,0.23%
55–64,0.75%
65–74,2.5%
75–84,8.5%
85 +,28.3%


# Test

In [7]:
## Quiz strings for the three patterns below
test_strings <- c("Zab", "H", "aZc", "b", "XXcc")
## starts with one capital letter, followed by zero or more lowercase letters
regexpr(pattern = "^[A-Z][a-z]*", text = test_strings)
## starts with one capital letter, followed by at least one lowercase letter
regexpr(pattern = "^[A-Z][a-z]+", text = test_strings)
## anywhere: a non-capital character followed by one or more
## non-lowercase characters (inside [ ], a leading ^ negates the class)
regexpr(pattern = "[^A-Z][^a-z]+", text = test_strings)