Skip to content
master
Go to file
Code

Latest commit

 

Git stats

Files

Permalink
Failed to load latest commit information.
Type
Name
Latest commit message
Commit time
R
 
 
 
 
 
 
man
 
 
src
 
 
 
 
 
 
 
 
 
 
 
 
 
 

README.md

sparkwarc - WARC files in sparklyr

Install

Install with:

devtools::install_github("javierluraschi/sparkwarc")

Intro

The following example loads a very small subset of a WARC file from Common Crawl, a nonprofit 501(c)(3) organization that crawls the web and freely provides its archives and datasets to the public.

# Attach sparkwarc together with the sparklyr, DBI and dplyr packages.
library(sparkwarc)
library(sparklyr)
library(DBI)
library(dplyr)
# Open a connection to a local Spark instance.
sc <- spark_connect(master = "local")
## * Using Spark: 2.1.0
# Read the sample WARC file shipped with the sparkwarc package into a Spark
# table registered under the name "warc" (called with `repartition = 8`).
spark_read_warc(
  sc,
  "warc",
  system.file("samples/sample.warc.gz", package = "sparkwarc"),
  repartition = 8)
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<html', 0)) > 0
count(value)
6
# Summarise the values captured in the `regval` column of `ops`.
#
# `ops` is a table with a `regval` column. Rows whose `regval` is the empty
# string are dropped, the remaining values are counted, and the first 100
# rows are returned ordered by descending `count`.
cc_regex <- function(ops) {
  # Discard rows with an empty capture.
  non_empty <- filter(ops, regval != "")

  # One row per distinct captured value, with its frequency.
  tallied <- summarize(group_by(non_empty, regval), count = n())

  # Most frequent values first, capped at 100 rows.
  head(arrange(tallied, desc(count)), 100)
}

# Frequency table of the first capture group of `regex` over the "warc" table.
#
# `regex` is passed to `regexp_extract()` together with the `value` column and
# group index 1; the extracted values are then summarised by `cc_regex()`.
# NOTE: this reads the connection from the variable `sc` in the calling
# session rather than taking it as an argument.
cc_stats <- function(regex) {
  warc_tbl <- tbl(sc, "warc")
  extracted <- transmute(warc_tbl, regval = regexp_extract(value, regex, 1))
  cc_regex(extracted)
}
cc_stats("http-equiv=\"Content-Language\" content=\"(.*)\"")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##   regval count
##    <chr> <dbl>
## 1  ru-RU     5
cc_stats("<script .*src=\".*/(.+)\".*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##                            regval count
##                             <chr> <dbl>
## 1                           08.js     5
## 2                           ga.js     5
## 3 jquery.formtips.1.2.2.packed.js     5
## 4   jquery-ui-1.7.2.custom.min.js     5
## 5             jquery-1.4.2.min.js     5
## 6                        start.js     5
## 7           jquery.equalHeight.js     5
## 8                      lytebox.js     5
## 9                      plusone.js     5
cc_stats("<([a-zA-Z]+)>")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##      regval count
##       <chr> <dbl>
##  1       li    53
##  2     span    26
##  3       th    18
##  4        p    17
##  5       ul    16
##  6       tr    13
##  7   strong     7
##  8    title     6
##  9     body     6
## 10     head     6
## 11      div     6
## 12 noscript     5
## 13    table     3
## 14       td     3
## 15       br     1
## 16    style     1
cc_stats(" ([a-zA-Z]{5,10}) ")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##      regval count
##       <chr> <dbl>
##  1  counter    10
##  2   PUBLIC     6
##  3   return     6
##  4  Banners     5
##  5   widget     5
##  6 function     5
##  7   Banner     5
##  8    solid     2
##  9    Nutch     1
## 10   Domain     1
## 11    visit     1
## 12    crawl     1
## 13 Registry     1
## 14   Parked     1
## 15   Format     1
## 16 priceUAH     1
## 17   domain     1
cc_stats("<meta .*keywords.*content=\"([^,\"]+).*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##                               regval count
##                                <chr> <dbl>
## 1                                Лес     1
## 2                           Вип Степ     1
## 3                       domain names     1
## 4 Регистрация-ликвидация предприятий     1
## 5                            Свобода     1
## 6                               Foxy     1
cc_stats("<script .*src=\".*/([^/]+.js)\".*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##                            regval count
##                             <chr> <dbl>
## 1 jquery.formtips.1.2.2.packed.js     5
## 2                           08.js     5
## 3                           ga.js     5
## 4           jquery.equalHeight.js     5
## 5                      lytebox.js     5
## 6                      plusone.js     5
## 7   jquery-ui-1.7.2.custom.min.js     5
## 8             jquery-1.4.2.min.js     5
## 9                        start.js     5
spark_disconnect(sc)

Querying 1GB

# Local destination for a 5GB WARC file. `mustWork = FALSE` still expands the
# tilde but does not warn when the file has not been downloaded yet.
warc_big <- normalizePath("~/cc.warc.gz", mustWork = FALSE)
if (!file.exists(warc_big)) {
  # Map the S3 bucket URL of the first archive file into a downloadable
  # HTTPS URL. The prefix is matched literally, not as a regular expression.
  warc_url <- gsub(
    "s3n://commoncrawl/",
    "https://commoncrawl.s3.amazonaws.com/",
    sparkwarc::cc_warc(1),
    fixed = TRUE
  )
  # Write in binary mode so the gzip archive is stored unmodified.
  download.file(warc_url, warc_big, mode = "wb")
}
# Start from the configuration returned by spark_config() and override the
# memory-related settings before connecting.
config <- spark_config()
config[["spark.memory.fraction"]] <- "0.9"
config[["spark.executor.memory"]] <- "10G"
config[["sparklyr.shell.driver-memory"]] <- "10G"

# Connect to a local Spark instance using the customised configuration.
sc <- spark_connect(master = "local", config = config)
## * Using Spark: 2.1.0
# Read the archive at `warc_big` into the Spark table named "warc".
spark_read_warc(
  sc,
  "warc",
  warc_big,
  repartition = 8)

df <- data.frame(list(a = list("a,b,c")))

SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<([a-z]+)>', 0)) > 0
count(value)
6336761
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<html', 0)) > 0
count(value)
74519
cc_stats("http-equiv=\"Content-Language\" content=\"([^\"]*)\"")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##    regval count
##     <chr> <dbl>
##  1     en   533
##  2  en-us   323
##  3     ru   150
##  4     es   127
##  5  en-US   105
##  6     fr    95
##  7     de    92
##  8     pl    71
##  9     cs    48
## 10     ja    45
## # ... with 90 more rows
cc_stats("WARC-Target-URI: http://([^/]+)/.*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##                        regval count
##                         <chr> <dbl>
##  1    www.urbandictionary.com   156
##  2                 my-shop.ru    69
##  3 hfboards.hockeysfuture.com    69
##  4      www.greatlakes4x4.com    66
##  5        www.opensecrets.org    60
##  6         www.summitpost.org    57
##  7             brainly.com.br    57
##  8         www.mobileread.com    54
##  9          www.genealogy.com    54
## 10               shop.ccs.com    51
## # ... with 90 more rows
cc_stats("<([a-zA-Z]+)>")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##    regval   count
##     <chr>   <dbl>
##  1     li 2492324
##  2   span  506471
##  3     tr  440658
##  4      p  432221
##  5     td  398106
##  6     ul  258962
##  7    div  211937
##  8 script  198504
##  9     br  196993
## 10 strong  152675
## # ... with 90 more rows
cc_stats("<meta .*keywords.*content=\"([a-zA-Z0-9]+).*")
## # Source:     lazy query [?? x 2]
## # Database:   spark_connection
## # Ordered by: desc(count)
##    regval count
##     <chr> <dbl>
##  1  width   285
##  2   http   235
##  3   free   110
##  4   text   110
##  5    The   100
##  6  index    91
##  7  https    85
##  8  SKYPE    59
##  9      1    55
## 10   news    48
## # ... with 90 more rows
spark_disconnect(sc)

Querying 1TB

By running sparklyr on Amazon EMR, one can configure an EMR cluster and load about 5GB of data using:

# Connect to Spark through YARN in client mode.
sc <- spark_connect(master = "yarn-client")
# Read the paths returned by cc_warc(1, 1) into the Spark table named "warc".
spark_read_warc(sc, "warc", cc_warc(1, 1))

# Count the rows in the "warc" table, then close all Spark connections.
tbl(sc, "warc") %>% summarize(n = n())
spark_disconnect_all()

To read the first 200 files, or about 1TB of data, first scale up the cluster and consider maximizing resource allocation with the following EMR config:

[
  {
    "Classification": "spark",
    "Properties": {
      "maximizeResourceAllocation": "true"
    }
  }
]

Followed by loading the [1, 200] file range with:

# Connect to Spark through YARN in client mode.
sc <- spark_connect(master = "yarn-client")
# Read the paths returned by cc_warc(1, 200) -- the [1, 200] file range --
# into the Spark table named "warc".
spark_read_warc(sc, "warc", cc_warc(1, 200))

# Count the rows in the "warc" table, then close all Spark connections.
tbl(sc, "warc") %>% summarize(n = n())
spark_disconnect_all()

To query ~1PB for the entire crawl, a custom script would be needed to load all the WARC files.

About

Load WARC files into Apache Spark with sparklyr

Resources

License

Packages

No packages published
You can’t perform that action at this time.