Permalink
Browse files

Added gutenberg_works function to do some common filtering of gutenberg_metadata.

Changes to how Gutenberg header and footer are stripped. Also made optional with strip argument

Various cleanups of data-raw scripts.
  • Loading branch information...
dgrtwo committed Apr 28, 2016
1 parent 5f3d4e4 commit 3443397e80c8e335334b232ae1ec2f294759b867
View
@@ -3,4 +3,5 @@
export(gutenberg_download)
export(gutenberg_get_mirror)
export(gutenberg_strip)
export(gutenberg_works)
import(dplyr)
View
@@ -30,17 +30,30 @@
#' library(dplyr)
#' library(stringr)
#'
#' gutenberg_metadata
#'
#' gutenberg_metadata %>%
#' count(author, sort = TRUE)
#'
#' # look for Shakespeare, excluding collections (containing "Works") and translations
#' shakespeare_metadata <- gutenberg_metadata %>%
#' filter(author == "Shakespeare, William",
#' language == "en",
#' !str_detect(title, "Works"),
#' has_text) %>%
#' has_text,
#' !str_detect(rights, "Copyright")) %>%
#' distinct(title)
#'
#' \dontrun{
#' shakespeare_works <- gutenberg_download(shakespeare_metadata$gutenberg_id)
#' }
#'
#' # note that gutenberg_works() function is a shortcut to some of the above
#'
#' shakespeare_metadata2 <- gutenberg_works(author == "Shakespeare, William",
#' !str_detect(title, "Works"))
#'
#' @seealso \link{gutenberg_works}, \link{gutenberg_authors}, \link{gutenberg_subjects}
"gutenberg_metadata"
@@ -74,13 +87,9 @@
#'
#' sherlock_holmes_subjects
#'
#' sherlock_holmes_metadata <- gutenberg_metadata %>%
#' semi_join(sherlock_holmes_subjects) %>%
#' filter(author == "Doyle, Arthur Conan",
#' language == "en",
#' has_text) %>%
#' arrange(gutenberg_id) %>%
#' distinct(title)
#' sherlock_holmes_metadata <- gutenberg_works() %>%
#' filter(author == "Doyle, Arthur Conan") %>%
#' semi_join(sherlock_holmes_subjects, by = "gutenberg_id")
#'
#' sherlock_holmes_metadata
#'
@@ -89,6 +98,8 @@
#'
#' holmes_books
#' }
#'
#' @seealso \link{gutenberg_metadata}, \link{gutenberg_authors}
"gutenberg_subjects"
@@ -112,4 +123,6 @@
#' \item{aliases}{List column of character vectors of aliases. If there
#' are multiple, they are "/"-delimited}
#' }
#'
#' @seealso \link{gutenberg_metadata}, \link{gutenberg_subjects}
"gutenberg_authors"
View
@@ -1 +1 @@
globalVariables(c(".", "gutenberg_id"))
globalVariables(c(".", "gutenberg_id", "language", "has_text", "gutenberg_metadata"))
View
@@ -2,11 +2,13 @@
#'
#' @param gutenberg_id A vector of Project Gutenberg ID
#' @param mirror Optionally a mirror URL to retrieve the books from
#' @param strip Whether to strip suspected headers and footers
#' @param ... Extra arguments passed to \code{\link{gutenberg_strip}}
#'
#' @import dplyr
#'
#' @export
gutenberg_download <- function(gutenberg_id, mirror = NULL) {
gutenberg_download <- function(gutenberg_id, mirror = NULL, strip = TRUE, ...) {
if (is.null(mirror)) {
mirror <- gutenberg_get_mirror()
}
@@ -47,10 +49,17 @@ gutenberg_download <- function(gutenberg_id, mirror = NULL) {
ret <- full_url %>%
purrr::map(try_download) %>%
purrr::discard(is.null) %>%
purrr::map(gutenberg_strip) %>%
purrr::map_df(~data_frame(text = .), .id = "gutenberg_id") %>%
mutate(gutenberg_id = as.integer(gutenberg_id))
if (strip) {
ret <-
ret %>%
group_by(gutenberg_id) %>%
do(data_frame(text = gutenberg_strip(.$text))) %>%
ungroup()
}
ret
}
@@ -62,33 +71,33 @@ gutenberg_download <- function(gutenberg_id, mirror = NULL) {
#' will also not strip tables of contents, prologues, or other text
#' that appears at the start of a book.
#'
#' @param text A character vector of book text
#' @param text A character vector with lines of a book
#'
#' @export
gutenberg_strip <- function(text) {
text[is.na(text)] <- ""
starting_regex <- "^\\*\\*\\*.*PROJECT GUTENBERG"
start_after <- which(stringr::str_detect(text, starting_regex))[1]
if (!is.na(start_after)) {
text <- tail(text, -(start_after))
}
starting_regex <- "(^\\*\\*\\*.*PROJECT GUTENBERG|END .*SMALL PRINT)"
text <- discard_start_while(text, !stringr::str_detect(text, starting_regex))[-1]
# also discard rest of "paragraph"
text <- discard_start_while(text, text != "")
ending_regex <- "^(End of .*Project Gutenberg.*|\\*\\*\\*.*END OF.*PROJECT GUTENBERG)"
stop_before <- which(stringr::str_detect(text, ending_regex))[1]
if (!is.na(stop_before)) {
text <- head(text, stop_before - 1)
text <- keep_while(text, !stringr::str_detect(text, ending_regex))
# strip empty lines from start and end
text <- discard_start_while(text, text == "")
# also paragraphs at the start that are meta-data
start_paragraph_regex <- "(produced by|prepared by|transcribed from|project gutenberg|^note: )"
while (length(text) > 0 &&
stringr::str_detect(stringr::str_to_lower(text[1]), start_paragraph_regex)) {
# get rid of that paragraph, then the following whitespace
text <- discard_start_while(text, text != "")
text <- discard_start_while(text, text == "")
}
  # strip empty lines from start and end
if (text[1] == "") {
text <- tail(text, -(min(which(text != "")) - 1))
}
if (tail(text, 1) == "") {
text <- head(text, max(which(text != "")))
}
text <- discard_end_while(text, text == "")
text
}
View
@@ -0,0 +1,68 @@
#' Get a filtered table of Gutenberg work metadata
#'
#' Get a table of Gutenberg work metadata that has been filtered by some common
#' (settable) defaults, along with the option to add additional filters
#' This function is for convenience when working with common conditions
#' when pulling a set of books to analyze.
#' For more detailed filtering of the entire Project Gutenberg
#' metadata, use the \link{gutenberg_metadata} and related datasets.
#'
#' @param ... Additional filters, given as expressions using the variables
#' in the \link{gutenberg_metadata} dataset (e.g. \code{author == "Austen, Jane"})
#' @param languages Vector of languages to include (note that it will not
#' return cases with multiple languages unless they are specified)
#' @param only_text Whether the works must have Gutenberg text attached. Works
#' without text (e.g. audiobooks) cannot be downloaded with
#' \code{\link{gutenberg_download}}.
#' @param rights Values to allow in the \code{rights} field. By default allows
#' public domain in the US or "None", while excluding works under copyright.
#' @param distinct Whether to return only one distinct combination of each
#' title and gutenberg_author_id. If multiple occur (that fulfill the other
#' conditions), it uses the one with the lowest ID.
#'
#' @details By default, returns
#'
#' \itemize{
#' \item{English-language works}
#' \item{That are in text format in Gutenberg (as opposed to audio)}
#' \item{Whose text is not under copyright}
#' \item{At most one distinct field for each title/author pair}
#' }
#'
#' @examples
#'
#' gutenberg_works()
#'
#' # filter conditions
#' gutenberg_works(author == "Shakespeare, William")
#'
#' # changing default options
#' gutenberg_works(rights = NULL)
#' gutenberg_works(languages = "de")
#'
#' @export
gutenberg_works <- function(..., languages = "en",
                            only_text = TRUE,
                            rights = c("Public domain in the USA.", "None"),
                            distinct = TRUE) {
  # start from the full metadata table, applying any user-supplied
  # filter expressions first (e.g. author == "Austen, Jane")
  ret <- filter(gutenberg_metadata, ...)
  if (!is.null(languages)) {
    ret <- filter(ret, language %in% languages)
  }
  if (!is.null(rights)) {
    # copy the argument so it doesn't shadow the `rights` column
    # inside the filter() expression
    .rights <- rights
    ret <- filter(ret, rights %in% .rights)
  }
  if (only_text) {
    # works without text (e.g. audiobooks) can't be downloaded
    ret <- filter(ret, has_text)
  }
  if (distinct) {
    # one row per title/author pair (first = lowest gutenberg_id).
    # distinct_() is deprecated, and without .keep_all = TRUE the
    # other metadata columns would be dropped from the result
    ret <- distinct(ret, title, gutenberg_author_id, .keep_all = TRUE)
  }
  ret
}
View
@@ -12,3 +12,32 @@ read_zip_url <- function(url) {
ret
}
#' Discard all values at the start of .x while .p is true
#'
#' @param .x Vector
#' @param .p Logical vector
discard_start_while <- function(.x, .p) {
  # position of the first element whose predicate is FALSE
  # (NA when .p is empty or entirely TRUE)
  first_keep <- which(!.p)[1]
  if (!is.na(first_keep) && .p[1]) {
    # drop the leading run of TRUEs; keep from the first FALSE onward.
    # NOTE(review): when .p is all TRUE, .x is returned unchanged
    # (nothing is discarded) — matching the original behavior.
    .x <- .x[seq.int(first_keep, length(.x))]
  }
  .x
}
#' Keep values from the start of .x while .p is true
#'
#' @param .x Vector
#' @param .p Logical vector the same length as .x
keep_while <- function(.x, .p) {
  # index of the first FALSE; NA when .p is empty or all TRUE
  first_false <- which(!.p)[1]
  if (!is.na(first_false)) {
    # keep only the elements before the first FALSE (possibly none).
    # The previous guard (.p[1] && any(.p)) had two defects: with an
    # all-TRUE .p it triggered a min(integer(0)) -> Inf warning, and
    # with .p[1] FALSE it wrongly returned the entire vector.
    .x <- head(.x, first_false - 1)
  }
  .x
}
#' Discard all values at the start of .x while .p is true
#'
#' @param .x Vector
#' @param .p Logical vector
#' Discard all values at the end of .x while .p is true
#'
#' @param .x Vector
#' @param .p Logical vector
discard_end_while <- function(.x, .p) {
  # trim the trailing run of TRUEs directly instead of reversing and
  # delegating to discard_start_while; when .p is all TRUE, .x is
  # returned unchanged (same as the original)
  if (length(.x) > 0 && .p[length(.x)] && any(!.p)) {
    .x <- head(.x, max(which(!.p)))
  }
  .x
}
View
@@ -43,8 +43,6 @@ We recommend the [tidytext](https://github.com/juliasilge/tidytext) package for
See the [data-raw](data-raw) directory for the scripts that generate these datasets. As of now, these were generated from [the Project Gutenberg catalog](https://www.gutenberg.org/wiki/Gutenberg:Feeds#The_Complete_Project_Gutenberg_Catalog) on **April 27 2016**.
There are no guarantees about how often the metadata will be updated in the package. If you're interested in works that have just been added or heavily edited on Project Gutenberg, you may want to run the scripts yourself.
#### Do you respect the rules regarding robot access to Project Gutenberg?
Yes! The package respects [these rules](https://www.gutenberg.org/wiki/Gutenberg:Information_About_Robot_Access_to_our_Pages) and complies to the best of our ability. Namely:
View
@@ -1,3 +1,5 @@
This set of scripts produces the three .rda files in the [data](../data) directory. It is only moderately reproducible, as it needs to be run in a particular order and in a few cases even hardcodes paths. Expect it to be updated and improved soon.
If you're interested in parsing and processing the Gutenberg metadata yourself, the only file you really need is [metadata.json.gz](metadata.json.gz) (written by [gitenberg_meta.py](gitenberg_meta.py)), which contains one line with a JSON dictionary for every Project Gutenberg work. The JSON dictionary was produced from the Gutenberg RDF using the [gitberg Python package](https://github.com/gitenberg-dev/gitberg).
There are no guarantees about how often the metadata will be updated in the package. If you're interested in works that have been recently added or had their metadata edited on Project Gutenberg, you may want to run the scripts yourself.
View
@@ -1,8 +1,14 @@
# you first need to run:
### Before running, you first need to run two scripts:
# First:
# python data-raw/gitenberg_metadata.py
# which creates data-raw/metadata.json.gz
# which creates data-raw/metadata.json.gz, and
# sh text_files.sh
# which creates data-raw/ids_with_text.txt
library(purrr)
library(dplyr)
@@ -17,7 +23,7 @@ gutenberg_metadata_raw <- fromJSON(str_c("[", str_c(metadata_lines, collapse = "
jsonlite::flatten() %>%
tbl_df()
# select columns, and combine some list columns into character
# select columns, and combine some list columns into "/"-delimited character
# vector columns
collapse_col <- function(x) {
@@ -27,8 +33,7 @@ collapse_col <- function(x) {
unlist()
}
ids_with_text <- read_lines("data-raw/have_text.txt") %>%
unique() %>%
ids_with_text <- read_lines("data-raw/ids_with_text.txt") %>%
extract_numeric() %>%
as.integer()
View
@@ -1,15 +1,24 @@
# requires the gitberg package
# https://github.com/gitenberg-dev/gitberg
# install with
# pip install xgitberg
### Running
# run with path to epub folder downloaded from
# https://www.gutenberg.org/wiki/Gutenberg:Feeds#The_Complete_Project_Gutenberg_Catalog
# For example:
# python gitenberg_meta.py ~/Downloads/cache/epub
import sys
import os
import json
import gzip
from gitenberg.metadata.pg_rdf import pg_rdf_to_json
# hardcoded for now
infolder = os.path.expanduser(os.path.join("~", "Downloads", "cache", "epub"))
infolder = sys.argv[1]
outfile = os.path.join("data-raw", "metadata.json.gz")
with gzip.GzipFile(outfile, "w") as outf:
Oops, something went wrong.

0 comments on commit 3443397

Please sign in to comment.