From f58016d12651b76f6df95801dc7a389fd7c27d74 Mon Sep 17 00:00:00 2001 From: Carl Boettiger Date: Tue, 14 May 2019 21:42:13 -0400 Subject: [PATCH] Fuzzy filter (#56) * export name_contains and starts_with * include helper fns around fuzzy_filter * update docs --- .Rbuildignore | 1 + NAMESPACE | 5 ++ R/fuzzy_filter.R | 89 +++++++++++++++++++++++++++++- R/td_create.R | 2 +- man/common_contains.Rd | 32 +++++++++++ man/common_starts_with.Rd | 32 +++++++++++ man/name_contains.Rd | 32 +++++++++++ man/name_starts_with.Rd | 31 +++++++++++ notebook/globi.R | 70 +++++++++++------------ tests/testthat/test-fuzzy_filter.R | 21 +++++++ vignettes/{ => articles}/intro.Rmd | 0 11 files changed, 276 insertions(+), 39 deletions(-) create mode 100644 man/common_contains.Rd create mode 100644 man/common_starts_with.Rd create mode 100644 man/name_contains.Rd create mode 100644 man/name_starts_with.Rd rename vignettes/{ => articles}/intro.Rmd (100%) diff --git a/.Rbuildignore b/.Rbuildignore index b2adb94..1668c14 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,3 +20,4 @@ pkgdown paper ^ISSUES\.md$ drafts +^renv$ diff --git a/NAMESPACE b/NAMESPACE index 9701287..bc7ad30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,11 +5,15 @@ export(by_id) export(by_name) export(by_rank) export(clean_names) +export(common_contains) +export(common_starts_with) export(filter_by) export(fuzzy_filter) export(get_ids) export(get_names) export(mutate_db) +export(name_contains) +export(name_starts_with) export(synonyms) export(taxa_tbl) export(td_connect) @@ -21,6 +25,7 @@ importFrom(DBI,dbDisconnect) importFrom(DBI,dbFetch) importFrom(DBI,dbGetQuery) importFrom(DBI,dbIsValid) +importFrom(DBI,dbListTables) importFrom(DBI,dbSendQuery) importFrom(DBI,dbWriteTable) importFrom(MonetDBLite,MonetDBLite) diff --git a/R/fuzzy_filter.R b/R/fuzzy_filter.R index 9c6041e..de5de9f 100644 --- a/R/fuzzy_filter.R +++ b/R/fuzzy_filter.R @@ -96,9 +96,22 @@ filter_like <- function(db_tbl, input, pattern){ ## Consider creating functions that are explicitly named to create more semantic ## code, rather than relying on setting the behavior in the `by` and `match` ## arguments to `fuzzy_filter`, e.g. : + +#' return all taxa in which scientific name contains the text provided +#' +#' @export +#' @examples +#' \donttest{ +#' \dontshow{ +#' ## All examples use a temporary directory +#' Sys.setenv(TAXADB_HOME=tempdir()) +#' } +#' name_contains("Homo ") +#' } +#' @inheritParams fuzzy_filter name_contains <- function(name, - provider, - db = td_connect, + provider = "itis", + db = td_connect(), ignore_case = TRUE){ fuzzy_filter(name, @@ -109,9 +122,22 @@ name_contains <- function(name, ignore_case = ignore_case) } + +#' scientific name starts with +#' +#' @examples +#' \donttest{ +#' \dontshow{ +#' ## All examples use a temporary directory +#' Sys.setenv(TAXADB_HOME=tempdir()) +#' } +#' name_contains("Homo ") +#' } +#' @inheritParams fuzzy_filter +#' @export name_starts_with <- function(name, provider, - db = td_connect, + db = td_connect(), ignore_case = TRUE){ fuzzy_filter(name, @@ -124,3 +150,60 @@ name_starts_with <- function(name, +#' common name starts with +#' +#' @examples +#' \donttest{ +#' \dontshow{ +#' ## All examples use a temporary directory +#' Sys.setenv(TAXADB_HOME=tempdir()) +#' } +#' common_starts_with("monkey") +#' } +#' @inheritParams fuzzy_filter +#' @export +common_starts_with <- function(name, + provider = "itis", + db = td_connect(), + ignore_case = TRUE){ + + fuzzy_filter(name, + by = "vernacularName", + provider = provider, + match = "starts_with", + db = db, + ignore_case = ignore_case) +} + + +#' common name starts with +#' +#' @examples +#' \donttest{ +#' \dontshow{ +#' ## All examples use a temporary directory +#' Sys.setenv(TAXADB_HOME=tempdir()) +#' } +#' common_contains("monkey") +#' } +#' @inheritParams fuzzy_filter +#' @export +common_contains <- function(name, + provider = "itis", + db = td_connect(), + ignore_case = TRUE){ + + fuzzy_filter(name, + by = "vernacularName", + provider = provider, + match = "contains", + db = db, + ignore_case = ignore_case) +} + + + + + + + diff --git a/R/td_create.R b/R/td_create.R index 268b5a1..8ad0d86 100644 --- a/R/td_create.R +++ b/R/td_create.R @@ -29,7 +29,7 @@ #' @return path where database has been installed (invisibly) #' @export #' @importFrom utils download.file -#' @importFrom DBI dbConnect dbDisconnect +#' @importFrom DBI dbConnect dbDisconnect dbListTables #' @importFrom arkdb unark streamable_readr_tsv #' @importFrom MonetDBLite MonetDBLite #' @importFrom readr cols diff --git a/man/common_contains.Rd b/man/common_contains.Rd new file mode 100644 index 0000000..5d7868e --- /dev/null +++ b/man/common_contains.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fuzzy_filter.R +\name{common_contains} +\alias{common_contains} +\title{common name starts with} +\usage{ +common_contains(name, provider = "itis", db = td_connect(), + ignore_case = TRUE) +} +\arguments{ +\item{name}{vector of names (scientific or common, see \code{by}) to be matched against.} + +\item{provider}{from which provider should the hierarchy be returned? +Default is 'itis'.} + +\item{db}{a connection to the taxadb database. See details.} + +\item{ignore_case}{should we ignore case (capitalization) in matching names? +default is \code{TRUE}.} +} +\description{ +common name starts with +} +\examples{ +\donttest{ + \dontshow{ + ## All examples use a temporary directory + Sys.setenv(TAXADB_HOME=tempdir()) + } +common_contains("monkey") +} +} diff --git a/man/common_starts_with.Rd b/man/common_starts_with.Rd new file mode 100644 index 0000000..2e5d4f1 --- /dev/null +++ b/man/common_starts_with.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fuzzy_filter.R +\name{common_starts_with} +\alias{common_starts_with} +\title{common name starts with} +\usage{ +common_starts_with(name, provider = "itis", db = td_connect(), + ignore_case = TRUE) +} +\arguments{ +\item{name}{vector of names (scientific or common, see \code{by}) to be matched against.} + +\item{provider}{from which provider should the hierarchy be returned? +Default is 'itis'.} + +\item{db}{a connection to the taxadb database. See details.} + +\item{ignore_case}{should we ignore case (capitalization) in matching names? +default is \code{TRUE}.} +} +\description{ +common name starts with +} +\examples{ +\donttest{ + \dontshow{ + ## All examples use a temporary directory + Sys.setenv(TAXADB_HOME=tempdir()) + } +common_starts_with("monkey") +} +} diff --git a/man/name_contains.Rd b/man/name_contains.Rd new file mode 100644 index 0000000..6ceb032 --- /dev/null +++ b/man/name_contains.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fuzzy_filter.R +\name{name_contains} +\alias{name_contains} +\title{return all taxa in which scientific name contains the text provided} +\usage{ +name_contains(name, provider = "itis", db = td_connect(), + ignore_case = TRUE) +} +\arguments{ +\item{name}{vector of names (scientific or common, see \code{by}) to be matched against.} + +\item{provider}{from which provider should the hierarchy be returned? +Default is 'itis'.} + +\item{db}{a connection to the taxadb database. See details.} + +\item{ignore_case}{should we ignore case (capitalization) in matching names? +default is \code{TRUE}.} +} +\description{ +return all taxa in which scientific name contains the text provided +} +\examples{ +\donttest{ + \dontshow{ + ## All examples use a temporary directory + Sys.setenv(TAXADB_HOME=tempdir()) + } +name_contains("Homo ") +} +} diff --git a/man/name_starts_with.Rd b/man/name_starts_with.Rd new file mode 100644 index 0000000..3a941ca --- /dev/null +++ b/man/name_starts_with.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fuzzy_filter.R +\name{name_starts_with} +\alias{name_starts_with} +\title{scientific name starts with} +\usage{ +name_starts_with(name, provider, db = td_connect(), ignore_case = TRUE) +} +\arguments{ +\item{name}{vector of names (scientific or common, see \code{by}) to be matched against.} + +\item{provider}{from which provider should the hierarchy be returned? +Default is 'itis'.} + +\item{db}{a connection to the taxadb database. See details.} + +\item{ignore_case}{should we ignore case (capitalization) in matching names? +default is \code{TRUE}.} +} +\description{ +scientific name starts with +} +\examples{ +\donttest{ + \dontshow{ + ## All examples use a temporary directory + Sys.setenv(TAXADB_HOME=tempdir()) + } +name_contains("Homo ") +} +} diff --git a/notebook/globi.R b/notebook/globi.R index 423a9ca..25f26d4 100644 --- a/notebook/globi.R +++ b/notebook/globi.R @@ -1,8 +1,8 @@ -# FROM: Poelen, Jorrit H. (2018). Global Biotic Interactions: Taxon Graph (Version 0.3.2) [Data set]. +# FROM: Poelen, Jorrit H. (2018). Global Biotic Interactions: Taxon Graph (Version 0.3.2) [Data set]. # Zenodo. http://doi.org/10.5281/zenodo.1250572 library(tidyverse) - +library(MonetDBLite) expect_none <- function(df){ testthat::expect_equal(dim(df)[[1]], 0) } @@ -32,8 +32,8 @@ if(test){ #taxonCache %>% filter(!grepl(":", id)) %>% expect_none() ## Some tests #taxonCache %>% filter(grepl(":", path)) %>% expect_none() ## some paths are ids - taxonCache %>% filter(grepl("\\s", id)) %>% expect_none() - + taxonCache %>% filter(grepl("\\s", id)) %>% expect_none() + pattern = "\\s*\\|\\s*" path_pipes <- taxonCache %>% purrr::transpose() %>% map_int( ~length(str_split(.x$path, pattern)[[1]])) @@ -54,8 +54,8 @@ if(test){ ## taxonCache <- taxonCache[-trouble,] -longform <- function(row, pattern = "\\s*\\|\\s*"){ - row_as_df <- +longform <- function(row, pattern = "\\s*\\|\\s*"){ + row_as_df <- data_frame(id = row$id, name = row$name, rank = row$rank, @@ -65,20 +65,20 @@ longform <- function(row, pattern = "\\s*\\|\\s*"){ commonNames = row$commonNames, externalUrl = row$externalUrl, thumbnailUrl = row$thumbnailUrl) - + } # 3052673 rows. 3,052,673 system.time({ -taxa <- taxonCache %>% - transpose() %>% - map_dfr(longform) %>% - distinct() +taxa <- taxonCache %>% + transpose() %>% + map_dfr(longform) %>% + distinct() }) -## FIXME +## FIXME ## - [ ] standardize case ## - [x] standardize rank names @@ -87,35 +87,35 @@ taxon_rank_list <- read_tsv(paste0("https://raw.githubusercontent.com/", "globalbioticinteractions/nomer/master/nomer/src/main/resources/org/", "globalbioticinteractions/nomer/match/taxon_rank_links.tsv")) -rank_mapper <- taxon_rank_list %>% - select(pathNames = providedName, - rank_level_id = resolvedId, +rank_mapper <- taxon_rank_list %>% + select(pathNames = providedName, + rank_level_id = resolvedId, rank_level = resolvedName) -globi_long <- inner_join(taxa, - rank_mapper, - copy = TRUE) %>% +globi_long <- inner_join(taxa, + rank_mapper, + copy = TRUE) %>% arrange(id) %>% select(-pathNames) %>% # drop the uncorrected names - select(id, + select(id, name, rank, - path, - path_id = pathIds, - path_rank = rank_level, - path_rank_id = rank_level_id, - common_names = commonNames, - external_url = externalUrl, + path, + path_id = pathIds, + path_rank = rank_level, + path_rank_id = rank_level_id, + common_names = commonNames, + external_url = externalUrl, thumbnail_url = thumbnailUrl) -## serious compression ~ about the same. +## serious compression ~ about the same. write_tsv(globi_long, bzfile("data/globi_long.tsv.bz2", compression=9)) -pre_spread <- - globi_long %>% +pre_spread <- + globi_long %>% filter(rank == "species") %>% - select(id, species = name, path, path_rank) %>% - distinct() + select(id, species = name, path, path_rank) %>% + distinct() ## see debug: OTT, WORMS, NCBI, NBN & INAT contain non-unique rank names pre_spread <- pre_spread %>% mutate(row = 1:n()) @@ -127,7 +127,7 @@ uniques <- left_join(tmp, pre_spread, by = c("row", "id")) uniques %>% pull(id) %>% duplicated() %>% any() %>% testthat::expect_false() -globi_wide <- uniques %>% spread(path_rank, path) +globi_wide <- uniques %>% spread(path_rank, path) write_tsv(globi_wide, bzfile("data/globi_hierarchy.tsv.bz2", compression=9)) @@ -137,12 +137,12 @@ write_tsv(globi_wide, bzfile("data/globi_hierarchy.tsv.bz2", compression=9)) ## Find all cases with duplicate identifiers! -has_duplicate_rank <- pre_spread %>% - group_by(id, path_rank) %>% - summarise(l = length(path)) %>% +has_duplicate_rank <- pre_spread %>% + group_by(id, path_rank) %>% + summarise(l = length(path)) %>% filter(l>1) -dups <- pre_spread %>% +dups <- pre_spread %>% semi_join(select(has_duplicate_rank, id, path_rank)) dups diff --git a/tests/testthat/test-fuzzy_filter.R b/tests/testthat/test-fuzzy_filter.R index 35ed336..7c612a9 100644 --- a/tests/testthat/test-fuzzy_filter.R +++ b/tests/testthat/test-fuzzy_filter.R @@ -15,4 +15,25 @@ test_that("we can fuzzy match scientific and common names", { +}) + +test_that("we can fuzzy match scientific and common names", { + + name <- c("woodpecker", "monkey") + df <- common_contains(name, "itis") + expect_is(df, "data.frame") + expect_gt(dim(df)[1], 1) + df <- common_starts_with(name, "itis") + expect_is(df, "data.frame") + + + df <- name_starts_with("Homo ", "itis", + ignore_case = FALSE) + expect_is(df, "data.frame") + expect_gt(dim(df)[1], 1) + df <- name_contains("Homo", "itis") + expect_is(df, "data.frame") + expect_gt(dim(df)[1], 1) + + }) diff --git a/vignettes/intro.Rmd b/vignettes/articles/intro.Rmd similarity index 100% rename from vignettes/intro.Rmd rename to vignettes/articles/intro.Rmd