From f58016d12651b76f6df95801dc7a389fd7c27d74 Mon Sep 17 00:00:00 2001
From: Carl Boettiger <cboettig@gmail.com>
Date: Tue, 14 May 2019 21:42:13 -0400
Subject: [PATCH] Fuzzy filter (#56)

* export name_contains and starts_with

* include helper fns around fuzzy_filter

* update docs
---
 .Rbuildignore                      |  1 +
 NAMESPACE                          |  5 ++
 R/fuzzy_filter.R                   | 89 +++++++++++++++++++++++++++++-
 R/td_create.R                      |  2 +-
 man/common_contains.Rd             | 32 +++++++++++
 man/common_starts_with.Rd          | 32 +++++++++++
 man/name_contains.Rd               | 32 +++++++++++
 man/name_starts_with.Rd            | 31 +++++++++++
 notebook/globi.R                   | 70 +++++++++++------------
 tests/testthat/test-fuzzy_filter.R | 21 +++++++
 vignettes/{ => articles}/intro.Rmd |  0
 11 files changed, 276 insertions(+), 39 deletions(-)
 create mode 100644 man/common_contains.Rd
 create mode 100644 man/common_starts_with.Rd
 create mode 100644 man/name_contains.Rd
 create mode 100644 man/name_starts_with.Rd
 rename vignettes/{ => articles}/intro.Rmd (100%)

diff --git a/.Rbuildignore b/.Rbuildignore
index b2adb94..1668c14 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -20,3 +20,4 @@ pkgdown
 paper
 ^ISSUES\.md$
 drafts
+^renv$
diff --git a/NAMESPACE b/NAMESPACE
index 9701287..bc7ad30 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -5,11 +5,15 @@ export(by_id)
 export(by_name)
 export(by_rank)
 export(clean_names)
+export(common_contains)
+export(common_starts_with)
 export(filter_by)
 export(fuzzy_filter)
 export(get_ids)
 export(get_names)
 export(mutate_db)
+export(name_contains)
+export(name_starts_with)
 export(synonyms)
 export(taxa_tbl)
 export(td_connect)
@@ -21,6 +25,7 @@ importFrom(DBI,dbDisconnect)
 importFrom(DBI,dbFetch)
 importFrom(DBI,dbGetQuery)
 importFrom(DBI,dbIsValid)
+importFrom(DBI,dbListTables)
 importFrom(DBI,dbSendQuery)
 importFrom(DBI,dbWriteTable)
 importFrom(MonetDBLite,MonetDBLite)
diff --git a/R/fuzzy_filter.R b/R/fuzzy_filter.R
index 9c6041e..de5de9f 100644
--- a/R/fuzzy_filter.R
+++ b/R/fuzzy_filter.R
@@ -96,9 +96,22 @@ filter_like <- function(db_tbl, input, pattern){
 ## Consider creating functions that are explicitly named to create more semantic
 ## code, rather than relying on setting the behavior in the `by` and `match`
 ## arguments to `fuzzy_filter`, e.g. :
+
+#' return all taxa in which scientific name contains the text provided
+#'
+#' @export
+#' @examples
+#' \donttest{
+#'   \dontshow{
+#'    ## All examples use a temporary directory
+#'    Sys.setenv(TAXADB_HOME=tempdir())
+#'   }
+#' name_contains("Homo ")
+#' }
+#' @inheritParams fuzzy_filter
 name_contains <- function(name,
-                          provider,
-                          db = td_connect,
+                          provider = "itis",
+                          db = td_connect(),
                           ignore_case = TRUE){
 
   fuzzy_filter(name,
@@ -109,9 +122,22 @@ name_contains <- function(name,
                ignore_case = ignore_case)
 }
 
+
+#' scientific name starts with
+#'
+#' @examples
+#' \donttest{
+#'   \dontshow{
+#'    ## All examples use a temporary directory
+#'    Sys.setenv(TAXADB_HOME=tempdir())
+#'   }
+#' name_contains("Homo ")
+#' }
+#' @inheritParams fuzzy_filter
+#' @export
 name_starts_with <- function(name,
                              provider,
-                             db = td_connect,
+                             db = td_connect(),
                              ignore_case = TRUE){
 
   fuzzy_filter(name,
@@ -124,3 +150,60 @@ name_starts_with <- function(name,
 
 
 
+#' common name starts with
+#'
+#' @examples
+#' \donttest{
+#'   \dontshow{
+#'    ## All examples use a temporary directory
+#'    Sys.setenv(TAXADB_HOME=tempdir())
+#'   }
+#' common_starts_with("monkey")
+#' }
+#' @inheritParams fuzzy_filter
+#' @export
+common_starts_with <- function(name,
+                             provider = "itis",
+                             db = td_connect(),
+                             ignore_case = TRUE){
+
+  fuzzy_filter(name,
+               by = "vernacularName",
+               provider = provider,
+               match = "starts_with",
+               db = db,
+               ignore_case = ignore_case)
+}
+
+
+#' common name starts with
+#'
+#' @examples
+#' \donttest{
+#'   \dontshow{
+#'    ## All examples use a temporary directory
+#'    Sys.setenv(TAXADB_HOME=tempdir())
+#'   }
+#' common_contains("monkey")
+#' }
+#' @inheritParams fuzzy_filter
+#' @export
+common_contains <- function(name,
+                            provider = "itis",
+                            db = td_connect(),
+                            ignore_case = TRUE){
+
+  fuzzy_filter(name,
+               by = "vernacularName",
+               provider = provider,
+               match = "contains",
+               db = db,
+               ignore_case = ignore_case)
+}
+
+
+
+
+
+
+
diff --git a/R/td_create.R b/R/td_create.R
index 268b5a1..8ad0d86 100644
--- a/R/td_create.R
+++ b/R/td_create.R
@@ -29,7 +29,7 @@
 #' @return path where database has been installed (invisibly)
 #' @export
 #' @importFrom utils download.file
-#' @importFrom DBI dbConnect dbDisconnect
+#' @importFrom DBI dbConnect dbDisconnect dbListTables
 #' @importFrom arkdb unark streamable_readr_tsv
 #' @importFrom MonetDBLite MonetDBLite
 #' @importFrom readr cols
diff --git a/man/common_contains.Rd b/man/common_contains.Rd
new file mode 100644
index 0000000..5d7868e
--- /dev/null
+++ b/man/common_contains.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fuzzy_filter.R
+\name{common_contains}
+\alias{common_contains}
+\title{common name starts with}
+\usage{
+common_contains(name, provider = "itis", db = td_connect(),
+  ignore_case = TRUE)
+}
+\arguments{
+\item{name}{vector of names (scientific or common, see \code{by}) to be matched against.}
+
+\item{provider}{from which provider should the hierarchy be returned?
+Default is 'itis'.}
+
+\item{db}{a connection to the taxadb database. See details.}
+
+\item{ignore_case}{should we ignore case (capitalization) in matching names?
+default is \code{TRUE}.}
+}
+\description{
+common name starts with
+}
+\examples{
+\donttest{
+  \dontshow{
+   ## All examples use a temporary directory
+   Sys.setenv(TAXADB_HOME=tempdir())
+  }
+common_contains("monkey")
+}
+}
diff --git a/man/common_starts_with.Rd b/man/common_starts_with.Rd
new file mode 100644
index 0000000..2e5d4f1
--- /dev/null
+++ b/man/common_starts_with.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fuzzy_filter.R
+\name{common_starts_with}
+\alias{common_starts_with}
+\title{common name starts with}
+\usage{
+common_starts_with(name, provider = "itis", db = td_connect(),
+  ignore_case = TRUE)
+}
+\arguments{
+\item{name}{vector of names (scientific or common, see \code{by}) to be matched against.}
+
+\item{provider}{from which provider should the hierarchy be returned?
+Default is 'itis'.}
+
+\item{db}{a connection to the taxadb database. See details.}
+
+\item{ignore_case}{should we ignore case (capitalization) in matching names?
+default is \code{TRUE}.}
+}
+\description{
+common name starts with
+}
+\examples{
+\donttest{
+  \dontshow{
+   ## All examples use a temporary directory
+   Sys.setenv(TAXADB_HOME=tempdir())
+  }
+common_starts_with("monkey")
+}
+}
diff --git a/man/name_contains.Rd b/man/name_contains.Rd
new file mode 100644
index 0000000..6ceb032
--- /dev/null
+++ b/man/name_contains.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fuzzy_filter.R
+\name{name_contains}
+\alias{name_contains}
+\title{return all taxa in which scientific name contains the text provided}
+\usage{
+name_contains(name, provider = "itis", db = td_connect(),
+  ignore_case = TRUE)
+}
+\arguments{
+\item{name}{vector of names (scientific or common, see \code{by}) to be matched against.}
+
+\item{provider}{from which provider should the hierarchy be returned?
+Default is 'itis'.}
+
+\item{db}{a connection to the taxadb database. See details.}
+
+\item{ignore_case}{should we ignore case (capitalization) in matching names?
+default is \code{TRUE}.}
+}
+\description{
+return all taxa in which scientific name contains the text provided
+}
+\examples{
+\donttest{
+  \dontshow{
+   ## All examples use a temporary directory
+   Sys.setenv(TAXADB_HOME=tempdir())
+  }
+name_contains("Homo ")
+}
+}
diff --git a/man/name_starts_with.Rd b/man/name_starts_with.Rd
new file mode 100644
index 0000000..3a941ca
--- /dev/null
+++ b/man/name_starts_with.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fuzzy_filter.R
+\name{name_starts_with}
+\alias{name_starts_with}
+\title{scientific name starts with}
+\usage{
+name_starts_with(name, provider, db = td_connect(), ignore_case = TRUE)
+}
+\arguments{
+\item{name}{vector of names (scientific or common, see \code{by}) to be matched against.}
+
+\item{provider}{from which provider should the hierarchy be returned?
+Default is 'itis'.}
+
+\item{db}{a connection to the taxadb database. See details.}
+
+\item{ignore_case}{should we ignore case (capitalization) in matching names?
+default is \code{TRUE}.}
+}
+\description{
+scientific name starts with
+}
+\examples{
+\donttest{
+  \dontshow{
+   ## All examples use a temporary directory
+   Sys.setenv(TAXADB_HOME=tempdir())
+  }
+name_contains("Homo ")
+}
+}
diff --git a/notebook/globi.R b/notebook/globi.R
index 423a9ca..25f26d4 100644
--- a/notebook/globi.R
+++ b/notebook/globi.R
@@ -1,8 +1,8 @@
-# FROM: Poelen, Jorrit H. (2018). Global Biotic Interactions: Taxon Graph (Version 0.3.2) [Data set]. 
+# FROM: Poelen, Jorrit H. (2018). Global Biotic Interactions: Taxon Graph (Version 0.3.2) [Data set].
 # Zenodo. http://doi.org/10.5281/zenodo.1250572
 
 library(tidyverse)
-
+library(MonetDBLite)
 expect_none <- function(df){ testthat::expect_equal(dim(df)[[1]], 0) }
 
 
@@ -32,8 +32,8 @@ if(test){
   #taxonCache %>% filter(!grepl(":", id)) %>% expect_none()
   ## Some tests
   #taxonCache %>% filter(grepl(":", path)) %>% expect_none()  ## some paths are ids
-  taxonCache %>% filter(grepl("\\s", id))  %>% expect_none()  
-    
+  taxonCache %>% filter(grepl("\\s", id))  %>% expect_none()
+
   pattern = "\\s*\\|\\s*"
   path_pipes <- taxonCache %>% purrr::transpose() %>%
     map_int( ~length(str_split(.x$path, pattern)[[1]]))
@@ -54,8 +54,8 @@ if(test){
 ## taxonCache <- taxonCache[-trouble,]
 
 
-longform <- function(row, pattern = "\\s*\\|\\s*"){ 
-  row_as_df <- 
+longform <- function(row, pattern = "\\s*\\|\\s*"){
+  row_as_df <-
     data_frame(id = row$id,
                name = row$name,
                rank = row$rank,
@@ -65,20 +65,20 @@ longform <- function(row, pattern = "\\s*\\|\\s*"){
                commonNames = row$commonNames,
                externalUrl = row$externalUrl,
                thumbnailUrl = row$thumbnailUrl)
-               
+
 }
 
 
 # 3052673 rows.  3,052,673
 
 system.time({
-taxa <- taxonCache %>% 
-  transpose() %>% 
-  map_dfr(longform) %>% 
-  distinct() 
+taxa <- taxonCache %>%
+  transpose() %>%
+  map_dfr(longform) %>%
+  distinct()
 })
 
-## FIXME 
+## FIXME
 ## - [ ] standardize case
 ## - [x] standardize rank names
 
@@ -87,35 +87,35 @@ taxon_rank_list <- read_tsv(paste0("https://raw.githubusercontent.com/",
   "globalbioticinteractions/nomer/master/nomer/src/main/resources/org/",
   "globalbioticinteractions/nomer/match/taxon_rank_links.tsv"))
 
-rank_mapper <- taxon_rank_list %>% 
-  select(pathNames = providedName, 
-         rank_level_id = resolvedId, 
+rank_mapper <- taxon_rank_list %>%
+  select(pathNames = providedName,
+         rank_level_id = resolvedId,
          rank_level = resolvedName)
 
-globi_long <- inner_join(taxa, 
-                    rank_mapper, 
-                    copy = TRUE) %>% 
+globi_long <- inner_join(taxa,
+                    rank_mapper,
+                    copy = TRUE) %>%
   arrange(id) %>%
   select(-pathNames) %>% # drop the uncorrected names
-  select(id, 
+  select(id,
          name,
          rank,
-         path, 
-         path_id = pathIds, 
-         path_rank = rank_level, 
-         path_rank_id = rank_level_id, 
-         common_names = commonNames, 
-         external_url = externalUrl, 
+         path,
+         path_id = pathIds,
+         path_rank = rank_level,
+         path_rank_id = rank_level_id,
+         common_names = commonNames,
+         external_url = externalUrl,
          thumbnail_url = thumbnailUrl)
 
-## serious compression ~ about the same.  
+## serious compression ~ about the same.
 write_tsv(globi_long, bzfile("data/globi_long.tsv.bz2", compression=9))
 
-pre_spread <- 
-  globi_long %>% 
+pre_spread <-
+  globi_long %>%
   filter(rank == "species") %>%
-  select(id, species = name, path, path_rank) %>% 
-  distinct() 
+  select(id, species = name, path, path_rank) %>%
+  distinct()
 
 ## see debug: OTT, WORMS, NCBI, NBN & INAT contain non-unique rank names
 pre_spread <- pre_spread %>% mutate(row = 1:n())
@@ -127,7 +127,7 @@ uniques <- left_join(tmp, pre_spread, by = c("row", "id"))
 uniques %>% pull(id) %>% duplicated() %>% any() %>% testthat::expect_false()
 
 
-globi_wide <- uniques %>% spread(path_rank, path) 
+globi_wide <- uniques %>% spread(path_rank, path)
 write_tsv(globi_wide, bzfile("data/globi_hierarchy.tsv.bz2", compression=9))
 
 
@@ -137,12 +137,12 @@ write_tsv(globi_wide, bzfile("data/globi_hierarchy.tsv.bz2", compression=9))
 
 
 ## Find all cases with duplicate identifiers!
-has_duplicate_rank <- pre_spread %>% 
-  group_by(id, path_rank) %>% 
-  summarise(l = length(path)) %>% 
+has_duplicate_rank <- pre_spread %>%
+  group_by(id, path_rank) %>%
+  summarise(l = length(path)) %>%
   filter(l>1)
 
-dups <- pre_spread %>% 
+dups <- pre_spread %>%
   semi_join(select(has_duplicate_rank, id, path_rank))
 
 dups
diff --git a/tests/testthat/test-fuzzy_filter.R b/tests/testthat/test-fuzzy_filter.R
index 35ed336..7c612a9 100644
--- a/tests/testthat/test-fuzzy_filter.R
+++ b/tests/testthat/test-fuzzy_filter.R
@@ -15,4 +15,25 @@ test_that("we can fuzzy match scientific and common names", {
 
 
 
+})
+
+test_that("we can fuzzy match scientific and common names", {
+
+  name <- c("woodpecker", "monkey")
+  df <- common_contains(name, "itis")
+  expect_is(df, "data.frame")
+  expect_gt(dim(df)[1], 1)
+  df <- common_starts_with(name, "itis")
+  expect_is(df, "data.frame")
+
+
+  df <- name_starts_with("Homo ", "itis",
+                     ignore_case = FALSE)
+  expect_is(df, "data.frame")
+  expect_gt(dim(df)[1], 1)
+  df <- name_contains("Homo", "itis")
+  expect_is(df, "data.frame")
+  expect_gt(dim(df)[1], 1)
+
+
 })
diff --git a/vignettes/intro.Rmd b/vignettes/articles/intro.Rmd
similarity index 100%
rename from vignettes/intro.Rmd
rename to vignettes/articles/intro.Rmd