Skip to content

Commit

Permalink
Address #1447
Browse files Browse the repository at this point in the history
- Revert change in ad359e6.
- Fix handling of new line markers.
- Update documentation as well.
- Correct tests.
  • Loading branch information
koheiw committed Oct 22, 2018
1 parent e2d8005 commit 8e206f9
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 23 deletions.
14 changes: 5 additions & 9 deletions R/tokens.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#' slowest, word tokenization method; see
#' \link[stringi]{stringi-search-boundaries} for details.}
#' \item{\code{"fasterword"}}{dumber, but faster, word tokenization method,
#' uses \code{{\link[stringi]{stri_split_charclass}(x, "\\\\p{WHITE_SPACE}")}}}
#' uses \code{{\link[stringi]{stri_split_regex}(x, "[\\\\p{Z}\\\\p{C}]+")}}}
#' \item{\code{"fastestword"}}{dumbest, but fastest, word tokenization method,
#' calls \code{\link[stringi]{stri_split_fixed}(x, " ")}}
#' \item{\code{"character"}}{tokenization into individual characters}
Expand Down Expand Up @@ -664,14 +664,10 @@ tokens_word <- function(txt,
remove_separators = TRUE,
verbose = FALSE){

if (what=="fastestword") {
if (what == "fastestword") {
tok <- stri_split_fixed(txt, " ")
} else if (what=="fasterword") {
tok <- if (remove_separators) {
stri_split_regex(txt, "\\p{WHITE_SPACE}+")
} else {
stri_split_regex(txt, "\\p{Z}+")
}
} else if (what == "fasterword") {
tok <- stri_split_regex(txt, "[\\p{Z}\\p{C}]+")
} else {
txt <- stri_replace_all_regex(txt, "[\uFE00-\uFE0F]", '') # remove variant selector
txt <- stri_replace_all_regex(txt, "\\s[\u0300-\u036F]", '') # remove whitespace with diacritical marks
Expand Down Expand Up @@ -726,7 +722,7 @@ tokens_sentence <- function(txt, verbose = FALSE){
x <- stri_trim_right(x) # trim trailing spaces
x <- stri_replace_all_fixed(x, "_pd_", ".") # replace the non-full-stop "." characters
return(x)
} )
})

return(tok)
}
Expand Down
2 changes: 1 addition & 1 deletion man/tokens.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 10 additions & 13 deletions tests/testthat/test-tokens.R
Original file line number Diff line number Diff line change
Expand Up @@ -466,14 +466,6 @@ test_that("assignment operators are disabled for tokens object", {
expect_error(toks[1] <- list(c(6, 100, 'z')), 'assignment to tokens objects is not allowed')
})

test_that("what = 'fasterword' works correctly", {
txt <- "\n \t word"
expect_equal(as.list(tokens(txt, what = "fasterword", remove_separators = TRUE))[[1]],
"word")
expect_equal(as.list(tokens(txt, what = "fasterword", remove_separators = FALSE))[[1]],
c("\n", "\t", "word"))
})

test_that("empty tokens are removed correctly", {
txt <- 'a b c d e '
tok <- c('a', 'b', 'c', 'd', 'e')
Expand Down Expand Up @@ -517,17 +509,23 @@ test_that("combined tokens objects have all the attributes", {

})

test_that("tokens fasterword handles newlines correctly (#1420)", {
test_that("tokens fasterword handles newlines correctly (#1447)", {
expect_identical(
as.list(tokens("one\ntwo\tthree", what = "fastestword", remove_separators = TRUE)),
list(text1 = c("one\ntwo\tthree"))
)
expect_identical(
as.list(tokens("one\ntwo\tthree", what = "fastestword", remove_separators = FALSE)),
list(text1 = c("one\ntwo\tthree"))
)
expect_identical(
as.list(tokens("one\ntwo\tthree", what = "fasterword", remove_separators = TRUE)),
list(text1 = c("one", "two", "three"))
)
expect_identical(
as.list(tokens("one\ntwo\tthree", what = "fasterword", remove_separators = FALSE)),
list(text1 = c("one\ntwo\tthree"))
list(text1 = c("one", "two", "three"))
)

# with "word" (behaviour is different)
expect_identical(
as.list(tokens("one\ntwo\tthree", what = "word", remove_separators = TRUE)),
list(text1 = c("one", "two", "three"))
Expand All @@ -536,5 +534,4 @@ test_that("tokens fasterword handles newlines correctly (#1420)", {
as.list(tokens("one\ntwo\tthree", what = "word", remove_separators = FALSE)),
list(text1 = c("one", "\n", "two", "\t", "three"))
)

})

0 comments on commit 8e206f9

Please sign in to comment.