Skip to content

Commit

Permalink
Don't worry about stemming whitespace in tokens or features
Browse files Browse the repository at this point in the history
  • Loading branch information
kbenoit committed Nov 9, 2021
1 parent 7819d55 commit 4759cd0
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 15 deletions.
8 changes: 4 additions & 4 deletions R/wordstem.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ tokens_wordstem.tokens <- function(x, language = quanteda_options("language_stem
x <- as.tokens(x)
attrs <- attributes(x)
if (identical(field_object(attrs, "ngram"), 1L)) {
- types(x) <- char_wordstem(types(x), language = language)
+ types(x) <- char_wordstem(types(x), language = language, check_whitespace = FALSE)
} else {
types(x) <- wordstem_ngrams(
types(x),
Expand All @@ -64,13 +64,13 @@ tokens_wordstem.tokens <- function(x, language = quanteda_options("language_stem
#' char_wordstem(c("win", "winning", "wins", "won", "winner"))
#'
char_wordstem <- function(x, language = quanteda_options("language_stemmer"),
check_whitespace = TRUE) {
check_whitespace = TRUE) {
UseMethod("char_wordstem")
}

#' @export
char_wordstem.default <- function(x, language = quanteda_options("language_stemmer"),
check_whitespace = TRUE) {
check_whitespace = TRUE) {
check_class(class(x), "char_wordstem")
}

Expand Down Expand Up @@ -112,7 +112,7 @@ dfm_wordstem.dfm <- function(x, language = quanteda_options("language_stemmer"))
x <- as.dfm(x)
attrs <- attributes(x)
if (identical(field_object(attrs, "ngram"), 1L)) {
- set_dfm_featnames(x) <- char_wordstem(featnames(x), language = language)
+ set_dfm_featnames(x) <- char_wordstem(featnames(x), language = language, check_whitespace = FALSE)
} else {
set_dfm_featnames(x) <- wordstem_ngrams(
featnames(x),
Expand Down
12 changes: 1 addition & 11 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ACL
Benoit
Biden
Biden's
Bormuth
Expand All @@ -14,9 +13,7 @@ Datatables
DocumentTermMatrix
Docvars
Dáil
dist
ERC
Fellows's
fileset
Haass
Herdan's
Expand All @@ -42,8 +39,6 @@ Raghavan
Rcpp
Rdata
Rprofile
Rtools
simil
STM
SnowballC
Soroka
Expand Down Expand Up @@ -84,7 +79,6 @@ ci
cleanC
codecov
color
colors
com
compoundWords
concatenator
Expand All @@ -105,7 +99,6 @@ docfreq
docname
docnames
docid
docids
docvar
docvars
doi
Expand All @@ -120,15 +113,13 @@ etc
exampleString
fastmatch
fcm
fcm's
featnames
fitmodel
formals
fromJSON
gcc
getStemLanguages
getTweets
gfortran
github
ggplot
httr
Expand All @@ -150,6 +141,7 @@ lda
lexdiv
libicu
libstemmer
lintr
linux
lsa
macOS
Expand Down Expand Up @@ -179,9 +171,7 @@ phrasetotokens
pkgdown
pmi
proxyC
quanteda
quantedaData
quantedaverse
rOpenSci
readLIWCdict
readWStatDict
Expand Down

0 comments on commit 4759cd0

Please sign in to comment.