> library(quanteda)
Package version: 4.0.2
Unicode version: 15.0
ICU version: 73.2
Parallel computing: disabled
See https://quanteda.io for tutorials and examples.
> d <- dget("dput-frame.txt")
> x <- corpus(d, docid_field = "filename", text_field = "content")
> y <- dfm(tokens(x))
Error in validObject(.Object) :
invalid class "dfm" object: first element of 'p' slot is not 0
> traceback()
12: stop(msg, ": ", errors, domain = NA)
11: validObject(.Object)
10: .nextMethod(.Object = .Object, ... = ...)
9: callNextMethod()
8: initialize(value, ...)
7: initialize(value, ...)
6: new("dfm", as(as(as(x, "CsparseMatrix"), "generalMatrix"), "dMatrix"),
docvars = docvars, meta = make_meta("dfm", inherit = meta,
...))
5: build_dfm(temp, colnames(temp), docvars = get_docvars(x, user = TRUE,
system = TRUE), meta = attrs[["meta"]])
4: dfm.tokens_xptr(as.tokens_xptr(x), tolower = tolower, remove_padding = remove_padding,
verbose = verbose, ...)
3: dfm(as.tokens_xptr(x), tolower = tolower, remove_padding = remove_padding,
verbose = verbose, ...)
2: dfm.tokens(tokens(x))
1: dfm(tokens(x))
> debugonce(quanteda:::dfm.tokens_xptr)
> y <- dfm(tokens(x))
debugging in: dfm.tokens_xptr(as.tokens_xptr(x), tolower = tolower, remove_padding = remove_padding,
verbose = verbose, ...)
debug: {
if (is.null(global$object_class)) {
global$object_class <- class(x)[1]
global$proc_time <- proc.time()
}
check_dots(...)
if (verbose)
catm("Creating a dfm from a", global$object_class, "object...\n")
x <- as.tokens_xptr(x)
if (tolower)
x <- tokens_tolower(x)
if (remove_padding)
x <- tokens_remove(x, "", valuetype = "fixed")
attrs <- attributes(x)
temp <- t(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
result <- build_dfm(temp, colnames(temp), docvars = get_docvars(x,
user = TRUE, system = TRUE), meta = attrs[["meta"]])
if (verbose) {
catm(" ...complete, elapsed time:", format((proc.time() -
global$proc_time)[3], digits = 3), "seconds.\n")
catm("Finished constructing a", paste(format(dim(result),
big.mark = ",", trim = TRUE), collapse = " x "),
"sparse dfm.\n")
}
global$object_class <- NULL
return(result)
}
Browse[1]> n
debug: if (is.null(global$object_class)) {
global$object_class <- class(x)[1]
global$proc_time <- proc.time()
}
Browse[1]> n
debug: check_dots(...)
Browse[1]> n
debug: if (verbose) catm("Creating a dfm from a", global$object_class,
"object...\n")
Browse[1]> n
debug: x <- as.tokens_xptr(x)
Browse[1]> n
debug: if (tolower) x <- tokens_tolower(x)
Browse[1]> n
debug: x <- tokens_tolower(x)
Browse[1]> n
debug: if (remove_padding) x <- tokens_remove(x, "", valuetype = "fixed")
Browse[1]> n
debug: attrs <- attributes(x)
Browse[1]> n
debug: temp <- t(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
Browse[1]> validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
Error in validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary")) :
invalid class "dgCMatrix" object: first differences of 'p' slot exceed Dim[1]
Browse[1]> validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
[1] TRUE
Browse[1]> validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
[1] TRUE
Browse[1]>
> sessionInfo()
R Under development (unstable) (2024-08-07 r86984)
Platform: aarch64-apple-darwin22.6.0
Running under: macOS Ventura 13.6.7
Matrix products: default
BLAS: /usr/local/lib/R/lib/libRblas.dylib
LAPACK: /usr/local/lib/R/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] C/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
time zone: America/Toronto
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] quanteda_4.0.2
loaded via a namespace (and not attached):
[1] compiler_4.5.0 magrittr_2.0.3 Matrix_1.8-0 cli_3.6.3
[5] tools_4.5.0 fastmatch_1.1-4 Rcpp_1.0.13 stringi_1.8.4
[9] grid_4.5.0 stopwords_2.3 lifecycle_1.0.4 rlang_1.1.4
[13] lattice_0.22-6
## Describe the bug
`cpp_dfm` returns an invalid `dgCMatrix` inside of `dfm.tokens_xptr`. In a debugger, evaluating the call to `cpp_dfm` a second time, interactively, gives a different (this time valid) object. So the return value can be not only invalid but also non-deterministic.

## Reproducible code
(Original, non-minimal example is here: https://stackoverflow.com/q/78841310/12685768)
dput-frame.txt
## System information