Skip to content

Return value of cpp_dfm can be invalid, non-deterministic #2407

@jaganmn

Description

@jaganmn

Describe the bug

`cpp_dfm` returns an invalid `dgCMatrix` when it is called inside `dfm.tokens_xptr`. In a debugger, evaluating the same call to `cpp_dfm` a second time, interactively, gives a different (this time valid) object. So the return value is not only sometimes invalid but also non-deterministic.

Reproducible code

(Original, non-minimal example is here: https://stackoverflow.com/q/78841310/12685768)

library(quanteda)
d <- dget("dput-frame.txt")
x <- corpus(d, docid_field = "filename", text_field = "content")
y <- dfm(tokens(x))

dput-frame.txt

> library(quanteda)
Package version: 4.0.2
Unicode version: 15.0
ICU version: 73.2
Parallel computing: disabled
See https://quanteda.io for tutorials and examples.
> d <- dget("dput-frame.txt")
> x <- corpus(d, docid_field = "filename", text_field = "content")
> y <- dfm(tokens(x))
Error in validObject(.Object) : 
  invalid class "dfm" object: first element of 'p' slot is not 0
> traceback()
12: stop(msg, ": ", errors, domain = NA)
11: validObject(.Object)
10: .nextMethod(.Object = .Object, ... = ...)
9: callNextMethod()
8: initialize(value, ...)
7: initialize(value, ...)
6: new("dfm", as(as(as(x, "CsparseMatrix"), "generalMatrix"), "dMatrix"), 
       docvars = docvars, meta = make_meta("dfm", inherit = meta, 
           ...))
5: build_dfm(temp, colnames(temp), docvars = get_docvars(x, user = TRUE, 
       system = TRUE), meta = attrs[["meta"]])
4: dfm.tokens_xptr(as.tokens_xptr(x), tolower = tolower, remove_padding = remove_padding, 
       verbose = verbose, ...)
3: dfm(as.tokens_xptr(x), tolower = tolower, remove_padding = remove_padding, 
       verbose = verbose, ...)
2: dfm.tokens(tokens(x))
1: dfm(tokens(x))
> debugonce(quanteda:::dfm.tokens_xptr)
> y <- dfm(tokens(x))
debugging in: dfm.tokens_xptr(as.tokens_xptr(x), tolower = tolower, remove_padding = remove_padding, 
    verbose = verbose, ...)
debug: {
    if (is.null(global$object_class)) {
        global$object_class <- class(x)[1]
        global$proc_time <- proc.time()
    }
    check_dots(...)
    if (verbose) 
        catm("Creating a dfm from a", global$object_class, "object...\n")
    x <- as.tokens_xptr(x)
    if (tolower) 
        x <- tokens_tolower(x)
    if (remove_padding) 
        x <- tokens_remove(x, "", valuetype = "fixed")
    attrs <- attributes(x)
    temp <- t(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
    result <- build_dfm(temp, colnames(temp), docvars = get_docvars(x, 
        user = TRUE, system = TRUE), meta = attrs[["meta"]])
    if (verbose) {
        catm(" ...complete, elapsed time:", format((proc.time() - 
            global$proc_time)[3], digits = 3), "seconds.\n")
        catm("Finished constructing a", paste(format(dim(result), 
            big.mark = ",", trim = TRUE), collapse = " x "), 
            "sparse dfm.\n")
    }
    global$object_class <- NULL
    return(result)
}
Browse[1]> n
debug: if (is.null(global$object_class)) {
    global$object_class <- class(x)[1]
    global$proc_time <- proc.time()
}
Browse[1]> n
debug: check_dots(...)
Browse[1]> n
debug: if (verbose) catm("Creating a dfm from a", global$object_class, 
    "object...\n")
Browse[1]> n
debug: x <- as.tokens_xptr(x)
Browse[1]> n
debug: if (tolower) x <- tokens_tolower(x)
Browse[1]> n
debug: x <- tokens_tolower(x)
Browse[1]> n
debug: if (remove_padding) x <- tokens_remove(x, "", valuetype = "fixed")
Browse[1]> n
debug: attrs <- attributes(x)
Browse[1]> n
debug: temp <- t(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
Browse[1]> validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
Error in validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary")) : 
  invalid class "dgCMatrix" object: first differences of 'p' slot exceed Dim[1]
Browse[1]> validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
[1] TRUE
Browse[1]> validObject(cpp_dfm(x, attrs$meta$object$what == "dictionary"))
[1] TRUE
Browse[1]>

System information

> sessionInfo()
R Under development (unstable) (2024-08-07 r86984)
Platform: aarch64-apple-darwin22.6.0
Running under: macOS Ventura 13.6.7

Matrix products: default
BLAS:   /usr/local/lib/R/lib/libRblas.dylib 
LAPACK: /usr/local/lib/R/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] C/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8

time zone: America/Toronto
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] quanteda_4.0.2

loaded via a namespace (and not attached):
 [1] compiler_4.5.0  magrittr_2.0.3  Matrix_1.8-0    cli_3.6.3      
 [5] tools_4.5.0     fastmatch_1.1-4 Rcpp_1.0.13     stringi_1.8.4  
 [9] grid_4.5.0      stopwords_2.3   lifecycle_1.0.4 rlang_1.1.4    
[13] lattice_0.22-6

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions