Possibly convert tidy_dist_() functions to use `data.table` as the underlying generator #350

spsanderson · 2023-10-11T17:35:46Z

Function Example:

tidy_bernoulli <- function(.n = 50, .prob = 0.1, .num_sims = 1, .return_tibble = TRUE) {
  
  # Arguments
  n <- as.integer(.n)
  num_sims <- as.integer(.num_sims)
  pr <- as.numeric(.prob)
  ret_tbl <- as.logical(.return_tibble)
  
  # Checks ----
  if (!is.integer(n) | n < 0) {
    rlang::abort(
      message = "The parameters '.n' must be of class integer. Please pass a whole
            number like 50 or 100. It must be greater than 0.",
      use_cli_format = TRUE
    )
  }
  
  if (!is.integer(num_sims) | num_sims < 0) {
    rlang::abort(
      message = "The parameter `.num_sims' must be of class integer. Please pass a
            whole number like 50 or 100. It must be greater than 0.",
      use_cli_format = TRUE
    )
  }
  
  if (pr < 0 | pr > 1) {
    rlang::abort(
      message = "The '.prob' parameter must have an argument between 0 and 1 inclusive",
      use_cli_format = TRUE
    )
  }
  
  # Create a data.table with one row per simulation
  df <- data.table::CJ(sim_number = factor(1:num_sims), x = 1:n)
  
  # Group the data by sim_number and add columns for x and y
  df[, y := stats::rbinom(n = .N, size = 1, prob = pr)]
  
  # Compute the density of the y values and add columns for dx and dy
  df[, c("dx", "dy") := density(y, n = n)[c("x", "y")], by = sim_number]
  
  # Compute the p-values for the y values and add a column for p
  df[, p := stats::pbinom(y, size = 1, prob = pr)]
  
  # Compute the q-values for the p-values and add a column for q
  df[, q := stats::qbinom(p, size = 1, prob = pr)]
  
  if(.return_tibble){
    df <- dplyr::as_tibble(df)
  } else {
    data.table::setkey(df, NULL)
  }
  
  param_grid <- dplyr::tibble(pr)
  
  # Attach descriptive attributes to tibble
  attr(df, "distribution_family_type") <- "discrete"
  attr(df, ".prob") <- .prob
  attr(df, ".n") <- .n
  attr(df, ".num_sims") <- .num_sims
  attr(df, ".ret_tbl") <- .return_tibble
  attr(df, "tibble_type") <- "tidy_bernoulli"
  attr(df, "param_grid") <- param_grid
  attr(df, "param_grid_txt") <- paste0(
    "c(",
    paste(param_grid[, names(param_grid)], collapse = ", "),
    ")"
  )
  attr(df, "dist_with_params") <- paste0(
    "Bernoulli",
    " ",
    paste0(
      "c(",
      paste(param_grid[, names(param_grid)], collapse = ", "),
      ")"
    )
  )
  
  return(df)
}

Example Output

> system.time(bernoulli(2, 10000, 0.5, FALSE))
   user  system elapsed 
   0.05    0.00    0.04 
> system.time(TidyDensity::tidy_bernoulli(10000, 0.5, 2))
   user  system elapsed 
   0.12    0.02    0.14 

> util_bernoulli_stats_tbl(b) |> glimpse()
Rows: 1
Columns: 18
$ tidy_function      <chr> "tidy_bernoulli"
$ function_call      <chr> "Bernoulli c(0.5)"
$ distribution       <chr> "Bernoulli"
$ distribution_type  <chr> "discrete"
$ points             <dbl> 1000
$ simulations        <dbl> 2
$ mean               <dbl> 0.5
$ mode               <chr> "[0,1]"
$ coeff_var          <dbl> 0.25
$ skewness           <dbl> 0
$ kurtosis           <dbl> -2
$ mad                <dbl> 0.5
$ entropy            <dbl> 0.6931472
$ fisher_information <dbl> 4
$ computed_std_skew  <dbl> -0.07004291
$ computed_std_kurt  <dbl> 1.004906
$ ci_lo              <dbl> 0
$ ci_hi              <dbl> 1

> n <- 10000
> benchmark(
+   "tidy_bernoulli_v2" = {
+     tidy_bernoulli_v2(n, .5, 1, FALSE)
+   },
+   "tidy_bernoulli_v1" = {
+     TidyDensity::tidy_bernoulli(n, .5, 1)
+   },
+   replications = 100,
+   columns = c("test","replications","elapsed","relative","user.self","sys.self")
+ ) |>
+   arrange(relative)
               test replications elapsed relative user.self sys.self
1 tidy_bernoulli_v2          100    2.50    1.000      2.22     0.26
2 tidy_bernoulli_v1          100    4.67    1.868      4.34     0.31

spsanderson added the enhancement New feature or request label Oct 11, 2023

spsanderson self-assigned this Oct 11, 2023

spsanderson added this to the TidyDensity 1.3.0 milestone Dec 3, 2023

spsanderson added a commit that referenced this issue Dec 7, 2023

Towards #350

73f30cb

spsanderson closed this as completed in 4b55837 Dec 7, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Possibly convert tidy_dist_() functions to use `data.table` as the underlying generator #350

Possibly convert tidy_dist_() functions to use `data.table` as the underlying generator #350

spsanderson commented Oct 11, 2023 •

edited

Possibly convert tidy_dist_() functions to use data.table as the underlying generator #350

Possibly convert tidy_dist_() functions to use data.table as the underlying generator #350

Comments

spsanderson commented Oct 11, 2023 • edited

Possibly convert tidy_dist_() functions to use `data.table` as the underlying generator #350

Possibly convert tidy_dist_() functions to use `data.table` as the underlying generator #350

spsanderson commented Oct 11, 2023 •

edited