Fixes #414

spsanderson · Apr 15, 2024 · a173c14 · a173c14
1 parent 02455d4
commit a173c14
Show file tree

Hide file tree

Showing 25 changed files with 293 additions and 3 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -36,7 +36,8 @@ Imports:
     nloptr,
     broom,
     tidyselect,
-    data.table
+    data.table,
+    bbmle
 Suggests: 
     rmarkdown,
     knitr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -95,6 +95,7 @@ export(util_burr_param_estimate)
 export(util_burr_stats_tbl)
 export(util_cauchy_param_estimate)
 export(util_cauchy_stats_tbl)
+export(util_chisquare_param_estimate)
 export(util_chisquare_stats_tbl)
 export(util_exponential_param_estimate)
 export(util_exponential_stats_tbl)

diff --git a/NEWS.md b/NEWS.md
@@ -6,10 +6,11 @@ None
 ## New Features
 1. Fix #405 - Add function `quantile_normalization()` to normalize data using quantiles.
 2. Fix #409 - Add function `check_duplilcate_rows()` to check for duplicate rows in a data frame.
+3. Fix #414 - Add function `util_chisquare_param_estimate()` to estimate the parameters of the chi-square distribution.
 
 ## Minor Fixes and Improvements
 1. Fix #401 - Update `tidy_multi_single_dist()` to respect the `.return_tibble` parameter
-2. Fix $406 - Updaet `tidy_multi_single_dist()` to exclude the `.return_tibble` parameter
+2. Fix $406 - Update `tidy_multi_single_dist()` to exclude the `.return_tibble` parameter
 from returning in the distribution parameters.
 
 # TidyDensity 1.3.0

diff --git a/R/est-param-chisq.R b/R/est-param-chisq.R
@@ -0,0 +1,164 @@
+#' Estimate Chisquare Parameters
+#'
+#' @family Parameter Estimation
+#' @family Chisquare
+#'
+#' @author Steven P. Sanderson II, MPH
+#'
+#' @details This function will see if the given vector `.x` is a numeric vector.
+#' It will attempt to estimate the prob parameter of a Chisquare distribution.
+#' The function first performs tidyeval on the input data to ensure it's a
+#' numeric vector. It then checks if there are at least two data points, as this
+#' is a requirement for parameter estimation.
+#'
+#' The estimation of the chi-square distribution parameters is performed using
+#' maximum likelihood estimation (MLE) implemented with the `bbmle` package.
+#' The negative log-likelihood function is minimized to obtain the estimates for
+#' the degrees of freedom (\code{doff}) and the non-centrality parameter (\code{ncp}).
+#' Initial values for the optimization are set based on the sample variance and
+#' mean, but these can be adjusted if necessary.
+#'
+#' If the estimation fails or encounters an error, the function returns \code{NA}
+#' for both \code{doff} and \code{ncp}.
+#'
+#' Finally, the function returns a tibble containing the following information:
+#' \describe{
+#'   \item{dist_type}{The type of distribution, which is "Chisquare" in this case.}
+#'   \item{samp_size}{The sample size, i.e., the number of data points in the input vector.}
+#'   \item{min}{The minimum value of the data points.}
+#'   \item{max}{The maximum value of the data points.}
+#'   \item{mean}{The mean of the data points.}
+#'   \item{degrees_of_freedom}{The estimated degrees of freedom (\code{doff}) for the chi-square distribution.}
+#'   \item{ncp}{The estimated non-centrality parameter (\code{ncp}) for the chi-square distribution.}
+#' }
+#'
+#' Additionally, if the argument \code{.auto_gen_empirical} is set to \code{TRUE}
+#' (which is the default behavior), the function also returns a combined tibble
+#' containing both empirical and chi-square distribution data, obtained by
+#' calling \code{tidy_empirical} and \code{tidy_chisquare}, respectively.
+#'
+#' @description This function will attempt to estimate the Chisquare prob parameter
+#' given some vector of values `.x`. The function will return a list output by default,
+#' and  if the parameter `.auto_gen_empirical` is set to `TRUE` then the empirical
+#' data given to the parameter `.x` will be run through the `tidy_empirical()`
+#' function and combined with the estimated Chisquare data.
+#'
+#' @param .x The vector of data to be passed to the function. Must be non-negative
+#' integers.
+#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
+#' set to TRUE. This will automatically create the `tidy_empirical()` output
+#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
+#' can then plot out the data using `$combined_data_tbl` from the function output.
+#'
+#' @examples
+#' library(dplyr)
+#' library(ggplot2)
+#'
+#' tc <- tidy_chisquare(.n = 500, .df = 6, .ncp = 1) |> pull(y)
+#' output <- util_chisquare_param_estimate(tc)
+#'
+#' output$parameter_tbl
+#'
+#' output$combined_data_tbl |>
+#'   tidy_combined_autoplot()
+#'
+#' @return
+#' A tibble/list
+#'
+#' @name util_chisquare_param_estimate
+NULL
+
+#' @export
+#' @rdname util_chisquare_param_estimate
+
+util_chisquare_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {
+
+  # Tidyeval ----
+  x_term <- as.numeric(.x)
+  n <- length(x_term)
+  minx <- min(as.numeric(x_term))
+  maxx <- max(as.numeric(x_term))
+
+  # Checks ----
+  if (!is.vector(x_term, mode = "numeric")) {
+    rlang::abort(
+      message = "The '.x' term must be a numeric vector.",
+      use_cli_format = TRUE
+    )
+  }
+
+  if (n < 2) {
+    rlang::abort(
+      message = "You must supply at least two data points for this function.",
+      use_cli_format = TRUE
+    )
+  }
+
+  # Parameters ----
+  estimate_chisq_params <- function(data) {
+    # Negative log-likelihood function
+    negLogLik <- function(df, ncp) {
+      -sum(stats::dchisq(data, df = df, ncp = ncp, log = TRUE))
+    }
+
+    # Initial values (adjust based on your data if necessary)
+    start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data)))
+
+    # MLE using bbmle
+    mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
+    # Return estimated parameters as a named vector
+    df <- dplyr::tibble(
+      est_df = bbmle::coef(mle_fit)[1],
+      est_ncp = bbmle::coef(mle_fit)[2]
+    )
+    return(df)
+  }
+
+  safe_estimates <- {
+    purrr::possibly(
+      estimate_chisq_params,
+      otherwise = NA_real_,
+      quiet = TRUE
+    )
+  }
+
+  estimates <- safe_estimates(x_term)
+  doff <- estimates$est_df |> unname()
+  ncp <- estimates$est_ncp |> unname()
+
+  # Return Tibble ----
+  if (.auto_gen_empirical) {
+    te <- tidy_empirical(.x = x_term)
+    tc <- tidy_chisquare(.n = n, .df = round(doff, 3), .ncp = round(ncp, 3))
+    combined_tbl <- tidy_combine_distributions(te, tc)
+  }
+
+  ret <- dplyr::tibble(
+    dist_type = "Chisquare",
+    samp_size = n,
+    min = minx,
+    max = maxx,
+    mean = mean(x_term),
+    degrees_of_freedom = doff,
+    ncp = ncp
+  )
+
+  # Return ----
+  attr(ret, "tibble_type") <- "parameter_estimation"
+  attr(ret, "family") <- "chisquare"
+  attr(ret, "x_term") <- .x
+  attr(ret, "n") <- n
+
+  if (.auto_gen_empirical) {
+    output <- list(
+      combined_data_tbl = combined_tbl,
+      parameter_tbl     = ret
+    )
+  } else {
+    output <- list(
+      parameter_tbl = ret
+    )
+  }
+
+  return(output)
+}
diff --git a/man/tidy_chisquare.Rd b/man/tidy_chisquare.Rd
diff --git a/man/util_bernoulli_param_estimate.Rd b/man/util_bernoulli_param_estimate.Rd
diff --git a/man/util_beta_param_estimate.Rd b/man/util_beta_param_estimate.Rd
diff --git a/man/util_binomial_param_estimate.Rd b/man/util_binomial_param_estimate.Rd
diff --git a/man/util_burr_param_estimate.Rd b/man/util_burr_param_estimate.Rd
diff --git a/man/util_cauchy_param_estimate.Rd b/man/util_cauchy_param_estimate.Rd
diff --git a/man/util_chisquare_param_estimate.Rd b/man/util_chisquare_param_estimate.Rd
diff --git a/man/util_chisquare_stats_tbl.Rd b/man/util_chisquare_stats_tbl.Rd
diff --git a/man/util_exponential_param_estimate.Rd b/man/util_exponential_param_estimate.Rd
diff --git a/man/util_gamma_param_estimate.Rd b/man/util_gamma_param_estimate.Rd
diff --git a/man/util_geometric_param_estimate.Rd b/man/util_geometric_param_estimate.Rd
diff --git a/man/util_hypergeometric_param_estimate.Rd b/man/util_hypergeometric_param_estimate.Rd
diff --git a/man/util_logistic_param_estimate.Rd b/man/util_logistic_param_estimate.Rd