
Add function util_chisquare_param_estimate() #414

Closed
spsanderson opened this issue Apr 12, 2024 · 3 comments


spsanderson commented Apr 12, 2024

Add a function for util_chisquare_param_estimate()

Function:

#' Estimate Chisquare Parameters
#'
#' @family Parameter Estimation
#' @family Chisquare
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function checks that the given vector `.x` is a numeric vector
#' and attempts to estimate the degrees of freedom (`df`) and non-centrality
#' (`ncp`) parameters of a Chisquare distribution. The function first performs
#' tidyeval on the input data to ensure it is a numeric vector. It then checks
#' that there are at least two data points, as this is a requirement for
#' parameter estimation.
#' 
#' The estimation of the chi-square distribution parameters is performed using 
#' maximum likelihood estimation (MLE) implemented with the `bbmle` package. 
#' The negative log-likelihood function is minimized to obtain the estimates for 
#' the degrees of freedom (\code{doff}) and the non-centrality parameter (\code{ncp}). 
#' Initial values for the optimization are set based on the sample variance and 
#' mean, but these can be adjusted if necessary.
#' 
#' If the estimation fails or encounters an error, the function returns \code{NA} 
#' for both \code{doff} and \code{ncp}.
#' 
#' Finally, the function returns a tibble containing the following information:
#' \describe{
#'   \item{dist_type}{The type of distribution, which is "Chisquare" in this case.}
#'   \item{samp_size}{The sample size, i.e., the number of data points in the input vector.}
#'   \item{min}{The minimum value of the data points.}
#'   \item{max}{The maximum value of the data points.}
#'   \item{mean}{The mean of the data points.}
#'   \item{degrees_of_freedom}{The estimated degrees of freedom (\code{doff}) for the chi-square distribution.}
#'   \item{ncp}{The estimated non-centrality parameter (\code{ncp}) for the chi-square distribution.}
#' }
#' 
#' Additionally, if the argument \code{.auto_gen_empirical} is set to \code{TRUE} 
#' (which is the default behavior), the function also returns a combined tibble 
#' containing both empirical and chi-square distribution data, obtained by 
#' calling \code{tidy_empirical} and \code{tidy_chisquare}, respectively.
#' 
#' @description This function will attempt to estimate the Chisquare `df` and
#' `ncp` parameters given some vector of values `.x`. The function will return a
#' list output by default, and if the parameter `.auto_gen_empirical` is set to
#' `TRUE` then the empirical data given to the parameter `.x` will be run through
#' the `tidy_empirical()` function and combined with the estimated Chisquare data.
#'
#' @param .x The vector of data to be passed to the function. Must contain
#' non-negative values.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function output.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#'
#' tc <- tidy_chisquare(.n = 500, .df = 6, .ncp = 1) |> pull(y)
#' output <- util_chisquare_param_estimate(tc)
#'
#' output$parameter_tbl
#'
#' output$combined_data_tbl |>
#'   tidy_combined_autoplot()
#'
#' @return
#' A tibble/list
#'
#' @name util_chisquare_param_estimate
NULL

#' @export
#' @rdname util_chisquare_param_estimate

util_chisquare_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {
  
  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  minx <- min(x_term)
  maxx <- max(x_term)
  
  # Checks ----
  if (!is.vector(x_term, mode = "numeric")) {
    rlang::abort(
      message = "The '.x' term must be a numeric vector.",
      use_cli_format = TRUE
    )
  }
  
  if (n < 2) {
    rlang::abort(
      message = "You must supply at least two data points for this function.",
      use_cli_format = TRUE
    )
  }
  
  # Parameters ----
  estimate_chisq_params <- function(data) {
    # Negative log-likelihood function
    negLogLik <- function(df, ncp) {
      -sum(stats::dchisq(data, df = df, ncp = ncp, log = TRUE))
    }
    
    # Initial values (adjust based on your data if necessary)
    start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data)))
    
    # MLE using bbmle
    mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
    # Return estimated parameters as a named vector
    df <- dplyr::tibble(
      est_df = bbmle::coef(mle_fit)[1],
      est_ncp = bbmle::coef(mle_fit)[2]
    )
    return(df)
  }
  
  safe_estimates <- purrr::possibly(
    estimate_chisq_params,
    otherwise = dplyr::tibble(est_df = NA_real_, est_ncp = NA_real_),
    quiet = TRUE
  )
  
  estimates <- safe_estimates(x_term)
  doff <- estimates$est_df
  ncp <- estimates$est_ncp
  
  # Return Tibble ----
  if (.auto_gen_empirical) {
    te <- tidy_empirical(.x = x_term)
    tc <- tidy_chisquare(.n = n, .df = round(doff, 3), .ncp = round(ncp, 3))
    combined_tbl <- tidy_combine_distributions(te, tc)
  }
  
  ret <- dplyr::tibble(
    dist_type = "Chisquare",
    samp_size = n,
    min = minx,
    max = maxx,
    mean = mean(x_term),
    degrees_of_freedom = doff,
    ncp = ncp
  )
  
  # Return ----
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "chisquare"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n
  
  if (.auto_gen_empirical) {
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl     = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }
  
  return(output)
}

Example:

> tc <- tidy_chisquare(.n = 500, .df = 6, .ncp = 1) |> pull(y)
> output <- util_chisquare_param_estimate(tc)
Warning messages:
1: In stats::dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
2: In stats::dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
3: In stats::dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
> 
> output$parameter_tbl
# A tibble: 1 × 7
  dist_type samp_size   min   max  mean degrees_of_freedom   ncp
  <chr>         <int> <dbl> <dbl> <dbl>              <dbl> <dbl>
1 Chisquare       500 0.935  20.6  7.13               6.65 0.488
> 
> output$combined_data_tbl |>
+   tidy_combined_autoplot()

[image: tidy_combined_autoplot() of the combined empirical and estimated Chisquare data]
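The "NaNs produced" warnings above come from the optimizer evaluating dchisq() at negative df or ncp values during the search. A minimal sketch of one possible mitigation, assuming bbmle::mle2() passes method and lower through to stats::optim() so the search can be box-constrained; the bounds and the reuse of the tc vector from the example are illustrative, not part of the proposed function:

# Sketch: keep the optimizer inside the valid parameter region so dchisq()
# is never evaluated at df <= 0 or ncp < 0.
negLogLik <- function(df, ncp) {
  -sum(stats::dchisq(tc, df = df, ncp = ncp, log = TRUE))
}

start_vals <- list(df = trunc(var(tc) / 2), ncp = trunc(mean(tc)))

mle_fit <- bbmle::mle2(
  negLogLik,
  start  = start_vals,
  method = "L-BFGS-B",            # box-constrained optimizer
  lower  = c(df = 1e-6, ncp = 0)  # keep df > 0 and ncp >= 0
)

bbmle::coef(mle_fit)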

spsanderson added the enhancement label on Apr 12, 2024
spsanderson added this to the TidyDensity 1.3.1 milestone on Apr 12, 2024
spsanderson self-assigned this on Apr 12, 2024

spsanderson commented Apr 13, 2024

This might work well too, although finding a good ncp starting point might still prove somewhat elusive (see the moment-based starting-value sketch at the end of this comment):

# Load required package
library(bbmle)

# Sample data (replace with your actual data)
data <- rchisq(100, df = 5, ncp = 20)

# Define negative log-likelihood function
negLogLik <- function(df, ncp) {
  -sum(dchisq(data, df = df, ncp = ncp, log = TRUE))
}

# Initial values for optimization (crucial for good convergence)
start_vals <- list(df = 3, ncp = 20)

# Maximum likelihood estimation
mle_fit <- mle2(negLogLik, start = start_vals)

# Extract estimated parameters
df_est <- coef(mle_fit)[1]
ncp_est <- coef(mle_fit)[2]

# Print the results
cat("Estimated df:", df_est, "\n")
cat("Estimated ncp:", ncp_est)

Output:

Estimated df: 10.84161
Estimated ncp: 14.05248

nd <- rchisq(100, df = df_est, ncp = ncp_est)

hist(data, col = "lightblue", main = "Histogram of data")
hist(nd, col = "lightgreen", add = TRUE)

[image: overlaid histograms of the original data (lightblue) and data re-simulated from the estimated parameters (lightgreen)]
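A sketch of a less arbitrary starting point, built from the standard noncentral chi-square moments E[X] = df + ncp and Var(X) = 2 * (df + 2 * ncp). Inverting them gives ncp ≈ var/2 - mean and df ≈ mean - ncp, each clamped below because either can come out negative on a noisy sample. mom_start is a hypothetical helper, and data/negLogLik are the objects defined earlier in this comment:

# Sketch: method-of-moments starting values for df and ncp.
mom_start <- function(x) {
  ncp0 <- max(var(x) / 2 - mean(x), 0)   # ncp = Var/2 - E[X]
  df0  <- max(mean(x) - ncp0, 1e-6)      # df  = E[X] - ncp
  list(df = df0, ncp = ncp0)
}

start_vals <- mom_start(data)
mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
coef(mle_fit)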

@spsanderson

Another approach, mapping the same estimator over several simulated data sets with purrr:

> estimate_chisq_params <- function(data) {
+   # Negative log-likelihood function
+   negLogLik <- function(df, ncp) {
+     -sum(dchisq(data, df = df, ncp = ncp, log = TRUE))
+   }
+   
+   # Initial values (adjust based on your data if necessary)
+   start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data)))
+   
+   # MLE using bbmle
+   mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
+   
+   # Return estimated parameters as a named vector
+   c(df = coef(mle_fit)[1], ncp = coef(mle_fit)[2])
+ }
> 
> library(purrr)
> 
> # List of data vectors (replace with your actual data)
> data_list <- list(rchisq(100, df = 5, ncp = 2), 
+                   rchisq(80, df = 3, ncp = 1), 
+                   rchisq(120, df = 7, ncp = 4))
> 
> # Apply the estimation function to each data vector
> param_estimates <- map(data_list, estimate_chisq_params)
Warning messages:
1: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
2: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
3: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
4: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
5: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
6: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
> 
> # Print results
> print(param_estimates)
[[1]]
    df.df   ncp.ncp 
5.3677154 0.9875087 

[[2]]
    df.df   ncp.ncp 
3.2668169 0.3253299 

[[3]]
   df.df  ncp.ncp 
6.559403 4.540669 
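If the estimates are easier to compare side by side, a small variation of the map() call above (a sketch; purrr::list_rbind() needs purrr >= 1.0.0) collects one row per data set:

# Sketch: one row of estimates per simulated data vector.
param_tbl <- data_list |>
  purrr::map(\(x) {
    est <- estimate_chisq_params(x)
    dplyr::tibble(n = length(x), est_df = est[[1]], est_ncp = est[[2]])
  }) |>
  purrr::list_rbind()

param_tbl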


spsanderson commented Apr 13, 2024

Mega script test comparing several candidate df and ncp estimators:

# Lib Load ----
library(tidyverse)
library(bbmle)

# Data ----
# Make parameters and grid
df <- 1:10
ncp <- 1:10
n <- runif(10, 250, 500) |> trunc()
param_grid <- expand_grid(n = n, df = df, ncp = ncp)

# Functions ----
# functions to estimate the parameters of a chisq distribution
# dof
mean_x <- function(x) mean(x)
mean_minus_1 <- function(x) mean(x) - 1
var_div_2 <- function(x) var(x) / 2
length_minus_1 <- function(x) length(x) - 1
# ncp
mean_minus_mean_minus_1 <- function(x) mean(x) - (mean(x) - 1)
ie_mean_minus_var_div_2 <- function(x) ifelse((mean(x) - (var(x) / 2)) < 0, 0, mean(x) - var(x)/2)
ie_optim <- function(x) optim(par = 0,
                             fn = function(ncp) {
                               -sum(dchisq(x, df = var(x)/2, ncp = ncp, log = TRUE))
                             },
                             method = "Brent",
                             lower = 0,
                             upper = 10 * var(x)/2)$par
# both
estimate_chisq_params <- function(data) {
  # Negative log-likelihood function
  negLogLik <- function(df, ncp) {
    -sum(dchisq(data, df = df, ncp = ncp, log = TRUE))
  }
  
  # Initial values (adjust based on your data if necessary)
  start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data)))
  
  # MLE using bbmle
  mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
  # Return estimated parameters as a named vector
  df <- dplyr::tibble(
    est_df = coef(mle_fit)[1],
    est_ncp = coef(mle_fit)[2]
  )
  return(df)
}

safe_estimates <- {
  purrr::possibly(
    estimate_chisq_params,
    otherwise = NA_real_,
    quiet = TRUE
  )
}

# Simulate data ----
set.seed(123)
dff <- param_grid |>
  mutate(x = pmap(pick(everything()), match.fun("rchisq"))) |>
  mutate(
    safe_est_parms = map(x, safe_estimates),
    dfa = map_dbl(x, mean_minus_1),
    dfb = map_dbl(x, var_div_2),
    dfc = map_dbl(x, length_minus_1),
    ncpa = map_dbl(x, mean_minus_mean_minus_1),
    ncpb = map_dbl(x, ie_mean_minus_var_div_2),
    ncpc = map_dbl(x, ie_optim)
  ) |>
  select(-x) |>
  filter(map_lgl(safe_est_parms, ~ any(is.na(.x))) == FALSE) |>
  unnest(cols = safe_est_parms) |>
  mutate(
    dfa_resid = dfa - df,
    dfb_resid = dfb - df,
    dfc_resid = dfc - df,
    dfd_resid = est_df - df,
    ncpa_resid = ncpa - ncp,
    ncpb_resid = ncpb - ncp,
    ncpc_resid = ncpc - ncp,
    ncpd_resid = est_ncp - ncp
  )

# Visuals ----
boxplot(dff$dfa ~ dff$df, main = "mean(x) -1 ~ df")
boxplot(dff$dfb ~ dff$df, main = "var(x) / 2 ~ df")         
boxplot(dff$dfc ~ dff$df, main = "length(x) - 1 ~ df")
boxplot(dff$est_df ~ dff$df, main = "negloglik ~ df - Looks Good")
boxplot(dff$ncpa ~ dff$ncp, main = "mean(x) - (mean(x) - 1) ~ ncp")
boxplot(dff$ncpb ~ dff$ncp, main = "mean(x) - var(x)/2 ~ ncp")
boxplot(dff$ncpc ~ dff$ncp, main = "optim ~ ncp")
boxplot(dff$est_ncp ~ dff$ncp, main = "negloglik ~ ncp - Looks Good")
boxplot(dff$dfa_resid ~ dff$df, main = "mean(x) -1 ~ df Residuals")
boxplot(dff$dfb_resid ~ dff$df, main = "var(x) / 2 ~ df Residuals")
boxplot(dff$dfc_resid ~ dff$df, main = "length(x) - 1 ~ df Residuals")
boxplot(dff$dfd_resid ~ dff$df, main = "negloglik ~ df Residuals")
boxplot(dff$ncpa_resid ~ dff$ncp, main = "mean(x) - (mean(x) - 1) ~ ncp Residuals")
boxplot(dff$ncpb_resid ~ dff$ncp, main = "mean(x) - var(x)/2 ~ ncp Residuals")
boxplot(dff$ncpc_resid ~ dff$ncp, main = "optim ~ ncp Residuals")
boxplot(dff$ncpd_resid ~ dff$ncp, main = "negloglik ~ ncp Residuals")

[images: the boxplots above, comparing each df and ncp estimator and its residuals against the true parameter values]
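A numeric companion to the boxplots, sketched against the same dff columns built above, summarising the mean absolute residual of each estimator:

# Sketch: mean absolute residual of each df estimator, by true df
dff |>
  group_by(df) |>
  summarise(across(starts_with("df") & ends_with("_resid"), \(x) mean(abs(x))),
            .groups = "drop")

# Sketch: mean absolute residual of each ncp estimator, by true ncp
dff |>
  group_by(ncp) |>
  summarise(across(starts_with("ncp") & ends_with("_resid"), \(x) mean(abs(x))),
            .groups = "drop")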
