Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
719 lines (609 sloc) 15.4 KB
#' Akaike information criterion
#'
#' @description Akaike information criterion for model selection.
#'
#' @param model An object of class \code{lm}.
#'
#' @param method A character vector; specify the method to compute AIC. Valid
#' options include R, STATA and SAS.
#'
#' @details
#' AIC provides a means for model selection. Given a collection of models for
#' the data, AIC estimates the quality of each model, relative to each of the
#' other models. R and STATA use loglikelihood to compute AIC. SAS uses residual
#' sum of squares. Below is the formula in each case:
#'
#' \emph{R & STATA}
#' \deqn{AIC = -2(loglikelihood) + 2p}
#'
#' \emph{SAS}
#' \deqn{AIC = n * ln(SSE / n) + 2p}
#'
#' where \emph{n} is the sample size and \emph{p} is the number of model parameters including intercept.
#'
#' @return Akaike information criterion of the model.
#'
#' @references
#' Akaike, H. (1969). “Fitting Autoregressive Models for Prediction.” Annals of the Institute of Statistical
#' Mathematics 21:243–247.
#'
#' Judge, G. G., Griffiths, W. E., Hill, R. C., and Lee, T.-C. (1980). The Theory and Practice of Econometrics.
#' New York: John Wiley & Sons.
#'
#' @examples
#' # using R computation method
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_aic(model)
#'
#' # using STATA computation method
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_aic(model, method = 'STATA')
#'
#' # using SAS computation method
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_aic(model, method = 'SAS')
#'
#' @family model selection criteria
#'
#' @importFrom stats logLik
#'
#' @export
#'
ols_aic <- function(model, method = c("R", "STATA", "SAS")) {
check_model(model)
method <- match.arg(method)
n <- model_rows(model)
p <- model_n_coeffs(model)
if (method == "R") {
lk <- logLik(model)
-2 * lk[1] + 2 * (p + 1)
} else if (method == "STATA") {
lk <- logLik(model)
-2 * lk[1] + 2 * p
} else if (method == "SAS") {
sse <- model_rss(model)
n * log(sse / n) + 2 * p
} else {
message("Please specify a valid method.")
}
}
#' Bayesian information criterion
#'
#' Bayesian information criterion for model selection.
#'
#' @param model An object of class \code{lm}.
#' @param method A character vector; specify the method to compute BIC. Valid
#' options include R, STATA and SAS.
#'
#' @details
#' SBC provides a means for model selection. Given a collection of models for
#' the data, SBC estimates the quality of each model, relative to each of the
#' other models. R and STATA use loglikelihood to compute SBC. SAS uses residual
#' sum of squares. Below is the formula in each case:
#'
#' \emph{R & STATA}
#' \deqn{AIC = -2(loglikelihood) + ln(n) * 2p}
#'
#' \emph{SAS}
#' \deqn{AIC = n * ln(SSE / n) + p * ln(n)}
#'
#' where \emph{n} is the sample size and \emph{p} is the number of model parameters including intercept.
#'
#' @return The bayesian information criterion of the model.
#'
#' @references
#' Schwarz, G. (1978). “Estimating the Dimension of a Model.” Annals of Statistics 6:461–464.
#'
#' Judge, G. G., Griffiths, W. E., Hill, R. C., and Lee, T.-C. (1980). The Theory and Practice of Econometrics.
#' New York: John Wiley & Sons.
#'
#' @examples
#' # using R computation method
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_sbc(model)
#'
#' # using STATA computation method
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_sbc(model, method = 'STATA')
#'
#' # using SAS computation method
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_sbc(model, method = 'SAS')
#'
#' @family model selection criteria
#'
#' @export
#'
ols_sbc <- function(model, method = c("R", "STATA", "SAS")) {
check_model(model)
method <- match.arg(method)
n <- model_rows(model)
p <- model_n_coeffs(model)
if (method == "R") {
lk <- logLik(model)
-2 * lk[1] + log(n) * (p + 1)
} else if (method == "STATA") {
lk <- logLik(model)
-2 * lk[1] + log(n) * p
} else if (method == "SAS") {
sse <- model_rss(model)
n * log(sse / n) + p * log(n)
} else {
message("Please specify a valid method.")
}
}
#' Sawa's bayesian information criterion
#'
#' @description Sawa's bayesian information criterion for model selection.
#'
#' @param model An object of class \code{lm}.
#' @param full_model An object of class \code{lm}.
#'
#' @details
#' Sawa (1978) developed a model selection criterion that was derived from a
#' Bayesian modification of the AIC criterion. Sawa's Bayesian Information
#' Criterion (BIC) is a function of the number of observations n, the SSE, the
#' pure error variance fitting the full model, and the number of independent
#' variables including the intercept.
#'
#' \deqn{SBIC = n * ln(SSE / n) + 2(p + 2)q - 2(q^2)}
#'
#' where \eqn{q = n(\sigma^2)/SSE}, \emph{n} is the sample size, \emph{p} is the number of model parameters including intercept
#' \emph{SSE} is the residual sum of squares.
#' @return Sawa's Bayesian Information Criterion
#'
#' @references
#' Sawa, T. (1978). “Information Criteria for Discriminating among Alternative Regression Models.” Econometrica
#' 46:1273–1282.
#'
#' Judge, G. G., Griffiths, W. E., Hill, R. C., and Lee, T.-C. (1980). The Theory and Practice of Econometrics.
#' New York: John Wiley & Sons.
#'
#' @examples
#' full_model <- lm(mpg ~ ., data = mtcars)
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_sbic(model, full_model)
#'
#' @family model selection criteria
#'
#' @export
#'
ols_sbic <- function(model, full_model) {
check_model(model)
check_model(full_model)
n <- model_rows(model)
p <- anova_coeffs(model)
r <- full_model_coeffs(full_model)
q <-
q1(full_model, r) %>%
divide_by(q2(model, p)) %>%
multiply_by(n)
sbicout(model, n, p, q)
}
#' Extract mean square
#'
#' Extracts the mean square from \code{anova()}.
#'
#' @param full_model An object of class \code{lm}.
#' @param r A vector of length 1.
#'
#' @keywords internal
#'
#' @noRd
#'
q1 <- function(full_model, r) {
full_model %>%
anova() %>%
extract2(3) %>%
extract(r)
}
#' Extract sum of squares
#'
#' Extracts the sum of squares from \code{anova()}.
#'
#' @param full_model An object of class \code{lm}.
#' @param p A vector of length 1.
#'
#' @keywords internal
#'
#' @noRd
#'
q2 <- function(model, p) {
model %>%
anova() %>%
extract2(2) %>%
extract(p)
}
#' SBIC internal
#'
#' Returns sawa's bayesian information criterion.
#'
#' @param model An object of class \code{lm}.
#' @param n A vector of length 1.
#' @param p A vector of length 1.
#' @param q A vector of length 1.
#'
#' @keywords internal
#'
#' @noRd
#'
sbicout <- function(model, n, p, q) {
a <- (2 * (p + 2) * q)
b <- (2 * (q ^ 2))
model %>%
model_rss() %>%
divide_by(n) %>%
log() %>%
multiply_by(n) %>%
add(a) %>%
subtract(b)
}
#' Mallow's Cp
#'
#' @description Mallow's Cp.
#'
#' @param model An object of class \code{lm}.
#' @param fullmodel An object of class \code{lm}.
#'
#' @details
#' Mallows' Cp statistic estimates the size of the bias that is introduced into
#' the predicted responses by having an underspecified model. Use Mallows' Cp
#' to choose between multiple regression models. Look for models where
#' Mallows' Cp is small and close to the number of predictors in the model plus
#' the constant (p).
#'
#' @return Mallow's Cp of the model.
#'
#' @references
#' Hocking, R. R. (1976). “The Analysis and Selection of Variables in a Linear Regression.” Biometrics
#' 32:1–50.
#'
#' Mallows, C. L. (1973). “Some Comments on Cp.” Technometrics 15:661–675.
#'
#' @examples
#' full_model <- lm(mpg ~ ., data = mtcars)
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_mallows_cp(model, full_model)
#'
#' @family model selection criteria
#'
#' @export
#'
ols_mallows_cp <- function(model, fullmodel) {
check_model(model)
check_model(fullmodel)
n <- model_rows(model)
p <- anova_coeffs(model)
q <- full_model_coeffs(fullmodel)
mcpout(model, fullmodel, n, p, q)
}
#' Mallow's Cp internal
#'
#' Computes Mallow's Cp.
#'
#' @param model An object of class \code{lm}.
#' @param fullmodel An object of class \code{lm}.
#' @param n A numeric vector of length 1.
#' @param p A numeric vector of length 1.
#' @param q A numeric vector of length 1.
#'
#' @keywords internal
#'
#' @noRd
#'
mcpout <- function(model, fullmodel, n, p, q) {
sse <- model_rss(model)
sec <- (n - (2 * p))
mse <-
fullmodel %>%
anova() %>%
extract2(3) %>%
rev() %>%
extract(1)
sse %>%
divide_by(mse) %>%
subtract(sec)
}
#' MSEP
#'
#' Estimated error of prediction, assuming multivariate normality.
#'
#' @param model An object of class \code{lm}.
#'
#' @details
#' Computes the estimated mean square error of prediction assuming that both
#' independent and dependent variables are multivariate normal.
#'
#' \deqn{MSE(n + 1)(n - 2) / n(n - p - 1)}
#'
#' where \eqn{MSE = SSE / (n - p)}, n is the sample size and p is the number of
#' predictors including the intercept
#'
#' @return Estimated error of prediction of the model.
#'
#' @references
#' Stein, C. (1960). “Multiple Regression.” In Contributions to Probability and Statistics: Essays in Honor
#' of Harold Hotelling, edited by I. Olkin, S. G. Ghurye, W. Hoeffding, W. G. Madow, and H. B. Mann,
#' 264–305. Stanford, CA: Stanford University Press.
#'
#' Darlington, R. B. (1968). “Multiple Regression in Psychological Research and Practice.” Psychological
#' Bulletin 69:161–182.
#'
#' @examples
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_msep(model)
#'
#' @family model selection criteria
#'
#' @export
#'
ols_msep <- function(model) {
check_model(model)
sepout(model)
}
#' MSEP internal
#'
#' Computes the estimated mean square error of prediction.
#'
#' @param model An object of class \code{lm}.
#'
#' @keywords internal
#'
#' @noRd
#'
sepout <- function(model) {
n <- model_rows(model)
p <- anova_coeffs(model)
mse <-
model %>%
anova() %>%
extract2(3) %>%
extract(p)
num <-
(n + 1) %>%
multiply_by((n - 2)) %>%
multiply_by(mse)
den <- n * (n - p - 1)
num / den
}
#' Final prediction error
#'
#' @description Estimated mean square error of prediction.
#'
#' @param model An object of class \code{lm}.
#'
#' @details
#' Computes the estimated mean square error of prediction for each model
#' selected assuming that the values of the regressors are fixed and that the
#' model is correct.
#'
#' \deqn{MSE((n + p) / n)}
#'
#' where \eqn{MSE = SSE / (n - p)}, n is the sample size and p is the number of predictors including the intercept
#'
#' @return Final prediction error of the model.
#'
#' @references
#' Akaike, H. (1969). “Fitting Autoregressive Models for Prediction.” Annals of the Institute of Statistical
#' Mathematics 21:243–247.
#'
#' Judge, G. G., Griffiths, W. E., Hill, R. C., and Lee, T.-C. (1980). The Theory and Practice of Econometrics.
#' New York: John Wiley & Sons.
#'
#' @examples
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_fpe(model)
#'
#' @family model selection criteria
#'
#' @export
#'
ols_fpe <- function(model) {
check_model(model)
jpout(model)
}
#' Final prediction error internal
#'
#' Computes the final prediction error.
#'
#' @param model An object of class \code{lm}.
#'
#' @keywords internal
#'
#' @noRd
#'
jpout <- function(model) {
n <- model_rows(model)
p <- anova_coeffs(model)
mse <-
model %>%
anova() %>%
extract2(3) %>%
extract(p)
n %>%
add(p) %>%
divide_by(n) %>%
multiply_by(mse)
}
#' @title Amemiya's prediction criterion
#'
#' @description Amemiya's prediction error.
#'
#' @param model An object of class \code{lm}.
#'
#' @details
#' Amemiya's Prediction Criterion penalizes R-squared more heavily than does
#' adjusted R-squared for each addition degree of freedom used on the
#' right-hand-side of the equation. The higher the better for this criterion.
#'
#' \deqn{((n + p) / (n - p))(1 - (R^2))}
#'
#' where \emph{n} is the sample size, \emph{p} is the number of predictors including the intercept and
#' \emph{R^2} is the coefficient of determination.
#'
#' @return Amemiya's prediction error of the model.
#'
#' @references
#' Amemiya, T. (1976). Selection of Regressors. Technical Report 225, Stanford University, Stanford, CA.
#'
#' Judge, G. G., Griffiths, W. E., Hill, R. C., and Lee, T.-C. (1980). The Theory and Practice of Econometrics.
#' New York: John Wiley & Sons.
#'
#' @examples
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_apc(model)
#'
#' @family model selection criteria
#'
#' @export
#'
ols_apc <- function(model) {
check_model(model)
pcout(model)
}
#' Amemiya internal
#'
#' Computes the Amemiya prediction error.
#'
#' @param model An object of class \code{lm}.
#'
#' @importFrom magrittr multiply_by
#'
#' @keywords internal
#'
#' @noRd
#'
pcout <- function(model) {
n <- model_rows(model)
p <- anova_coeffs(model)
rse <-
model %>%
summary() %>%
extract2(8)
(n + p) %>%
divide_by((n - p)) %>%
multiply_by((1 - rse))
}
#' @title Hocking's Sp
#'
#' @description Average prediction mean squared error.
#'
#' @param model An object of class \code{lm}.
#'
#' @details Hocking's Sp criterion is an adjustment of the residual sum of
#' Squares. Minimize this criterion.
#'
#' \deqn{MSE / (n - p - 1)}
#'
#' where \eqn{MSE = SSE / (n - p)}, n is the sample size and p is the number of predictors including the intercept
#'
#' @return Hocking's Sp of the model.
#'
#' @references
#' Hocking, R. R. (1976). “The Analysis and Selection of Variables in a Linear Regression.” Biometrics
#' 32:1–50.
#'
#' @examples
#' model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
#' ols_hsp(model)
#'
#' @family model selection criteria
#'
#' @export
#'
ols_hsp <- function(model) {
check_model(model)
spout(model)
}
#' Hocking's internal
#'
#' Computes the Hocking's Sp statistic.
#'
#' @param model An object of class \code{lm}.
#'
#' @importFrom magrittr divide_by
#'
#' @keywords internal
#'
#' @noRd
#'
spout <- function(model) {
n <- model_rows(model)
p <- anova_coeffs(model)
mse <-
model %>%
anova() %>%
extract2(3) %>%
extract(p)
mse / (n - p - 1)
}
#' Model data rows
#'
#' Returns the number of rows in the data used in the model.
#'
#' @param model An object of class \code{lm}.
#'
#' @noRd
#'
model_rows <- function(model) {
model %>%
model.frame() %>%
nrow()
}
#' Model Coefficients
#'
#' Returns the number of coefficients in the model.
#'
#' @param model An object of class \code{lm}.
#'
#' @noRd
#'
model_n_coeffs <- function(model) {
model %>%
use_series(coefficients) %>%
length()
}
#' Residual sum of squares
#'
#' Returns the residual sum of squares.
#'
#' @importFrom magrittr raise_to_power
#'
#' @param model An object of class \code{lm}.
#'
#' @noRd
#'
model_rss <- function(model) {
model %>%
residuals() %>%
raise_to_power(2) %>%
sum()
}
#' Coefficients
#'
#' Returns the number of coefficients in the model
#' using `anova()` including the residual sum of squares.
#'
#' @param model An object of class \code{lm}.
#'
#' @noRd
#'
anova_coeffs <- function(model) {
model %>%
anova() %>%
extract2(1) %>%
length()
}
#' Number of columns
#'
#' Returns the number of columns in the data.
#'
#' @param model An object of class \code{lm}.
#'
#' @noRd
#'
full_model_coeffs <- function(model) {
model %>%
model.frame() %>%
length()
}
You can’t perform that action at this time.