-
Notifications
You must be signed in to change notification settings - Fork 0
/
density.R
191 lines (185 loc) · 6.14 KB
/
density.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#' @export
print.kde <- function(x, ...) {
kde <- !(is.null(x$h) & is.null(x$H))
if (inherits(x$eval.points, "list")) {
d <- length(x$eval.points)
} else {
d <- 1L
}
if(!kde) {
cat("Density of: [",
paste0(x$names, collapse = ", "), "]\n", sep = "")
} else {
cat("Kernel density estimate of: [",
paste0(x$names, collapse = ", "), "]\n", sep = "")
}
if(d == 1L){
ngrid <- length(x$eval.points)
} else {
ngrid <- lapply(x$eval.points, length)
}
cat("Computed on a grid of size", paste(ngrid, collapse = " x "), "\n")
if(kde) {
cat("Bandwidth: ")
if (d == 1L) {
cat("h = ", format(x$h, digits = 4))
} else {
cat("H = \n")
cat(format(x$H, digits = 4), quote=FALSE)
}
}
invisible(x)
}
#' Convert data frame or matrix object to kde class
#'
#' A density specified as a data frame or matrix can be converted to a kde object.
#' This is useful for plotting the density using \code{\link{autoplot.kde}}.
#' As kde objects are defined on a grid, the density values are interpolated
#' based on the points in the data frame or matrix.
#'
#' @param object Data frame or matrix with numerical columns, where one column
#' (specified by `density_column`) contains the density values, and the
#' remaining columns define the points at which the density is evaluated.
#' @param density_column Name of the column containing the density values, specified
#' as a bare expression. If missing, the last column is used.
#' @param ngrid Number of points to use for the grid in each dimension. Default is
#' 10001 for univariate densities and 101 for multivariate densities.
#' @param ... Additional arguments are ignored.
#' @return An object of class "kde"
#' @author Rob J Hyndman
#' @examples
#' tibble(y = seq(-4, 4, by = 0.01), density = dnorm(y)) |>
#' as_kde()
#' @export
as_kde <- function(object, density_column, ngrid, ...) {
# Check columns of object are all numerical
object <- as.data.frame(object)
if(!all(sapply(object, is.numeric))) {
stop("All columns of object must be numeric")
}
# Check density_column is in object
if(missing(density_column)) {
density_column <- tail(colnames(object), 1)
} else {
density_column <- dplyr::as_label(dplyr::enquo(density_column))
}
if(!(density_column %in% colnames(object))) {
stop(paste(density_column, "not found"))
}
# Separate points from density values
den <- object[[density_column]]
object[[density_column]] <- NULL
# Find the dimension
d <- NCOL(object)
if (d == 1L) {
if(missing(ngrid)) {
ngrid <- 10001
}
# Interpolate density on finer grid
density <- list(eval.points = seq(min(object), max(object), length = ngrid))
density$estimate <- approx(object[[1]], den, xout = density$eval.points)$y
} else if (d == 2L) {
if(missing(ngrid)) {
ngrid <- 101
}
density <- density_on_grid(as.matrix(object), den, ngrid)
} else {
stop("Only univariate and bivariate densities are supported")
}
# Find falpha using quantile method
missing <- is.na(density$estimate)
samplex <- sample(density$estimate[!missing],
size = 50000, replace = TRUE,
prob = density$estimate[!missing]
)
density$cont <- quantile(samplex, prob = (99:1) / 100, type = 8)
# Set missing values to 0
density$estimate[is.na(density$estimate)] <- 0
# Add names
density$names <- colnames(object)
if(is.null(density$names)) {
if(d == 1) {
density$names <- "y"
} else {
density$names <- paste0("y", seq_len(d))
}
}
structure(density, class = "kde")
}
density_on_grid <- function(y, fy, ngrid) {
y <- as.matrix(y)
if (NCOL(y) != 2L) {
stop("y must be a matrix with 2 columns")
}
# Create grid of points
density <- list(eval.points = list(
seq(min(y[, 1]), max(y[, 1]), length = ngrid),
seq(min(y[, 2]), max(y[, 2]), length = ngrid)
))
# Bivariate interpolation
grid <- expand.grid(density$eval.points[[1]], density$eval.points[[2]])
ifun <- interpolation::interpfun(x = y[, 1], y = y[, 2], z = fy)
density$estimate <- ifun(grid[,1], grid[,2]) |>
matrix(nrow = ngrid)
return(density)
}
#' Robust bandwidth estimation for kernel density estimation
#'
#' @param data A numeric matrix or data frame.
#' @param method Method to use for selecting the bandwidth.
#' `robust_normal` uses a robust version of the normal reference rule.
#' `lookout` uses the topological data analysis approach that is part of the lookout algorithm.
#' @param max.iter How many times should the `lookout` method be iterated. That is, outliers
#' (probability < 0.05) are removed and the bandwidth is re-computed from the
#' remaining observations.
#' @return A matrix of bandwidths (or scalar in the case of univariate data).
#' @author Rob J Hyndman
#' @examples
#' # Univariate bandwidth calculation
#' kde_bandwidth(oldfaithful$duration)
#' # Bivariate bandwidth calculation
#' kde_bandwidth(oldfaithful[,2:3])
#' @export
kde_bandwidth <- function(data, method = c("robust_normal", "double", "lookout"),
max.iter = 2) {
method <- match.arg(method)
d <- NCOL(data)
n <- NROW(data)
if(d > 1) {
# Find robust covariance matrix of data
S <- robustbase::covOGK(data, sigmamu = robustbase::s_IQR)$cov
}
if(method != "lookout") {
k <- ifelse(method == "double", 2, 1)
if(d == 1L)
return(k * 1.06 * robustbase::s_IQR(data) * n^(-0.2))
else {
return((4/(n * (d + 2)))^(2/(d + 4)) * k^2 * S)
}
} else {
stop("Not yet implemented")
# # Initial estimate
# if(d == 1L) {
# S <- 1
# } else {
# # Normalize data
# U <- chol(solve(S))
# data <- as.matrix(data) %*% t(U)
# }
# h <- lookout::find_tda_bw(data, fast = (n > 1000)) |>
# suppressWarnings()
# iter <- 1
# oldh <- 0
# while(iter < max.iter & h != oldh) {
# iter <- iter + 1
# oldh <- h
# scores <- calc_kde_scores(data, h=h, H=h*diag(d))
# p <- lookout(density_scores = scores$scores, loo_scores = scores$loo) |>
# suppressWarnings()
# data <- as.matrix(data)[p > 0.05,]
# # Refined estimate
# h <- lookout::find_tda_bw(data, fast = (n > 1000))
# }
# return(h * S)
}
}