-
Notifications
You must be signed in to change notification settings - Fork 15
/
prepare.R
183 lines (172 loc) · 6.34 KB
/
prepare.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# ------------------------------------------
# Authors: Andreas Alfons, Matthias Templ
# and Daniel Schopfhauser
# Vienna University of Technology
# ------------------------------------------
#' Transformation and standardization
#'
#' This function is used by the `VIM` GUI for transformation and
#' standardization of the data.
#'
#'
#' **Transformation**:
#'
#' `"none"`: no transformation is used.
#'
#' `"logarithm"`: compute the the logarithm (to the base 10).
#'
#' `"boxcox"`: apply a Box-Cox transformation. Powers may be specified or
#' calculated with the function [car::powerTransform()].
#'
#' **Standardization**:
#'
#' `"none"`: no standardization is used.
#'
#' `"classical"`: apply a *z*-Transformation on each variable by
#' using function [scale()].
#'
#' `"robust"`: apply a robustified *z*-Transformation by using median
#' and MAD.
#'
#' @param x a vector, matrix or `data.frame`.
#' @param scaling the scaling to be applied to the data. Possible values are
#' `"none"`, `"classical"`, `MCD`, `"robust"` and
#' `"onestep"`.
#' @param transformation the transformation of the data. Possible values are
#' `"none"`, `"minus"`, `"reciprocal"`, `"logarithm"`,
#' `"exponential"`, `"boxcox"`, `"clr"`, `"ilr"` and
#' `"alr"`.
#' @param alpha a numeric parameter controlling the size of the subset for the
#' *MCD* (if `scaling="MCD"`). See [robustbase::covMcd()].
#' @param powers a numeric vector giving the powers to be used in the Box-Cox
#' transformation (if `transformation="boxcox"`). If `NULL`, the
#' powers are calculated with function [car::powerTransform()].
#' @param start a constant to be added prior to Box-Cox transformation (if
#' `transformation="boxcox"`).
#' @param alrVar variable to be used as denominator in the additive logratio
#' transformation (if `transformation="alr"`).
#' @return Transformed and standardized data.
#' @author Matthias Templ, modifications by Andreas Alfons
#' @seealso [scale()], [car::powerTransform()]
#' @keywords manip
#' @examples
#'
#' data(sleep, package = "VIM")
#' x <- sleep[, c("BodyWgt", "BrainWgt")]
#' prepare(x, scaling = "robust", transformation = "logarithm")
#'
#' @export
prepare <- function(x, scaling = c("none","classical","MCD","robust","onestep"),
transformation = c("none","minus","reciprocal","logarithm",
"exponential","boxcox","clr","ilr","alr"),
alpha = NULL, powers = NULL, start = 0, alrVar) {
check_data(x)
x <- as.data.frame(x)
if(is.data.frame(x)) x <- data.matrix(x)
transformation <- match.arg(transformation)
if(transformation != "none") {
x <- switch(transformation, minus=-x, reciprocal=1/x,
logarithm=log10(x), exponential=10^x,
boxcox=boxcoxVIM(x, powers, start), clr=clr(x),
ilr=ilr(x), alr=alr(x, alrVar))
}
scaling <- match.arg(scaling)
if(scaling != "none") {
x <- switch(scaling, classical=scale(x), MCD=scaleMcd(x, alpha=alpha),
robust=scaleRob(x), onestep=scaleRob(x, onestep=TRUE))
}
x
}
scaleMcd <- function(x, alpha = NULL) {
tmp <- if(is.null(alpha)) covMcd(x) else covMcd(x, alpha=alpha)
scale(x, center=tmp$center, scale=diag(tmp$cov))
}
scaleRob <- function(x, onestep = FALSE) {
if(onestep) {
if(is.null(dim(x))) cen <- onestepMean(x, na.rm=TRUE)
else cen <- apply(x, 2, onestepMean, na.rm=TRUE)
scale(x, center=cen)
} else {
if(is.null(dim(x))) {
cen <- median(x, na.rm=TRUE)
sc <- mad(x, na.rm=TRUE)
}
else {
cen <- apply(x, 2, median, na.rm=TRUE)
sc <- apply(x, 2, mad, na.rm=TRUE)
}
scale(x, center=cen, scale=sc)
}
}
onestepMean <- function(x, na.rm = FALSE) {
if (!is.numeric(x) && !is.complex(x) && !is.logical(x)) {
warning("argument is not numeric or logical: returning NA")
return(NA_real_)
}
mMedian <- median(x, na.rm=na.rm)
mMad <- mad(x, na.rm=na.rm)
constant <- 3/1.486
limit <- mMedian + c(-1,1) * constant * mMad
x[x < limit[1]] <- limit[1]
x[x > limit[2]] <- limit[2]
mean(x, na.rm=na.rm)
}
boxcoxVIM <- function(x, powers = NULL, start = 0) {
if(is.null(dim(x))) {
if(length(x) == 0) stop("'x' must have positive length")
if(is.null(powers)) {
i <- !is.na(x)
xi <- x[i]
if(length(xi) == 0) stop("all values in 'x' are NA")
powers <- powerTransform(xi)$lambda
}
bcPower(x, powers, start)
} else {
if(nrow(x) == 0) stop("'x' has no rows")
else if(ncol(x) == 0) stop("'x' has no columns")
if(is.null(powers)) {
i <- apply(x, 1, function(x) !any(is.na(x)))
xi <- x[i,, drop=FALSE]
if(nrow(xi) == 0) stop("all rows in 'x' contain NAs")
powers <- powerTransform(xi)$lambda
}
# box.cox only works with vectors
# if x has only one column, the result of mapply is a vector
as.matrix(mapply(bcPower, x=x, p=powers, start=start))
}
}
logratio <- function(x, type = c("clr","ilr","alr"), alrVar) {
type <- match.arg(type)
switch(type, clr=clr(x), ilr=ilr(x), alr=alr(x, alrVar))
}
clr <- function(x) {
if(!inherits(x, c("matrix","data.frame")))
stop("'x' must be a matrix or data.frame")
else if(ncol(x) < 2) stop("'x' must be at least 2-dimensional")
xgeom <- 10^apply(log10(x), 1, mean, na.rm=TRUE)
x/xgeom
}
ilr <- function(x) {
if(!inherits(x, c("matrix","data.frame")))
stop("'x' must be a matrix or data.frame")
else if(ncol(x) < 2) stop("'x' must be at least 2-dimensional")
x.ilr <- matrix(NA, nrow=nrow(x), ncol=ncol(x)-1)
for(i in 1:ncol(x.ilr)) {
x.ilr[, i] <- sqrt((i)/(i+1))*
log(((apply(x[, 1:i, drop=FALSE], 1, prod))^(1/i))/(x[, i+1]))
}
colnames(x.ilr) <- paste("ilrVar", 1:ncol(x.ilr), sep="")
x.ilr
}
alr <- function(x, alrVar) {
if(!inherits(x, c("matrix","data.frame")))
stop("'x' must be a matrix or data.frame")
else if(ncol(x) < 2) stop("'x' must be at least 2-dimensional")
if(is.numeric(alrVar)) w <- alrVar
else if(is.character(alrVar)) {
w <- match(alrVar, colnames(x))
if(is.na(w)) stop(gettextf("'x' does not contain variable '%s'",alrVar))
}
else stop("'alrVar' must be numeric or a character string")
log10(x[, -w])-log10(x[, w])%*%t(rep(1, ncol(x)-1))
}