New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PCA of Raster data [Feature Request] #1361
Comments
Here's an example of how it could be used library(terra)
#> Warning: package 'terra' was built under R version 4.3.2
#> terra 1.7.62
rasterPCA <- function(x, nSamples = NULL, nComp = nlyr(x), spca = FALSE, maskCheck = TRUE, ...){
if(nlyr(x) <= 1) stop("Need at least two layers to calculate PCA.")
ellip <- list(...)
if(nComp > nlyr(x)) nComp <- nlyr(x)
if(!is.null(nSamples)){
trainData <- spatSample(x, size = nSamples, method="random", replace=FALSE, na.rm = TRUE)
if(nrow(trainData) < nlyr(x)) stop("nSamples too small or x contains a layer with NAs only")
model <- princomp(trainData, scores = FALSE, cor = spca)
} else {
if(maskCheck) {
totalMask <- !sum(app(x, is.na))
if(global(totalMask, sum)$sum == 0) stop("x contains either a layer with NAs only or no single pixel with valid values across all layers")
x <- mask(x, totalMask , maskvalue = 0) ## NA areas must be masked from all layers, otherwise the covariance matrix is not non-negative definite
}
covMat <- layerCor(x, fun = "cov", na.rm = TRUE)
model <- princomp(covmat = covMat[[1]], cor=spca)
model$center <- diag(covMat$mean)
model$n.obs <- global(!any(is.na(x)), sum)$sum
if(spca) {
## Calculate scale as population sd like in in princomp
S <- diag(covMat$covariance)
model$scale <- sqrt(S * (model$n.obs-1)/model$n.obs)
}
}
## Predict
out<- terra::predict(x, model = model, na.rm = TRUE, index = 1:nComp, wrArgs = ellip)
names(out) <- paste0("PC", 1:nComp)
return(structure(list(call = match.call(), model = model, map = out)))
}
#Create Data
r<- rast(nrow=40, ncol=40, nlyrs=10)
set.seed(5)
values(r)<- rnorm(ncell(r)*nlyr(r))
values(r)[sample(1:(ncell(r)*nlyr(r)), size = 1000, replace = FALSE)]<- NA #Add NA's
plot(r) PCA<- rasterPCA(r, spca = TRUE)
summary(PCA$model)
#> Importance of components:
#> Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
#> Standard deviation 1.065035 1.060924 1.0353062 1.0232500 1.0026015
#> Proportion of Variance 0.113430 0.112556 0.1071859 0.1047041 0.1005210
#> Cumulative Proportion 0.113430 0.225986 0.3331719 0.4378759 0.5383969
#> Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
#> Standard deviation 0.9866337 0.98439181 0.96219138 0.95073612 0.91861112
#> Proportion of Variance 0.0973446 0.09690272 0.09258123 0.09038992 0.08438464
#> Cumulative Proportion 0.6357415 0.73264422 0.82522545 0.91561536 1.00000000
plot(PCA$map) Created on 2023-11-30 with reprex v2.0.2 |
I am not opposed but not convinced either given the simplicity of this example in
|
@rhijmans, I think this method doesn't require sampling for large rasters because it calls |
I added a |
Thanks @rhijmans, I think splitting it up into model and prediction makes sense and like the idea of adding a dedicated
library(terra)
#> Warning: package 'terra' was built under R version 4.3.2
#> terra 1.7.62
rasterPCA <- function(x, nSamples = NULL, nComp = nlyr(x), spca = FALSE, maskCheck = TRUE, ...){
if(nlyr(x) <= 1) stop("Need at least two layers to calculate PCA.")
ellip <- list(...)
if(nComp > nlyr(x)) nComp <- nlyr(x)
if(!is.null(nSamples)){
trainData <- spatSample(x, size = nSamples, method="random", replace=FALSE, na.rm = TRUE)
if(nrow(trainData) < nlyr(x)) stop("nSamples too small or x contains a layer with NAs only")
model <- princomp(trainData, scores = FALSE, cor = spca)
} else {
if(maskCheck) {
totalMask <- !sum(app(x, is.na))
if(global(totalMask, sum)$sum == 0) stop("x contains either a layer with NAs only or no single pixel with valid values across all layers")
x <- mask(x, totalMask , maskvalue = 0) ## NA areas must be masked from all layers, otherwise the covariance matrix is not non-negative definite
}
covMat <- layerCor(x, fun = "cov", na.rm = TRUE)
model <- princomp(covmat = covMat[[1]], cor=spca)
model$center <- diag(covMat$mean)
model$n.obs <- global(!any(is.na(x)), sum)$sum
if(spca) {
## Calculate scale as population sd like in in princomp
S <- diag(covMat$covariance)
model$scale <- sqrt(S * (model$n.obs-1)/model$n.obs)
}
}
## Predict
out<- terra::predict(x, model = model, na.rm = TRUE, index = 1:nComp, wrArgs = ellip)
names(out) <- paste0("PC", 1:nComp)
return(structure(list(call = match.call(), model = model, map = out)))
}
princomp_edit<- function(x, cor=FALSE, ...) {
stopifnot(nlyr(x) > 1)
xcov <- layerCor(x, fun="cov", na.rm=TRUE)
model <- princomp(covmat = xcov$covariance, cor=cor, ...)
model$center <- diag(xcov$mean)
if (cor) {
## Calculate scale as population sd like in in princomp
S <- diag(xcov$covariance)
n <- diag(xcov$n)
model$scale <- sqrt(S * (n-1) / n)
}
model
}
#Create Data
r<- rast(nrow=40, ncol=40, nlyrs=10)
set.seed(5)
values(r)<- rnorm(ncell(r)*nlyr(r))
values(r)[sample(1:(ncell(r)*nlyr(r)), size = 1000, replace = FALSE)]<- NA #Add NA's
r_df<- as.data.frame(r)
r_df<- na.omit(r_df) # If you do not omit NA's from the dataframe, you get Error in cov.wt(z) : 'x' must contain finite values only
# PCA cor=FALSE
m1<- princomp(r_df) # This is on a dataframe so it serves as a reference to the correct output
m1$n.obs
#> [1] 859
m1$sdev
#> Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
#> 1.0826671 1.0704703 1.0429269 1.0255518 1.0028366 0.9855091 0.9820035 0.9674348
#> Comp.9 Comp.10
#> 0.9487587 0.9176803
m2<- rasterPCA(r)$model
m2$n.obs
#> [1] 859
m2$sdev
#> Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
#> 1.0832979 1.0710939 1.0435344 1.0261493 1.0034208 0.9860832 0.9825756 0.9679984
#> Comp.9 Comp.10
#> 0.9493114 0.9182149
m3<- terra::princomp(r)
m3$n.obs
#> [1] NA
m3$sdev
#> Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
#> 1.0745181 1.0494377 1.0409172 1.0199349 1.0037053 1.0022756 0.9830014 0.9800315
#> Comp.9 Comp.10
#> 0.9710888 0.9527138
# PCA cor=TRUE
m4<- terra::princomp(r, cor=TRUE)
m4$sdev==m3$sdev #cor= TRUE did not change result
#> Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
#> TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
m5<- princomp_edit(r, cor=TRUE)
m5$sdev==m3$sdev #Edited version properly passes along cor argument
#> Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
#> FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE Created on 2023-12-01 with reprex v2.0.2 |
Yes, thanks, fixed that.
The only other meaningful argument is
I do not think that is strictly necessary. Our best estimate of the covariance matrix would use all pairs of data that are not NA. We can add a note in the manual that you can do It may affect the computation of the scale (if
I put it back in. However, there is not one |
Thank you! |
@rhijmans, although |
I would prefer to use The population covariance can be computed with
I think it generally makes more sense to to treat raster data as the population and not as a sample. It could also be an option that may lead to oh so small differences for small rasters. Anyway, this is confusing stuff, and I would appreciate your guidance. |
@rhijmans, since the "scale" should be the standard deviation and the S (the diagonal of the covariance matrix) is the variance, the square root of that would be appropriate. The pairwise n's must still be used to calculate the covariance matrix so it's not really circumventing that, but we it does avoid multiplying by That being said, I think it is slightly better as you suggested to use I am unsure however why results of library(terra)
#> terra 1.7.62
cov_pop <- function(data) {
# Calculate the means of each variable
means <- colMeans(data)
# Number of observations
n_obs <- nrow(data)
# Number of variables
n_vars <- ncol(data)
# Initialize an empty covariance matrix
covariance_matrix <- matrix(0, nrow = n_vars, ncol = n_vars)
# Calculate the covariance for each pair of variables
for (i in 1:n_vars) {
for (j in 1:n_vars) {
covariance_matrix[i, j] <- sum((data[, i] - means[i]) * (data[, j] - means[j])) / n_obs
}
}
return(covariance_matrix)
}
r<- rast(nrow=40, ncol=40, nlyrs=10)
set.seed(5)
values(r)<- rnorm(ncell(r)*nlyr(r))
r_df<- as.data.frame(r)
n<- ncell(r) #No NA's in this example
cov_samp_terra<- layerCor(r, fun="cov", na.rm=TRUE, asSamle=TRUE)
cov_pop_terra<- layerCor(r, fun="cov", na.rm=TRUE, asSamle=FALSE)
cov_samp_R<- cov(r_df)
cov_pop_R<- cov_pop(r_df)
# Terra has differences on order of e-05
max((cov_samp_terra$covariance*(n-1)/n)-cov_pop_terra$covariance)
#> [1] 3.426207e-05
# R has differences on order of e-16
max((cov_samp_R*(n-1)/n)-cov_pop_R)
#> [1] 2.220446e-16 Created on 2023-12-06 with reprex v2.0.2 |
or with your approach using
|
Yes, that is the |
Oh that was dumb of me 🤦♂️🤦♂️🤦♂️. Running it again, it all looks good! Since you created the library(terra)
#> terra 1.7.62
#Create Data
r<- rast(nrow=40, ncol=40, nlyrs=10)
set.seed(5)
values(r)<- rnorm(ncell(r)*nlyr(r))
values(r)[sample(1:(ncell(r)*nlyr(r)), size = 1000, replace = FALSE)]<- NA #Add NA's
cov_samp_terra<- layerCor(r, fun="cov", na.rm=TRUE, asSample=TRUE)
cov_pop_terra<- layerCor(r, fun="cov", na.rm=TRUE, asSample=FALSE)
n_mat<- cov_samp_terra$n
n<- diag(n_mat)
max((cov_samp_terra$covariance*(n_mat-1)/n_mat) - cov_pop_terra$covariance)
#> [1] 6.938894e-18
n # n is a vector of pairwise sample sizes corresponding to the length of S
#> [1] 1497 1497 1509 1496 1493 1489 1506 1505 1507 1501
S1<- diag(cov_samp_terra$covariance)
scale1<- sqrt(S1*(n-1)/n)
S2<- diag(cov_pop_terra$covariance)
scale2<- sqrt(S2)
scale1==scale2
#> lyr.1 lyr.2 lyr.3 lyr.4 lyr.5 lyr.6 lyr.7 lyr.8 lyr.9 lyr.10
#> TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE Created on 2023-12-06 with reprex v2.0.2 |
Thanks so much for adding this! |
Thank you! I have changed argument |
@rhijmans, an added bonus is I believe this also fixed a bug in Older Versionlibrary(terra)
#> Warning: package 'terra' was built under R version 4.3.2
#> terra 1.7.62
r<- rast(nrow=40, ncol=40, nlyrs=2)
set.seed(5)
values(r[[1]])<- rnorm(ncell(r))
values(r[[2]])<- values(r[[1]])*2+rnorm(ncell(r))
cov_full<- layerCor(r, fun="cov", maxcell=Inf)
cov_half<- layerCor(r, fun="cov", maxcell=ncell(r)/2)
cov_half$covariance
#> lyr.1 lyr.2
#> lyr.1 0.541334 1.058303
#> lyr.2 1.058303 2.621351
cov_full$covariance
#> lyr.1 lyr.2
#> lyr.1 1.019472 2.029023
#> lyr.2 2.029023 5.057638
cov_half$covariance/cov_full$covariance
#> lyr.1 lyr.2
#> lyr.1 0.5309946 0.5215826
#> lyr.2 0.5215826 0.5182955
df_half<- spatSample(r, 800, method="regular")
cov(df_half)
#> lyr.1 lyr.2
#> lyr.1 1.030468 2.014555
#> lyr.2 2.014555 4.989929 Created on 2023-12-07 with reprex v2.0.2 Latest Commitlibrary(terra)
#> terra 1.7.62
r<- rast(nrow=40, ncol=40, nlyrs=2)
set.seed(5)
values(r[[1]])<- rnorm(ncell(r))
values(r[[2]])<- values(r[[1]])*2+rnorm(ncell(r))
cov_full<- layerCor(r, fun="cov", maxcell=Inf)
cov_half<- layerCor(r, fun="cov", maxcell=ncell(r)/2)
cov_half$covariance
#> lyr.1 lyr.2
#> lyr.1 1.030468 2.014555
#> lyr.2 2.014555 4.989929
cov_full$covariance
#> lyr.1 lyr.2
#> lyr.1 1.019472 2.029023
#> lyr.2 2.029023 5.057638
cov_half$covariance/cov_full$covariance
#> lyr.1 lyr.2
#> lyr.1 1.0107862 0.9928697
#> lyr.2 0.9928697 0.9866125
df_half<- spatSample(r, 800, method="regular")
cov(df_half)
#> lyr.1 lyr.2
#> lyr.1 1.030468 2.014555
#> lyr.2 2.014555 4.989929 Created on 2023-12-07 with reprex v2.0.2 |
Since "cov" and |
I did not include the It is interesting to see how few you cells you need with this (unrealistically simple) example
|
@rhijmans, that for adding this function -- I think it will be helpful for many. Do you also plan to add some updates to library(terra)
#> terra 1.7.65
f <- system.file("ex/logo.tif", package = "terra")
r <- rast(f)
r[100] <- NA
# works
pca <- princomp(r)
# fails (due to NA's)
pca2 <- prcomp(r)
#> Error in svd(x, nu = 0, nv = k): infinite or missing values in 'x' From the princomp help file (
|
@Nowosad, the point about |
If there are no NAs in SpatRaster
Otherwise you need something like
I could add a |
@rhijmans @Nowosad, it could be useful to add a |
@ailich thanks for the explanation -- I was suspecting something like this, but was unable to find any discussion on the topic. And yes -- my thinking was to make |
I used to use
RSToolbox
for conducting PCA's on raster data, but it relies on theraster
package and is no longer actively maintained (I actually can't get it to install anymore). Since PCA is a common procedure it might be nice to have directly interra
like how kmeans was recently added. I've taken the code from their function and converted it toterra
code and it appears to work with the development version.The text was updated successfully, but these errors were encountered: