In [1]:
library(Matrix)
library(qlcMatrix)
set.seed(42)
#' Replace non-zero entries in a sparse entries with non-zero ranks
#'
#' This method creates a rank matrix for a sparse matrix X using the following approach:
#' 1. Use non-zero enries in a column to calculate the ranks
#' 2. Add (z-1)/2 to the ranks (only non-zero entries are changed). z is the number of zeros
#' in the column
#' Since all the entries are shifted by the same constant (the zeros
#' are already shifted), the covariance matrix of this shifted matrix is
#' the same as the rank matrix of the entire matrix (where the zeros would
#' all also have a rank = (z+1)/2) where z is the number of zeros
#'
#' This rank matrix can then be used to calculate pearson correlation
#' (pearson correlation )
SparsifiedRanks <- function(X) {
  X <- as(object = X, Class = "dgCMatrix")
  j <- summary(object = X)$j
  n_zeros_per_col <- nrow(X) - diff(X@p)

  for (column in unique(x = j)) {
    non_zero_element_index <- which(j == column)
    elements_along_row <- X@x[non_zero_element_index]
    ranks <- rank(elements_along_row)
    ranks <- ranks + (n_zeros_per_col[column] - 1) / 2
    X@x[non_zero_element_index] <- ranks
  }
  return(X)
}
SparseSpearmanCor <- function(X, Y = NULL, cov = FALSE) {

  # Get sparsified ranks
  rankX <- SparsifiedRanks(X)
  if (is.null(Y)){
    # Calculate pearson correlation on rank matrices
    return (corSparse(X=rankX, cov=cov))
    }
  rankY <- SparsifiedRanks(Y)
  return(corSparse( X = rankX, Y = rankY, cov = cov))
}

########################
SparsifiedRanks2 <- function(X) {
  if (class(X.sparse)[1] != "dgCMatrix") {
    X <- as(object = X, Class = "dgCMatrix")
  }
  non_zeros_per_col <- diff(x = X@p)
  n_zeros_per_col <- nrow(x = X) - non_zeros_per_col
  offsets <- (n_zeros_per_col - 1) / 2
  x <- X@x
  ## split entries to columns
  col_lst <- split(x = x, f = rep.int(1:ncol(X), non_zeros_per_col))
  ## calculate sparsified ranks and do shifting
  sparsified_ranks <- unlist(x = lapply(X = seq_along(col_lst), 
                                        FUN = function(i) rank(x = col_lst[[i]]) + offsets[i]))
  ## Create template rank matrix
  X.ranks <- X
  X.ranks@x <- sparsified_ranks
  return(X.ranks)
}



SparseSpearmanCor2 <- function(X, Y = NULL, cov = FALSE) {

  # Get sparsified ranks
  rankX <- SparsifiedRanks2(X)
  if (is.null(Y)){
    # Calculate pearson correlation on rank matrices
    return (corSparse(X=rankX, cov=cov))
    }
  rankY <- SparsifiedRanks2(Y)
  return(corSparse( X = rankX, Y = rankY, cov = cov))
}

Loading required package: slam

Loading required package: sparsesvd



In [2]:
n_peaks <- 10000
n_cells <- 1000
X.dense <- replicate(n_cells, rbinom(n = n_peaks, size = 1, prob = 0.1))
X.sparse <- Matrix::Matrix(X.dense, sparse = TRUE)


n_peaks <- 5000
n_cells <- 1000
Y.dense <- replicate(n_cells, rbinom(n = n_peaks, size = 1, prob = 0.1))
Y.sparse <- Matrix::Matrix(Y.dense, sparse = TRUE)

dim(X.sparse)
dim(Y.sparse)

In [3]:
system.time(corXY.dense <- cor(as.matrix(Matrix::t(X.sparse)), as.matrix(Matrix::t(Y.sparse)), method = "spearman"))


   user  system elapsed 
 56.691   0.831  59.200 

In [4]:
system.time(corXY.sparse <- SparseSpearmanCor(Matrix::t(X.sparse), Matrix::t(Y.sparse)))


   user  system elapsed 
 53.154  30.470  82.593 

In [5]:
system.time(corXY.sparse2 <- SparseSpearmanCor2(Matrix::t(X.sparse), Matrix::t(Y.sparse)))


   user  system elapsed 
  5.821   0.766   4.267 

In [6]:
norm(corXY.dense - corXY.sparse, type = "2")


In [7]:
norm(corXY.dense - corXY.sparse2, type = "2")
