In [1]:
library(Matrix)
library(qlcMatrix)
set.seed(42)
#' Replace non-zero entries in a sparse entries with non-zero ranks
#'
#' This method creates a rank matrix for a sparse matrix X using the following approach:
#' 1. Use non-zero enries in a column to calculate the ranks
#' 2. Add (z-1)/2 to the ranks (only non-zero entries are changed). z is the number of zeros
#' in the column
#' Since all the entries are shifted by the same constant (the zeros
#' are already shifted), the covariance matrix of this shifted matrix is
#' the same as the rank matrix of the entire matrix (where the zeros would
#' all also have a rank = (z+1)/2) where z is the number of zeros
#'
#' This rank matrix can then be used to calculate pearson correlation
#' (pearson correlation )
SparsifiedRanks <- function(X){
  X <- as(object = X, Class = "dgCMatrix")
  j <- summary(object = X)$j
  n_zeros_per_col <- nrow(X) - diff(X@p)
  offsets <- (n_zeros_per_col - 1)/2

  for (column in unique(x = j)){
    non_zero_element_index <- which(j == column)
    elements_along_row <- X@x[non_zero_element_index]
    X@x[non_zero_element_index] <- rank(elements_along_row) + offsets[column]
    #X@x[non_zero_element_index] <- frank(elements_along_row) + offsets[column]
  }
  return(X)
}

#' @importFrom qlcMatrix corSparse
#'
SparseSpearmanCor <- function(X, Y = NULL, cov = FALSE) {

  # Get sparsified ranks
  rankX <- SparsifiedRanks(X)
  if (is.null(Y)){
    # Calculate pearson correlation on rank matrices
    return (corSparse(X=rankX, cov=cov))
    }
  rankY <- SparsifiedRanks(Y)
  return(corSparse(X = rankX, Y = rankY, cov = cov))
}

Loading required package: slam

Loading required package: sparsesvd



In [2]:
n_peaks <- 5000
n_cells <- 1000

# Create a sparse matrix, mostly 1s
X.dense <- replicate(n_cells, rbinom(n = n_peaks, size = 1, prob = 0.1))
X.sparse <- Matrix::Matrix(X.dense, sparse = TRUE)


# do the densification inside
system.time(corX.dense <- cor(as.matrix(X.sparse), method="spearman"))


   user  system elapsed 
  3.229   0.064   3.314 

In [3]:
system.time(corX.sparse <- SparseSpearmanCor(X.sparse))

   user  system elapsed 
  2.254   1.096   3.322 

In [4]:
norm(corX.dense - corX.sparse, type = "2")

In [5]:
n_genes <- 15000
n_cells <- 1000
Y.dense <- replicate(n_cells, rbinom(n = n_genes, size = 1, prob = 0.15))
Y.sparse <- Matrix::Matrix(Y.dense, sparse = TRUE)

system.time(corXY.dense <- cor(as.matrix(t(X.sparse)), as.matrix(t(Y.sparse)), method="spearman"))


   user  system elapsed 
 82.993   0.955  85.764 

In [6]:
system.time(corXY.sparse <- SparseSpearmanCor(t(X.sparse), t(Y.sparse)))


   user  system elapsed 
141.545  40.880 179.208 

In [7]:
norm(corXY.dense - corXY.sparse, type = "2")