Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare Seurat for CellRanger 3.0 #708

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 67 additions & 21 deletions R/preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -152,26 +152,38 @@ CreateSeuratObject <- function(
#'
#' Enables easy loading of sparse data matrices provided by 10X genomics.
#'
#' @param data.dir Directory containing the matrix.mtx, genes.tsv, and barcodes.tsv
#' @param data.dir Directory containing the matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv
#' files provided by 10X. A vector or named vector can be given in order to load
#' several data directories. If a named vector is given, the cell barcode names
#' will be prefixed with the name.
#' @param gene.column Specify which column of genes.tsv or features.tsv to use for gene names; default is 2
#'
#' @return Returns a sparse matrix with rows and columns labeled
#'
#' @return If features.csv indicates the data has multiple data types, a list
#' containing a sparse matrix of the data from each type will be returned.
#' Otherwise a sparse matrix containing the expression data will be returned.
#'
#' @importFrom Matrix readMM
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # For output from CellRanger < 3.0
#' data_dir <- 'path/to/data/directory'
#' list.files(data_dir) # Should show barcodes.tsv, genes.tsv, and matrix.mtx
#' expression_matrix <- Read10X(data.dir = data_dir)
#' seurat_object = CreateSeuratObject(raw.data = expression_matrix)
#'
#' # For output from CellRanger >= 3.0 with multiple data types
#' data_dir <- 'path/to/data/directory'
#' list.files(data_dir) # Should show barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz
#' data <- Read10X(data.dir = data_dir)
#' seurat_object = CreateSeuratObject(raw.data = data$`Gene Expression`)
#' seurat_object = SetAssayData(seurat_object, assay.type = "Protein", slot = "raw.data", new.data = data$`Antibody Capture`)
#' }
#'
Read10X <- function(data.dir = NULL){
Read10X <- function(data.dir = NULL,
gene.column = 2){
full.data <- list()
for (i in seq_along(data.dir)) {
run <- data.dir[i]
Expand All @@ -183,19 +195,26 @@ Read10X <- function(data.dir = NULL){
}
barcode.loc <- paste0(run, "barcodes.tsv")
gene.loc <- paste0(run, "genes.tsv")
features.loc <- paste0(run, "features.tsv.gz")
matrix.loc <- paste0(run, "matrix.mtx")
# Flag to indicate if this data is from CellRanger >= 3.0
pre_ver_3 = file.exists(gene.loc)
if(!pre_ver_3) {
addgz <- function(s) paste0(s, ".gz")
barcode.loc = addgz(barcode.loc)
matrix.loc = addgz(matrix.loc)
}
if (!file.exists(barcode.loc)){
stop("Barcode file missing")
}
if (! file.exists(gene.loc)){
stop("Gene name file missing")
if (!pre_ver_3 && !file.exists(features.loc)){
stop("Gene name or features file missing")
}
if (! file.exists(matrix.loc)){
if (!file.exists(matrix.loc)){
stop("Expression matrix file missing")
}
data <- readMM(file = matrix.loc)
cell.names <- readLines(barcode.loc)
gene.names <- readLines(gene.loc)
if (all(grepl(pattern = "\\-1$", x = cell.names))) {
cell.names <- as.vector(
x = as.character(
Expand All @@ -207,16 +226,6 @@ Read10X <- function(data.dir = NULL){
)
)
}
rownames(x = data) <- make.unique(
names = as.character(
x = sapply(
X = gene.names,
FUN = ExtractField,
field = 2,
delim = "\\t"
)
)
)
if (is.null(x = names(x = data.dir))) {
if(i < 2){
colnames(x = data) <- cell.names
Expand All @@ -227,10 +236,44 @@ Read10X <- function(data.dir = NULL){
} else {
colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names)
}
full.data <- append(x = full.data, values = data)

feature.names = read.delim(ifelse(pre_ver_3, gene.loc, features.loc),
header = FALSE,
stringsAsFactors = FALSE)
rownames(x = data) <- make.unique(feature.names[, gene.column])
# In cell ranger 3.0, a third column specifying the type of data was added
# and we will return each type of data as a separate matrix
if (ncol(feature.names) > 2){
if(length(full.data) ==0) {
message("10X data contains more than one type and is being returned as a list containing matrices of each type.")
}
data_types = factor(feature.names$V3)
lvls = levels(data_types)
expr_name = "Gene Expression"
if(expr_name %in% lvls) { # Return Gene Expression first
lvls = c(expr_name, lvls[-which(lvls==expr_name)])
}
data = lapply(lvls, function(l) data[data_types==l,])
names(data) = lvls
} else{
data = list(data)
}
full.data[[length(full.data)+1]] = data
}
# Combine all the data from different directories into one big matrix, note this
# assumes that all data directories essentially have the same features files
list_of_data=list()
for(j in 1:length(full.data[[1]])) {
list_of_data[[j]] = do.call(cbind, lapply(full.data, `[[`, j))
}
names(list_of_data) = names(full.data[[1]])
# If multiple features, will return a list, otherwise
# a matrix.
if(length(list_of_data)==1) {
return(list_of_data[[1]])
} else {
return(list_of_data)
}
full.data <- do.call(cbind, full.data)
return(full.data)
}

#' Read 10X hdf5 file
Expand All @@ -252,6 +295,9 @@ Read10X_h5 <- function(filename, ensg.names = FALSE){
stop("File not found")
}
infile <- H5File$new(filename)
if(!infile$attr_exists("PYTABLES_FORMAT_VERSION")) {
stop("Only older (pre-3.0) 10X hdf5 files are supported.")
}
genomes <- names(infile)
output <- list()
for(genome in genomes){
Expand Down
5 changes: 5 additions & 0 deletions R/preprocessing_internal.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ RegressOutResid <- function(
if (use.umi) {
data.use <- object@raw.data[genes.regress, object@cell.names, drop = FALSE]
}

if(anyNA(latent.data) | anyNA(data.use)) {
stop("Data used in regression cannot contain NA values.")
}

# input checking for parallel options
if (do.par) {
if (num.cores == 1) {
Expand Down
18 changes: 15 additions & 3 deletions man/Read10X.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file added tests/testdata/cr3.0/barcodes.tsv.gz
Binary file not shown.
Binary file added tests/testdata/cr3.0/features.tsv.gz
Binary file not shown.
Binary file added tests/testdata/cr3.0/matrix.mtx.gz
Binary file not shown.
27 changes: 27 additions & 0 deletions tests/testthat/test_load_10X.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
context("Read10X")
# These tests were added to ensure Seurat was forwards and backwards compatible for 3.0 data

dname = "../testdata/cr3.0"
test.data <- Read10X(dname)
test.data2 <- Read10X(c(dname, dname))

test_that("Cell Ranger 3.0 Data Parsing", {
expect_is(test.data, "list")
expect_equal(ncol(test.data$`Gene Expression`), .5 * ncol(test.data2$`Gene Expression`))
expect_equal(ncol(test.data$`Antibody Capture`), .5 * ncol(test.data2$`Antibody Capture`))
expect_equal(colnames(test.data2[[1]])[6], "2_AAAGTAGCACAGTCGC")
expect_equal(test.data$`Gene Expression`[2,2], 1000)
})

# Tests of Pre-3.0 Data
test.data3 <- Read10X("../testdata/")
test_that("Read10X creates sparse matrix", {
expect_is(test.data3, "dgTMatrix")
expect_equal(colnames(test.data3)[1], "ATGCCAGAACGACT")
expect_equal(rownames(test.data3)[1], "MS4A1")
})

test_that("Read10X handles missing files properly", {
expect_error(Read10X("."))
expect_error(Read10X("./notadir/"))
})
14 changes: 0 additions & 14 deletions tests/testthat/test_preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,6 @@ test_that("Genes are filtered out based on min.cells", {
expect_equal(nrow(object@data), 162)
})

# Tests for Read10X
# --------------------------------------------------------------------------------
context("Read10X")

test_that("Read10X handles missing files properly", {
expect_error(Read10X("."))
expect_error(Read10X("./notadir/"))
})

test.data <- Read10X("../testdata/")
test_that("Read10X creates sparse matrix", {
expect_is(test.data, "dgTMatrix")
})


# Tests for NormalizeData
# --------------------------------------------------------------------------------
Expand Down