library(here)
library(purrr)
library(stringr)
library(readr)
library(djvdj)
library(dplyr)
library(Seurat)
library(qs)
After downloading the GEX and VDJ files, first pull the GEX files based on the file suffix.
file_dir <- here("data-old/rimanpreetkaur")
.get_sample_key <- function(x) {
x %>%
basename() %>%
str_extract("(?<=GSM[0-9]{7}_)[^_]+")
}
# Pull file names to load
h5s <- dir(file_dir, pattern = "\\.h5$", full.names = TRUE)
names(h5s) <- .get_sample_key(h5s)
h5s %>%
map_chr(basename)
## blood1
## "GSM5830281_blood1_filtered_feature_bc_matrix.h5"
## liver1
## "GSM5830282_liver1_filtered_feature_bc_matrix.h5"
## blood2.A
## "GSM5830283_blood2.A_filtered_feature_bc_matrix.h5"
## blood2.B
## "GSM5830284_blood2.B_filtered_feature_bc_matrix.h5"
## liver2.A
## "GSM5830285_liver2.A_filtered_feature_bc_matrix.h5"
## liver2.B
## "GSM5830286_liver2.B_filtered_feature_bc_matrix.h5"
## blood3.A
## "GSM5830287_blood3.A_filtered_feature_bc_matrix.h5"
## blood3.B
## "GSM5830288_blood3.B_filtered_feature_bc_matrix.h5"
## liver3.A
## "GSM5830289_liver3.A_filtered_feature_bc_matrix.h5"
## liver3.B
## "GSM5830290_liver3.B_filtered_feature_bc_matrix.h5"
## liver4
## "GSM5830291_liver4_filtered_feature_bc_matrix.h5"
Create and merge Seurat objects for each GEX sample, set a sample name based on the file names.
# Create Seurat object
h5s <- h5s %>%
imap(~ {
.x %>%
Read10X_h5() %>%
CreateSeuratObject(project = .y)
})
so <- merge(h5s[[1]], h5s[-1], add.cell.ids = names(h5s))
so
## An object of class Seurat
## 33538 features across 44043 samples within 1 assay
## Active assay: RNA (33538 features, 0 variable features)
Normally import_vdj()
expects a single sample per file.
However, some of the files uploaded to GEO include multiple samples per
file with different cell barcode suffixes. This will occur if some of
the files were processed separately using
cellranger aggr
.
import_vdj()
can load output files from
cellranger aggr
(using the aggr_dir
argument),
but cannot easily load a mix of files where some were processed with
aggr
and others not.
To load these files into your Seurat object, first split each file based on the cell barcode suffix (-1 or -2) and write new files.
# Split VDJ contig files that contain multiple samples
new_dir <- here(file_dir, "new_vdj_files")
vdj_files <- dir(file_dir, pattern = "\\.csv.gz", full.names = TRUE)
names(vdj_files) <- .get_sample_key(vdj_files)
vdj_files %>%
iwalk(~ {
file_nm <- basename(.x)
nm <- .y
df <- read_csv(.x, progress = FALSE, )
df <- df %>%
mutate(sfx = str_extract(barcode, "-[0-9]+$")) %>%
split(.$sfx)
if (length(df) > 1) {
new_nms <- LETTERS[seq_along(df)]
new_nms <- new_nms %>%
map_chr(~ {
file_nm %>%
str_replace(nm, str_c(nm, ".", .x))
})
names(df) <- new_nms
df %>%
iwalk(~ {
dir_nm <- str_remove(.y, ".csv.gz$")
new_dir <- here(new_dir, dir_nm)
dir.create(new_dir)
.x %>%
dplyr::select(-sfx) %>%
write_csv(here(new_dir, "filtered_contig_annotations.csv.gz"))
})
} else {
dir_nm <- basename(str_remove(.x, ".csv.gz$"))
new_dir <- here(new_dir, dir_nm)
dir.create(new_dir)
file.symlink(.x, here(new_dir, "filtered_contig_annotations.csv.gz"))
}
})
new_files <- dir(new_dir, full.names = TRUE)
names(new_files) <- .get_sample_key(new_files)
new_files %>%
map_chr(basename)
## blood1
## "GSM5830292_blood1_filtered_contig_annotations"
## liver1
## "GSM5830293_liver1_filtered_contig_annotations"
## blood2.A
## "GSM5830294_blood2.A_filtered_contig_annotations"
## blood2.B
## "GSM5830294_blood2.B_filtered_contig_annotations"
## liver2.A
## "GSM5830296_liver2.A_filtered_contig_annotations"
## liver2.B
## "GSM5830296_liver2.B_filtered_contig_annotations"
## blood3.A
## "GSM5830298_blood3.A_filtered_contig_annotations"
## blood3.B
## "GSM5830298_blood3.B_filtered_contig_annotations"
## liver3.A
## "GSM5830300_liver3.A_filtered_contig_annotations"
## liver3.B
## "GSM5830300_liver3.B_filtered_contig_annotations"
## liver4
## "GSM5830302_liver4_filtered_contig_annotations"
After splitting each VDJ file, add the revised VDJ files to the Seurat object. Be sure to set the vector names to match the GEX sample names that were used to create the object.
##
ℹ Loading V(D)J data
✔ Loading V(D)J data [5.2s]
##
ℹ Formatting V(D)J data
✔ Formatting V(D)J data [6.5s]
## ────────────────────────────────────────────────────────────────────────────────
## # cells # VDJ # paired # overlap % overlap
## ✔ blood1_ | 44043 | 3877 | 3192 | 3871 | 100%
## ✔ liver1_ | 44043 | 3429 | 2730 | 3416 | 100%
## ✔ blood2.A_ | 44043 | 3836 | 3099 | 3830 | 100%
## ✔ blood2.B_ | 44043 | 3826 | 3105 | 3824 | 100%
## ✔ liver2.A_ | 44043 | 2793 | 2279 | 2787 | 100%
## ✔ liver2.B_ | 44043 | 2880 | 2302 | 2875 | 100%
## ✔ blood3.A_ | 44043 | 4023 | 3160 | 4020 | 100%
## ✔ blood3.B_ | 44043 | 3987 | 3114 | 3978 | 100%
## ✔ liver3.A_ | 44043 | 3149 | 2511 | 3143 | 100%
## ✔ liver3.B_ | 44043 | 2992 | 2351 | 2985 | 100%
## ✔ liver4_ | 44043 | 2412 | 1797 | 2402 | 100%
## ────────────────────────────────────────────────────────────────────────────────
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Denver
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] qs_0.25.5 SeuratObject_4.1.4 Seurat_4.4.0 dplyr_1.1.3
## [5] djvdj_0.1.0 readr_2.1.4 stringr_1.5.0 purrr_1.0.2
## [9] here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.15.0 jsonlite_1.8.7
## [4] magrittr_2.0.3 spatstat.utils_3.0-3 rmarkdown_2.25
## [7] vctrs_0.6.3 ROCR_1.0-11 spatstat.explore_3.2-3
## [10] htmltools_0.5.6 sass_0.4.7 sctransform_0.4.0
## [13] parallelly_1.36.0 KernSmooth_2.23-21 bslib_0.5.1
## [16] htmlwidgets_1.6.2 ica_1.0-3 plyr_1.8.8
## [19] plotly_4.10.2 zoo_1.8-12 cachem_1.0.8
## [22] igraph_1.5.1 mime_0.12 lifecycle_1.0.3
## [25] pkgconfig_2.0.3 Matrix_1.6-1.1 R6_2.5.1
## [28] fastmap_1.1.1 fitdistrplus_1.1-11 future_1.33.0
## [31] shiny_1.7.5 digest_0.6.33 colorspace_2.1-0
## [34] patchwork_1.1.3 rprojroot_2.0.3 tensor_1.5
## [37] irlba_2.3.5.1 progressr_0.14.0 fansi_1.0.4
## [40] spatstat.sparse_3.0-2 httr_1.4.7 polyclip_1.10-6
## [43] abind_1.4-5 compiler_4.3.1 withr_2.5.1
## [46] bit64_4.0.5 MASS_7.3-60 tools_4.3.1
## [49] lmtest_0.9-40 httpuv_1.6.11 future.apply_1.11.0
## [52] goftest_1.2-3 glue_1.6.2 nlme_3.1-162
## [55] promises_1.2.1 grid_4.3.1 Rtsne_0.16
## [58] cluster_2.1.4 reshape2_1.4.4 generics_0.1.3
## [61] hdf5r_1.3.8 gtable_0.3.4 spatstat.data_3.0-1
## [64] tzdb_0.4.0 tidyr_1.3.0 data.table_1.14.8
## [67] RApiSerialize_0.1.2 hms_1.1.3 sp_2.0-0
## [70] stringfish_0.15.8 utf8_1.2.3 spatstat.geom_3.2-5
## [73] RcppAnnoy_0.0.21 ggrepel_0.9.3 RANN_2.6.1
## [76] pillar_1.9.0 vroom_1.6.3 later_1.3.1
## [79] splines_4.3.1 lattice_0.21-8 survival_3.5-5
## [82] bit_4.0.5 deldir_1.0-9 tidyselect_1.2.0
## [85] miniUI_0.1.1.1 pbapply_1.7-2 knitr_1.44
## [88] gridExtra_2.3 scattermore_1.2 xfun_0.40
## [91] matrixStats_1.0.0 stringi_1.7.12 lazyeval_0.2.2
## [94] yaml_2.3.7 evaluate_0.22 codetools_0.2-19
## [97] tibble_3.2.1 cli_3.6.1 uwot_0.1.16
## [100] RcppParallel_5.1.7 xtable_1.8-4 reticulate_1.32.0
## [103] munsell_0.5.0 jquerylib_0.1.4 Rcpp_1.0.11
## [106] globals_0.16.2 spatstat.random_3.1-6 png_0.1-8
## [109] parallel_4.3.1 ellipsis_0.3.2 ggplot2_3.4.3
## [112] listenv_0.9.0 viridisLite_0.4.2 scales_1.2.1
## [115] ggridges_0.5.4 crayon_1.5.2 leiden_0.4.3
## [118] rlang_1.1.1 cowplot_1.1.1