Load GEX data

library(here)
library(purrr)
library(stringr)
library(readr)
library(djvdj)
library(dplyr)
library(Seurat)
library(qs)


After downloading the GEX and VDJ files, first pull the GEX files based on the file suffix.

file_dir <- here("data-old/rimanpreetkaur")

.get_sample_key <- function(x) {
  x %>%
    basename() %>%
    str_extract("(?<=GSM[0-9]{7}_)[^_]+")
}

# Pull file names to load
h5s <- dir(file_dir, pattern = "\\.h5$", full.names = TRUE)

names(h5s) <- .get_sample_key(h5s)

h5s %>%
  map_chr(basename)
##                                              blood1 
##   "GSM5830281_blood1_filtered_feature_bc_matrix.h5" 
##                                              liver1 
##   "GSM5830282_liver1_filtered_feature_bc_matrix.h5" 
##                                            blood2.A 
## "GSM5830283_blood2.A_filtered_feature_bc_matrix.h5" 
##                                            blood2.B 
## "GSM5830284_blood2.B_filtered_feature_bc_matrix.h5" 
##                                            liver2.A 
## "GSM5830285_liver2.A_filtered_feature_bc_matrix.h5" 
##                                            liver2.B 
## "GSM5830286_liver2.B_filtered_feature_bc_matrix.h5" 
##                                            blood3.A 
## "GSM5830287_blood3.A_filtered_feature_bc_matrix.h5" 
##                                            blood3.B 
## "GSM5830288_blood3.B_filtered_feature_bc_matrix.h5" 
##                                            liver3.A 
## "GSM5830289_liver3.A_filtered_feature_bc_matrix.h5" 
##                                            liver3.B 
## "GSM5830290_liver3.B_filtered_feature_bc_matrix.h5" 
##                                              liver4 
##   "GSM5830291_liver4_filtered_feature_bc_matrix.h5"


Create and merge Seurat objects for each GEX sample, set a sample name based on the file names.

# Create Seurat object
h5s <- h5s %>%
  imap(~ {
    .x %>%
      Read10X_h5() %>%
      CreateSeuratObject(project = .y)
  })

so <- merge(h5s[[1]], h5s[-1], add.cell.ids = names(h5s))

so
## An object of class Seurat 
## 33538 features across 44043 samples within 1 assay 
## Active assay: RNA (33538 features, 0 variable features)


Load VDJ data

Normally import_vdj() expects a single sample per file. However, some of the files uploaded to GEO include multiple samples per file with different cell barcode suffixes. This will occur if some of the files were processed separately using cellranger aggr.

import_vdj() can load output files from cellranger aggr (using the aggr_dir argument), but cannot easily load a mix of files where some were processed with aggr and others not.

To load these files into your Seurat object, first split each file based on the cell barcode suffix (-1 or -2) and write new files.

# Split VDJ contig files that contain multiple samples
new_dir <- here(file_dir, "new_vdj_files")

vdj_files <- dir(file_dir, pattern = "\\.csv.gz", full.names = TRUE)

names(vdj_files) <- .get_sample_key(vdj_files)

vdj_files %>%
  iwalk(~ {
    file_nm <- basename(.x)
    nm      <- .y
    df      <- read_csv(.x, progress = FALSE, )
    
    df <- df %>%
      mutate(sfx = str_extract(barcode, "-[0-9]+$")) %>%
      split(.$sfx)
    
    if (length(df) > 1) {
      new_nms <- LETTERS[seq_along(df)]
      
      new_nms <- new_nms %>%
        map_chr(~ {
          file_nm %>%
            str_replace(nm, str_c(nm, ".", .x))
        })
      
      names(df) <- new_nms
      
      df %>%
        iwalk(~ {
          dir_nm  <- str_remove(.y, ".csv.gz$")
          new_dir <- here(new_dir, dir_nm)
          
          dir.create(new_dir)
          
          .x %>%
            dplyr::select(-sfx) %>%
            write_csv(here(new_dir, "filtered_contig_annotations.csv.gz"))
        })
      
    } else {
      dir_nm  <- basename(str_remove(.x, ".csv.gz$"))
      new_dir <- here(new_dir, dir_nm)
      
      dir.create(new_dir)
      
      file.symlink(.x, here(new_dir, "filtered_contig_annotations.csv.gz"))
    }
  })

new_files <- dir(new_dir, full.names = TRUE)

names(new_files) <- .get_sample_key(new_files)

new_files %>%
  map_chr(basename)
##                                            blood1 
##   "GSM5830292_blood1_filtered_contig_annotations" 
##                                            liver1 
##   "GSM5830293_liver1_filtered_contig_annotations" 
##                                          blood2.A 
## "GSM5830294_blood2.A_filtered_contig_annotations" 
##                                          blood2.B 
## "GSM5830294_blood2.B_filtered_contig_annotations" 
##                                          liver2.A 
## "GSM5830296_liver2.A_filtered_contig_annotations" 
##                                          liver2.B 
## "GSM5830296_liver2.B_filtered_contig_annotations" 
##                                          blood3.A 
## "GSM5830298_blood3.A_filtered_contig_annotations" 
##                                          blood3.B 
## "GSM5830298_blood3.B_filtered_contig_annotations" 
##                                          liver3.A 
## "GSM5830300_liver3.A_filtered_contig_annotations" 
##                                          liver3.B 
## "GSM5830300_liver3.B_filtered_contig_annotations" 
##                                            liver4 
##   "GSM5830302_liver4_filtered_contig_annotations"


After splitting each VDJ file, add the revised VDJ files to the Seurat object. Be sure to set the vector names to match the GEX sample names that were used to create the object.

# Load VDJ data
so <- so %>%
  import_vdj(vdj_dir = new_files)
## 
ℹ Loading V(D)J data

✔ Loading V(D)J data [5.2s]
## 
ℹ Formatting V(D)J data

✔ Formatting V(D)J data [6.5s]
## ────────────────────────────────────────────────────────────────────────────────
##               # cells   # VDJ   # paired   # overlap   % overlap 
## ✔   blood1_ |   44043 |  3877 |     3192 |      3871 |      100%
## ✔   liver1_ |   44043 |  3429 |     2730 |      3416 |      100%
## ✔ blood2.A_ |   44043 |  3836 |     3099 |      3830 |      100%
## ✔ blood2.B_ |   44043 |  3826 |     3105 |      3824 |      100%
## ✔ liver2.A_ |   44043 |  2793 |     2279 |      2787 |      100%
## ✔ liver2.B_ |   44043 |  2880 |     2302 |      2875 |      100%
## ✔ blood3.A_ |   44043 |  4023 |     3160 |      4020 |      100%
## ✔ blood3.B_ |   44043 |  3987 |     3114 |      3978 |      100%
## ✔ liver3.A_ |   44043 |  3149 |     2511 |      3143 |      100%
## ✔ liver3.B_ |   44043 |  2992 |     2351 |      2985 |      100%
## ✔   liver4_ |   44043 |  2412 |     1797 |      2402 |      100%
## ────────────────────────────────────────────────────────────────────────────────
# Save object
so %>%
  qsave(here(file_dir, "so.qs"))


Session Info

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Denver
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] qs_0.25.5          SeuratObject_4.1.4 Seurat_4.4.0       dplyr_1.1.3       
## [5] djvdj_0.1.0        readr_2.1.4        stringr_1.5.0      purrr_1.0.2       
## [9] here_1.0.1        
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     rstudioapi_0.15.0      jsonlite_1.8.7        
##   [4] magrittr_2.0.3         spatstat.utils_3.0-3   rmarkdown_2.25        
##   [7] vctrs_0.6.3            ROCR_1.0-11            spatstat.explore_3.2-3
##  [10] htmltools_0.5.6        sass_0.4.7             sctransform_0.4.0     
##  [13] parallelly_1.36.0      KernSmooth_2.23-21     bslib_0.5.1           
##  [16] htmlwidgets_1.6.2      ica_1.0-3              plyr_1.8.8            
##  [19] plotly_4.10.2          zoo_1.8-12             cachem_1.0.8          
##  [22] igraph_1.5.1           mime_0.12              lifecycle_1.0.3       
##  [25] pkgconfig_2.0.3        Matrix_1.6-1.1         R6_2.5.1              
##  [28] fastmap_1.1.1          fitdistrplus_1.1-11    future_1.33.0         
##  [31] shiny_1.7.5            digest_0.6.33          colorspace_2.1-0      
##  [34] patchwork_1.1.3        rprojroot_2.0.3        tensor_1.5            
##  [37] irlba_2.3.5.1          progressr_0.14.0       fansi_1.0.4           
##  [40] spatstat.sparse_3.0-2  httr_1.4.7             polyclip_1.10-6       
##  [43] abind_1.4-5            compiler_4.3.1         withr_2.5.1           
##  [46] bit64_4.0.5            MASS_7.3-60            tools_4.3.1           
##  [49] lmtest_0.9-40          httpuv_1.6.11          future.apply_1.11.0   
##  [52] goftest_1.2-3          glue_1.6.2             nlme_3.1-162          
##  [55] promises_1.2.1         grid_4.3.1             Rtsne_0.16            
##  [58] cluster_2.1.4          reshape2_1.4.4         generics_0.1.3        
##  [61] hdf5r_1.3.8            gtable_0.3.4           spatstat.data_3.0-1   
##  [64] tzdb_0.4.0             tidyr_1.3.0            data.table_1.14.8     
##  [67] RApiSerialize_0.1.2    hms_1.1.3              sp_2.0-0              
##  [70] stringfish_0.15.8      utf8_1.2.3             spatstat.geom_3.2-5   
##  [73] RcppAnnoy_0.0.21       ggrepel_0.9.3          RANN_2.6.1            
##  [76] pillar_1.9.0           vroom_1.6.3            later_1.3.1           
##  [79] splines_4.3.1          lattice_0.21-8         survival_3.5-5        
##  [82] bit_4.0.5              deldir_1.0-9           tidyselect_1.2.0      
##  [85] miniUI_0.1.1.1         pbapply_1.7-2          knitr_1.44            
##  [88] gridExtra_2.3          scattermore_1.2        xfun_0.40             
##  [91] matrixStats_1.0.0      stringi_1.7.12         lazyeval_0.2.2        
##  [94] yaml_2.3.7             evaluate_0.22          codetools_0.2-19      
##  [97] tibble_3.2.1           cli_3.6.1              uwot_0.1.16           
## [100] RcppParallel_5.1.7     xtable_1.8-4           reticulate_1.32.0     
## [103] munsell_0.5.0          jquerylib_0.1.4        Rcpp_1.0.11           
## [106] globals_0.16.2         spatstat.random_3.1-6  png_0.1-8             
## [109] parallel_4.3.1         ellipsis_0.3.2         ggplot2_3.4.3         
## [112] listenv_0.9.0          viridisLite_0.4.2      scales_1.2.1          
## [115] ggridges_0.5.4         crayon_1.5.2           leiden_0.4.3          
## [118] rlang_1.1.1            cowplot_1.1.1