# Summary

* This is a tutorial on using R (& Python via [Reticulate](https://rstudio.github.io/reticulate/)) 
  for accessing the scBaseCamp dataset hosted by the Arc Institute.
* The data can be streamed or downloaded locally.
  * For small jobs (e.g., summarizing some metadata), streaming is recommended.
  * For large jobs (e.g., training a model), downloading is recommended.
* See the [README](README.md#metadata) for a description of the obs metadata.

# Setup

### Installation

If needed, install the necessary dependencies.

You can use the [conda environment](../conda_envs/R.yml) provided in this git repository.

# Load libraries

In [13]:
# Load required libraries
library(dplyr)
library(arrow)
library(reticulate)
os = import("os")
pd = import("pandas")
ad = import("anndata")
gcsfs = import("gcsfs")


Attaching package: ‘arrow’


The following object is masked from ‘package:utils’:

    timestamp




In [2]:
# cap the number of rows shown when a table is rendered as cell output
options(repr.matrix.max.rows = 4)

# Data location

In [3]:
# GCS bucket path
# Root prefix under which the dataset files are looked up in this notebook
# (the trailing component looks like a dated release folder -- TODO confirm).
gcs_base_path = "gs://arc-ctc-scbasecamp/2025-02-25"

# List available files

Let's see what we have to work with!

In [47]:
# initialize GCS file system for reading data from GCS
# Use the key file named by GOOGLE_APPLICATION_CREDENTIALS when it is set.
# Sys.getenv() yields "" for an unset variable; passing that empty string as
# the token is not a valid credential path, so hand gcsfs NULL (Python None)
# instead and let it run its default credential lookup.
gcs_token = Sys.getenv("GOOGLE_APPLICATION_CREDENTIALS", unset = "")
fs = gcsfs$GCSFileSystem(
    token = if (nzchar(gcs_token)) gcs_token else NULL
)

In [53]:
# helper function
#' List files below a GCS prefix as an organism / feature-type table.
#'
#' Globs everything under `gcs_base_path` with the notebook-level `fs`
#' (gcsfs file system) and `os` (Python module) objects, optionally filters
#' the hits, and labels each hit with the names of the two directories
#' directly above it.
#'
#' @param gcs_base_path GCS prefix that is searched recursively.
#' @param target Optional exact file name to keep (compared with
#'   `basename()`). Takes precedence over `endswith` when both are given.
#' @param endswith Optional suffix to keep. Matched literally, not as a
#'   regular expression, so "." only matches a dot.
#' @return A data.frame with character columns `organism`, `feature_type`
#'   and `file_path`; it has zero rows when nothing matches.
get_parquet_files = function(gcs_base_path, target = NULL, endswith = NULL) {
  # Flatten to a plain character vector: an empty glob result can come back
  # as an empty list, which would break the vectorised filters below.
  files = as.character(unlist(fs$glob(os$path$join(gcs_base_path, "**"))))

  if (!is.null(target)) {
    files = files[basename(files) == target]
  } else if (!is.null(endswith)) {
    files = files[endsWith(files, endswith)]
  }

  # Parent directory = feature type, grandparent directory = organism.
  data.frame(
    organism = basename(dirname(dirname(files))),
    feature_type = basename(dirname(files)),
    file_path = files,
    stringsAsFactors = FALSE
  )
}

## Parquet files

* Contain the obs metadata

In [54]:
# set the path to the metadata files
# ("metadata" sub-directory directly below the bucket base path)
gcs_path = file.path(gcs_base_path, "metadata")
# echo the resulting path as the cell output
gcs_path

In [55]:
# table of every per-sample metadata file (matched by exact file name)
sample_pq_files = get_parquet_files(
  gcs_path,
  target = "sample_metadata.parquet.gz"
)
sample_pq_files

organism,feature_type,file_path
<chr>,<chr>,<chr>
Arabidopsis_thaliana,Gene,arc-ctc-scbasecamp/2025-02-25/metadata/Arabidopsis_thaliana/Gene/sample_metadata.parquet.gz
Arabidopsis_thaliana,GeneFull_Ex50pAS,arc-ctc-scbasecamp/2025-02-25/metadata/Arabidopsis_thaliana/GeneFull_Ex50pAS/sample_metadata.parquet.gz
⋮,⋮,⋮
Zea_mays,GeneFull_Ex50pAS,arc-ctc-scbasecamp/2025-02-25/metadata/Zea_mays/GeneFull_Ex50pAS/sample_metadata.parquet.gz
Zea_mays,Velocyto,arc-ctc-scbasecamp/2025-02-25/metadata/Zea_mays/Velocyto/sample_metadata.parquet.gz


### Per-obs metadata

In [56]:
# table of every per-obs metadata file (matched by exact file name)
obs_pq_files = get_parquet_files(
  gcs_path,
  target = "obs_metadata.parquet.gz"
)
# show the first rows only
head(obs_pq_files)

Unnamed: 0_level_0,organism,feature_type,file_path
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,Arabidopsis_thaliana,Gene,arc-ctc-scbasecamp/2025-02-25/metadata/Arabidopsis_thaliana/Gene/obs_metadata.parquet.gz
2,Arabidopsis_thaliana,GeneFull_Ex50pAS,arc-ctc-scbasecamp/2025-02-25/metadata/Arabidopsis_thaliana/GeneFull_Ex50pAS/obs_metadata.parquet.gz
⋮,⋮,⋮,⋮
5,Caenorhabditis_elegans,Gene,arc-ctc-scbasecamp/2025-02-25/metadata/Caenorhabditis_elegans/Gene/obs_metadata.parquet.gz
6,Caenorhabditis_elegans,GeneFull_Ex50pAS,arc-ctc-scbasecamp/2025-02-25/metadata/Caenorhabditis_elegans/GeneFull_Ex50pAS/obs_metadata.parquet.gz


# h5ad files

In [9]:
# point gcs_path at the "h5ad" sub-directory below the bucket base path
# (this overwrites the metadata path assigned earlier)
gcs_path = file.path(gcs_base_path, "h5ad")
# echo the resulting path as the cell output
gcs_path

In [10]:
# table of every file under the h5ad prefix whose name ends in ".h5ad.gz"
h5ad_files = get_parquet_files(
  gcs_path,
  endswith = ".h5ad.gz"
)
h5ad_files

organism,feature_type,file_path
<chr>,<chr>,<chr>
Arabidopsis_thaliana,Gene,arc-ctc-scbasecamp/2025-02-25/h5ad/Arabidopsis_thaliana/Gene/SRX15202187.h5ad.gz
Arabidopsis_thaliana,Gene,arc-ctc-scbasecamp/2025-02-25/h5ad/Arabidopsis_thaliana/Gene/SRX15202188.h5ad.gz
⋮,⋮,⋮
Zea_mays,Velocyto,arc-ctc-scbasecamp/2025-02-25/h5ad/Zea_mays/Velocyto/SRX19383767.h5ad.gz
Zea_mays,Velocyto,arc-ctc-scbasecamp/2025-02-25/h5ad/Zea_mays/Velocyto/SRX19383768.h5ad.gz


# Obs metadata

* `obs` ≃ cell

In [11]:
# select a particular STARsolo output type
## "GeneFull_Ex50pAS" is most similar to CellRanger output
# This value is compared against the `feature_type` column of the file tables.
target_feature_type = "GeneFull_Ex50pAS"

### Per-sample

* Useful for quickly summarizing the per-sample metadata (a small file versus the entire obs metadata file; see below).

In [57]:
# keep only the rows whose feature type equals the selected one
sample_pq_files_f = filter(
  sample_pq_files,
  feature_type == target_feature_type
)
sample_pq_files_f

organism,feature_type,file_path
<chr>,<chr>,<chr>
Arabidopsis_thaliana,GeneFull_Ex50pAS,arc-ctc-scbasecamp/2025-02-25/metadata/Arabidopsis_thaliana/GeneFull_Ex50pAS/sample_metadata.parquet.gz
Bos_taurus,GeneFull_Ex50pAS,arc-ctc-scbasecamp/2025-02-25/metadata/Bos_taurus/GeneFull_Ex50pAS/sample_metadata.parquet.gz
⋮,⋮,⋮
Sus_scrofa,GeneFull_Ex50pAS,arc-ctc-scbasecamp/2025-02-25/metadata/Sus_scrofa/GeneFull_Ex50pAS/sample_metadata.parquet.gz
Zea_mays,GeneFull_Ex50pAS,arc-ctc-scbasecamp/2025-02-25/metadata/Zea_mays/GeneFull_Ex50pAS/sample_metadata.parquet.gz


In [59]:
#file = "gs://arc-ctc-scbasecamp/2025-02-25/metadata/Arabidopsis_thaliana/GeneFull_Ex50pAS/obs_metadata.parquet.gz"
#file = "gs://arc-ctc-scbasecamp/2025-02-25/metadata/Arabidopsis_thaliana/GeneFull_Ex50pAS/sample_metadata.parquet.gz"
#file = "gs://arc-ctc-scbasecamp/2025-02-25/metadata/Arabidopsis_thaliana/GeneFull_Ex50pAS/obs_metadata.parquet.gz"
#open_dataset(file, format = "parquet") %>% collect() %>% data.frame() %>%
#    mutate(czi_collection_id = as.character(czi_collection_id))

In [62]:
# function to read parquet files
#' Read the first rows of a Parquet file stored on GCS.
#'
#' @param file Object path, either bare ("bucket/path/to/file") or already
#'   prefixed with "gs://".
#' @param n Number of rows to keep (default 3).
#' @return A data.frame holding at most `n` rows of the file.
read_data = function(file, n = 3){
  # Add the scheme only when it is missing, so that a full URI is not turned
  # into "gs://gs://...".
  if (!startsWith(file, "gs://")) {
    file = paste0("gs://", file)
  }
  open_dataset(file, format = "parquet") %>%
    head(n) %>%
    collect() %>%
    as.data.frame()
}

# read the first three per-sample metadata files (read_data keeps its default
# number of rows from each) and stack the results into one table
sample_metadata = bind_rows(
  lapply(
    head(pull(sample_pq_files_f, file_path), n = 3),
    read_data
  )
)
sample_metadata

entrez_id,srx_accession,file_path,obs_count,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name
<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>
24123125,SRX17302366,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/Arabidopsis_thaliana/GeneFull_Ex50pAS/SRX17302366.h5ad.gz,9036,10x_Genomics,3_prime_gex,single_cell,Arabidopsis thaliana,other,not specified,"BL (Brassinolide), 100nM, 0.5 hours post-treatment",WT Col-0,,
24123140,SRX17302381,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/Arabidopsis_thaliana/GeneFull_Ex50pAS/SRX17302381.h5ad.gz,14317,10x_Genomics,3_prime_gex,single_cell,Arabidopsis thaliana,other,not specified,"control treatment, age: 7 days",WT Col-0,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
22946265,SRX16241286,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/Caenorhabditis_elegans/GeneFull_Ex50pAS/SRX16241286.h5ad.gz,3496,10x_Genomics,3_prime_gex,single_nucleus,Caenorhabditis elegans,other,unsure,"water treatment, synchronized using nylon mesh filter, grown for 62 hours at 25°C, washed to remo...",F1 generation,,
22946262,SRX16241283,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/Caenorhabditis_elegans/GeneFull_Ex50pAS/SRX16241283.h5ad.gz,4767,10x_Genomics,3_prime_gex,single_nucleus,Caenorhabditis elegans,other,not specified,0.5% ethanol treatment,not applicable,,
