---
# Dataset Formatting: Caron-ALL
*L.Richards*  
*2020-06-08*  
*/cluster/projects/pughlab/projects/cancer_scrna_integration/data/Caron-ALL/*  

---

Format the Caron-ALL dataset. Emailed the authors to obtain cell-level metadata. This will be input for data integration tools. https://www.nature.com/articles/s41598-020-64929-x

In [None]:
library(Seurat) #v4.0.1
library(earlycross) # v0.1
library(Matrix)

# load custom functions
source("~/github/oicr-brain-tri-gbm/src/scRNA_helper_functions.r")

---
## 1.0 Format downloaded public data
---

Downloaded data from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE132509

In [None]:
# Author-provided, cell-level metadata (38922 cells).
# Row names are cell barcodes; "-" is swapped for "." so they line up with
# the sample labels built later on.
meta <- read.table("./original-data/metadata.tsv", sep = "\t")
rownames(meta) <- gsub("-", ".", rownames(meta))

# The sample label is everything before the first "_" of the barcode.
# It doubles as the patient label.
id <- sapply(strsplit(rownames(meta), "_"), function(parts) parts[1])
meta$SampleID <- id
meta$PatientID <- id

# Collapse the per-sample leukemia labels into one "Malignant" class.
# We still need to add our own annotations on top of this:
# cluster and split up the B+Mono and T+NK cells.
malignant_patterns <- c(
    "ETV6.RUNX1.1", "ETV6.RUNX1.2", "ETV6.RUNX1.3", "ETV6.RUNX1.4",
    "HHD.1", "HHD.2",
    "PRE-T.1", "PRE-T.2"
)
meta$CellType <- meta$celltype
meta$CellType <- Reduce(
    function(labels, pattern) gsub(pattern, "Malignant", labels),
    malignant_patterns,
    meta$CellType
)

# Keep the barcode as an explicit column as well as the row name.
meta$CellBarcode <- rownames(meta)

In [None]:
# PBMMC.1 barcodes in the author metadata have ".1" at the end, but not all
# of them do -_-
# The metadata barcodes for this sample therefore need a manual fix so they
# match the barcodes of the count matrix.

# Directory holding the downloaded GEO matrices. Defined here so this cell
# runs on its own (same location that is used when reading all samples).
matrix_dir <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/data/Caron-ALL/original-data/"

file <- "GSM3872442_PBMMC_1"
sample <- "PBMMC.1"

# read in sparse counts matrix
barcode.path <- paste0(matrix_dir, file, ".", "barcodes.tsv.gz")
features.path <- paste0(matrix_dir, file, ".", "genes.tsv.gz")
matrix.path <- paste0(matrix_dir, file, ".", "matrix.mtx.gz")
mat <- readMM(file = matrix.path)
feature.names <- read.delim(features.path,
                            header = FALSE,
                            stringsAsFactors = FALSE)
barcode.names <- read.delim(barcode.path,
                            header = FALSE,
                            stringsAsFactors = FALSE)
colnames(mat) <- barcode.names$V1
rownames(mat) <- feature.names$V2

# format barcodes to match metadata provided by authors:
# keep the part before the first "-" and prefix it with the sample label
colnames(mat) <- vapply(strsplit(colnames(mat), "-"), `[`, character(1), 1)
colnames(mat) <- paste0(sample, "_", colnames(mat))

# sanity check (before the metadata fix)
print(file)
print(table(colnames(mat) %in% rownames(meta)))

# fix the PBMMC.1 barcodes in the metadata
# Only a trailing ".<digits>" suffix is removed, so barcodes without the
# suffix are left untouched and re-running this cell does not chop
# further characters off the barcodes.
is_pbmmc1 <- grepl(sample, rownames(meta), fixed = TRUE)
newCBs <- sub("\\.[0-9]+$", "", rownames(meta)[is_pbmmc1])
meta$CellBarcode[is_pbmmc1] <- newCBs
# barcodes that collide once the suffix is gone: keep the first occurrence
meta <- meta[!duplicated(meta$CellBarcode), ]
rownames(meta) <- meta$CellBarcode

# remove barcodes that are not in metadata
# probably represent poor quality ones
# drop = FALSE keeps a matrix even if only one barcode matches
mat <- mat[, colnames(mat) %in% rownames(meta), drop = FALSE]
print(table(colnames(mat) %in% rownames(meta)))

# The filtered matrix is not stored anywhere here: this cell exists for the
# metadata fix and the sanity checks. The per-sample loop reads this sample
# again together with all the others.

In [None]:
# read in count matrices and build one Seurat object per sample

# extract sample and file name for each mtx
# File prefixes look like "GSM3872442_PBMMC_1"; the sample label is built
# from the 2nd and 3rd "_"-separated fields ("PBMMC.1").
matrix_dir <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/data/Caron-ALL/original-data/"
files <- list.files(matrix_dir, pattern = "gz")
files <- vapply(strsplit(files, "\\."), `[`, character(1), 1)
files <- unique(files)
if (length(files) == 0) {
    stop("No gz files found in ", matrix_dir, call. = FALSE)
}
file_parts <- strsplit(files, "_")
samples <- paste0(vapply(file_parts, `[`, character(1), 2),
                  ".",
                  vapply(file_parts, `[`, character(1), 3)
                  )
samples <- gsub("-", ".", samples)
table(samples %in% meta$SampleID) # SANITY CHECK - looks good


# read in each sample's count matrix
seurats <- list()

for (i in seq_along(files)) {

    # read in sparse counts matrix
    barcode.path <- paste0(matrix_dir, files[i], ".", "barcodes.tsv.gz")
    features.path <- paste0(matrix_dir, files[i], ".", "genes.tsv.gz")
    matrix.path <- paste0(matrix_dir, files[i], ".", "matrix.mtx.gz")
    mat <- readMM(file = matrix.path)
    feature.names <- read.delim(features.path,
                                header = FALSE,
                                stringsAsFactors = FALSE)
    barcode.names <- read.delim(barcode.path,
                                header = FALSE,
                                stringsAsFactors = FALSE)
    colnames(mat) <- barcode.names$V1
    rownames(mat) <- feature.names$V2

    # format barcodes to match metadata provided by authors:
    # keep the part before the first "-" and prefix it with the sample label
    colnames(mat) <- vapply(strsplit(colnames(mat), "-"), `[`, character(1), 1)
    colnames(mat) <- paste0(samples[i], "_", colnames(mat))

    # sanity check
    print(files[i])
    print(table(colnames(mat) %in% rownames(meta)))

    # remove barcodes that are not in metadata
    # probably represent poor quality ones
    # drop = FALSE keeps a matrix even if only one barcode matches
    mat <- mat[, colnames(mat) %in% rownames(meta), drop = FALSE]
    print(table(colnames(mat) %in% rownames(meta)))

    seurats[[samples[i]]] <- CreateSeuratObject(counts = mat)

}

In [None]:
# merge seurat objects together
# All remaining per-sample objects are merged into the first one, so the
# cell does not depend on a fixed number of samples.
combo <- if (length(seurats) > 1) {
    merge(seurats[[1]], y = seurats[-1])
} else {
    seurats[[1]]
}

# attach the author-provided cell-level metadata
combo <- AddMetaData(combo, metadata = meta)

In [None]:
# cluster the cells
# NOTE(review): quickCluster() is not defined in this notebook; presumably it
# comes from the sourced scRNA_helper_functions.r -- confirm the argument
# semantics there.
# min.resolution and max.resolution are both 1.5 with n.resolution = 1, so a
# single clustering resolution is requested.
combo <- quickCluster(combo,
                        normalize = TRUE,
                        vars.to.regress = NULL,
                        #k.param = 20,
                        dims = 20, # max dims 1:dims
                        n.vargenes = 2000,
                        min.resolution = 1.5,
                        max.resolution = 1.5,
                        n.resolution = 1, #how many resolutions to cluster over
                        verbose = FALSE,
                        pc.calc = 75, # how many PCs to calculate
                        pca.genes = "var" # accepts "all" or "var"
                       )

# save intermediate file
# The path is relative, so the file lands in the current working directory.
# NOTE(review): section 2.0 reads the object back from an absolute
# ~/Desktop/H4H/... path -- confirm both point to the same file.
saveRDS(combo, file = "Caron-ALL_seurat.rds")

---
## 2.0 Correct author-provided cell labels
---

Use gene marker expression to split "B cells + Mono" into "B_cells" & "Macrophages", and "T cells + NK" into "T_cells" & "NK_cells"

In [None]:
# Reload the intermediate Seurat object from disk.
# NOTE(review): this is a user-specific absolute path (~/Desktop/H4H/...),
# while the object was saved with a relative path -- confirm they refer to
# the same file before running on another machine.
combo <- readRDS("~/Desktop/H4H/pughlab/projects/cancer_scrna_integration/data/Caron-ALL/Caron-ALL_seurat.rds")

In [None]:
# Plot the cells with DimPlot; no reduction or grouping is passed, so the
# function's defaults decide what is shown.
DimPlot(combo)

---
## 3.0 Output files in 10x common format
---

Output counts matrix with 10x/CellRanger formatting style and metadata csv.

In [None]:
# Write the per-cell metadata table of the merged object to a CSV file in
# the current working directory (row names are written as the first column).
meta <- data.frame(methods::slot(combo, "meta.data"))
utils::write.csv(x = meta, file = "Caron-ALL_meta.csv")

In [None]:
# export count matrix as default 10x CellRanger output
# dir = "./" points at the current working directory.
# NOTE(review): Write10X() is not defined in this notebook; presumably it
# comes from earlycross or the sourced helper file -- confirm which files it
# writes and whether existing files are overwritten.
Write10X(combo, dir = "./")

---