---
# Dataset Formatting: Caron-ALL
*L.Richards*  
*2020-06-07*  
*/cluster/projects/pughlab/projects/cancer_scrna_integration/data/Caron-ALL/*  

---

Format Caron-ALL dataset. This will be input for data integration tools.

AUTHORS HAVE NOT MADE CELL LEVEL METADATA AVAILABLE FOR THIS STUDY.

In [None]:
library(Seurat) #v4.0.1
library(earlycross) # v0.1
library(Matrix)

In [None]:
# Read in the count matrix of every sample found in `matrix_dir`.
#
# The loop below reads three files per sample:
#   <sample>.barcodes.tsv.gz, <sample>.genes.tsv.gz, <sample>.matrix.mtx.gz
# so the sample name is taken as the text before the first "." of each file
# name.

matrix_dir <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/data/Caron-ALL/original-data/"
samples <- list.files(matrix_dir, pattern = "gz")
# vapply() keeps `samples` a character vector even when no files are found;
# unique() collapses the several files belonging to one sample into a single
# entry so that each sample is read exactly once.
samples <- unique(vapply(strsplit(samples, "\\."), `[`, character(1), 1))

# One slot per sample, named by sample. Without this the loop overwrites
# `mat` on every pass and only the last sample's matrix is kept.
count_mats <- vector("list", length(samples))
names(count_mats) <- samples

# seq_along() makes the loop a no-op when `samples` is empty
# (1:length(samples) would iterate over c(1, 0)).
for (i in seq_along(samples)) {

    barcode.path <- paste0(matrix_dir, samples[i], ".", "barcodes.tsv.gz")
    features.path <- paste0(matrix_dir, samples[i], ".", "genes.tsv.gz")
    matrix.path <- paste0(matrix_dir, samples[i], ".", "matrix.mtx.gz")

    mat <- readMM(file = matrix.path)
    feature.names <- read.delim(features.path,
                                header = FALSE,
                                stringsAsFactors = FALSE)
    barcode.names <- read.delim(barcode.path,
                                header = FALSE,
                                stringsAsFactors = FALSE)

    # columns are labelled with the first column of the barcodes file,
    # rows with the first column of the genes file
    colnames(mat) <- barcode.names$V1
    rownames(mat) <- feature.names$V1

    count_mats[[i]] <- mat
}



---
## 1.0 Format downloaded public data
---

Downloaded data from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE132509

In [None]:
### Set 1

# NOTE(review): the paths below use the GSE125449 accession and a relative
# ./original-data/ directory, which do not match the Caron-ALL directory used
# for `matrix_dir` -- confirm these were not carried over from another notebook.

# read the sample annotation table from the GEO accession and use the
# Cell.Barcode column as row names
meta <- read.table("./original-data/Set1/GSE125449_Set1_samples.txt.gz",
                   sep = "\t",
                   header = TRUE
                  )
rownames(meta) <- meta$Cell.Barcode

# read the 10x-format count matrix from the same directory
counts <- Read10X("./original-data/Set1/")

# combine counts and annotation into a seurat object
set1 <- CreateSeuratObject(counts = counts, 
                          meta.data = meta
                         )


In [None]:
### Set 2

# read the sample annotation table from the GEO accession and use the
# Cell.Barcode column as row names
meta <- read.table("./original-data/Set2/GSE125449_Set2_samples.txt.gz",
                   sep = "\t",
                   header = TRUE
                  )
rownames(meta) <- meta$Cell.Barcode

# read the 10x-format count matrix from the same directory
counts <- Read10X("./original-data/Set2/")

# combine counts and annotation into a seurat object
set2 <- CreateSeuratObject(counts = counts, 
                          meta.data = meta
                         )

In [None]:
### Merge the Set1 and Set2 seurat objects into a single object
combo <- merge(x = set1, y = set2)

# write the merged object to disk as an RDS file
saveRDS(object = combo, file = "Ma-LIHC_seurat.rds")

---
## 2.0 Output files in 10x common format
---

Output the count matrix in 10x/CellRanger format, together with a metadata CSV file.

In [None]:
# pull the metadata table off the merged object and write it out as csv
meta <- data.frame(slot(combo, "meta.data"))
write.csv(x = meta, file = "Ma-LIHC_meta.csv")

In [None]:
# export count matrix as default 10x CellRanger output
# Write10X is not defined in this file; presumably it comes from the
# earlycross package loaded at the top -- TODO confirm.
# dir = "./" points the call at the current working directory.
Write10X(combo, dir = "./")

---