---
# Re-format public datasets for upload to CRESCENT
*L.Richards*    
*2021-06-14*    
/cluster/projects/pughlab/projects/cancer_scrna_integration/crescent

---

----
## 1.0 Re-format data for upload to CReSCENT (H4H)
----

/cluster/projects/pughlab/projects/cancer_scrna_integration/crescent/portal-upload

Each dataset-integration run requires the following input files:
1. Normalized expression matrix (genes x barcodes)
2. Metadata (labels x barcodes)
3. Coordinates file (barcodes x coords)
4. FindMarkers file from Seurat (columns = gene, cluster, p_val, avg_logFC; Seurat v4+ names the last column avg_log2FC, which is what the script below writes)

In [None]:
####################################
# Format files for upload to CReSCENT
# L.Richards
# 2021-07-05
####################################

#----- PACKAGES & PARSE -----#

suppressMessages(library(Seurat))
suppressMessages(library(optparse))
suppressMessages(library(dplyr))
suppressMessages(library(future))
suppressMessages(library(taRifx))

# Command-line interface: the name of the Seurat .rds file and the
# directory that contains it. Both default to NULL when not supplied.
seuratOption <- make_option("--seurat",
                            type = "character",
                            default = NULL,
                            help = "file name",
                            metavar = "character")

inputPathOption <- make_option("--inputPath",
                               type = "character",
                               default = NULL,
                               help = "path to seurat object",
                               metavar = "character")

option_list <- list(seuratOption, inputPathOption)

# Parse the arguments this script was invoked with
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# Copy the parsed values into the names used by the rest of the script
file <- opt$seurat
inputPath <- opt$inputPath
# Example values, kept for reference:
# file <- "Ma-LIHC_STACAS_seurat.rds"
# inputPath <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/integration"

#----- SETUP PARALLELIZATION -----#

# Raise the size limit for globals exported to future workers
# to 10 * 1024^3 bytes.
options(future.globals.maxSize = 10 * 1024 ^ 3)

# Use every core reported by availableCores() as a multicore worker
CoresAvailable <- as.numeric(availableCores()[[1]])
print(paste(CoresAvailable, "Cores Available"))
plan("multicore", workers = CoresAvailable)
#plan()


#----- LOAD DATA -----#

print("Loading data...")
print(file)

# Both values come from the command line (or are set by hand above); without
# them the path below collapses to "/" and readRDS() fails with an unhelpful
# message, so stop early instead.
if (is.null(file) || is.null(inputPath)) {
    stop("Both --seurat and --inputPath must be supplied", call. = FALSE)
}

# Output file prefix = input file name without its "_seurat.rds" suffix.
# The dot is escaped and the pattern is anchored to the end of the name so
# that only a literal trailing "_seurat.rds" is stripped.
outPrefix <- sub("_seurat\\.rds$", "", file)

# load data, checking the path first so a typo gives a readable error
seuratPath <- paste0(inputPath, "/", file)
if (!file.exists(seuratPath)) {
    stop("Seurat object not found: ", seuratPath, call. = FALSE)
}
dat <- readRDS(seuratPath)


#----- (1) MARKER GENES -----#

print("Marker genes...")

# Choose the metadata column that defines the clusters: an existing "leiden"
# column takes priority, then "seurat_clusters". When neither is present the
# object is clustered here (neighbours on dims 1:20, resolution 0.5), which
# creates "seurat_clusters".
metaCols <- colnames(dat@meta.data)
if ("leiden" %in% metaCols) {
    clusterCol <- "leiden"
} else {
    if (!("seurat_clusters" %in% metaCols)) {
        dat <- FindNeighbors(dat, dims = 1:20)
        dat <- FindClusters(dat, resolution = 0.5)
    }
    clusterCol <- "seurat_clusters"
}

# Prefix every cluster label with "C" and make that column the active identity
dat@meta.data[[clusterCol]] <- paste0("C", dat@meta.data[[clusterCol]])
Idents(dat) <- clusterCol

# Marker genes for every cluster, keeping the top 15 per cluster ranked by
# avg_log2FC, reduced to the four columns that are written out
markers <- FindAllMarkers(dat)
top15 <- top_n(group_by(markers, cluster), n = 15, wt = avg_log2FC)
markers <- data.frame(top15)[ ,c("gene", "cluster", "p_val", "avg_log2FC")]

# Tab-separated output, header row only (no row names, no quoting)
write.table(markers,
            file = paste0(outPrefix, "_deMarkers.tsv"),
            sep = "\t",
            quote = FALSE,
            row.names = FALSE
            )


#----- (2) METADATA -----#

print("Metadata...")

# The input file name is split on "_" (e.g. "Ma-LIHC_STACAS_seurat.rds"):
# the first piece is stored as orig.ident, the second as IntegrationMethod.
nameParts <- strsplit(file, "_", fixed = TRUE)[[1]]
dat@meta.data$orig.ident <- nameParts[1]
dat@meta.data$IntegrationMethod <- nameParts[2]
meta <- data.frame(dat@meta.data)

# Broad Portal style: the first column is NAME (the cell barcodes) ...
NAME <- rownames(meta)
meta <- cbind(NAME, meta)

# ... and the first data row gives each column's type: "TYPE" under NAME,
# then "numeric" for numeric columns and "group" for everything else.
# vapply() guarantees one logical per column regardless of the input.
row2 <- ifelse(vapply(meta, is.numeric, logical(1)), "numeric", "group")
row2[1] <- "TYPE"

# Factors must become character before the type row is bound on top:
# rbind() cannot insert "numeric"/"group" into a factor column that lacks
# those levels. Done in base R, so taRifx::remove.factors is not needed.
meta[] <- lapply(meta, function(column) {
    if (is.factor(column)) as.character(column) else column
})
meta <- rbind(row2, meta)

# write out file
write.table(meta,
            file = paste0(outPrefix, "_meta.tsv"),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE
            )



#----- (3) COORDINATES FILE -----#

print("Coordinate File...")

# Objects whose IntegrationMethod is "Conos" are exported from the
# "largeVis" reduction; every other method is exported from "umap".
isConos <- unique(dat@meta.data$IntegrationMethod) == "Conos"
if (isConos){
    embedding <- dat@reductions$largeVis@cell.embeddings
} else {
    embedding <- dat@reductions$umap@cell.embeddings
}

# Barcodes move from the row names into a leading "Barcode" column
coords <- data.frame(embedding)
Barcode <- rownames(coords)
coords <- cbind(Barcode, coords)

# Tab-separated output, header row only (no row names, no quoting)
write.table(coords,
            file = paste0(outPrefix, "_coordinates.tsv"),
            sep = "\t",
            quote = FALSE,
            row.names = FALSE
          )



#----- (4) NORMALIZED EXPRESSION MATRIX -----#

print("Normalized Expression Matrix...")

# Take the "data" slot of the RNA assay and convert it to a base matrix.
# NOTE(review): as.matrix() presumably densifies a sparse matrix here, which
# can need a lot of memory on large datasets - confirm before scaling up.
exp <- as.matrix(dat@assays$RNA@data)

# Gene names are written as row names rather than as a separate GENE column
# (an earlier data.frame/cbind variant was dropped).
write.table(exp,
            file = paste0(outPrefix, "_normalizedExpression.tsv"),
            sep = "\t",
            quote = FALSE,
            row.names = TRUE
            )


############################
print("End")


---
## 2.0 Reformat public datasets for CRESCENT pipeline (Javier)
----

Location of counts on H4H (one directory per sample, each containing that sample's 10x counts matrix): /cluster/projects/pughlab/data/CRESCENT/REFERENCE_DATASETS/v2.0/ORIGINAL_AUTHOR_CELL_RANGER_COUNT

Location of metadata on H4H (.tsv.gz): /cluster/projects/pughlab/data/CRESCENT/REFERENCE_DATASETS/v2.0/CELL_LEVEL_METADATA

Name the directories for each study as "FirstAuthorLastName_Journal_Year"

In [None]:
suppressMessages(library(Seurat))
suppressMessages(library(optparse))
suppressMessages(library(dplyr))
suppressMessages(library(future))
suppressMessages(library(taRifx))
suppressMessages(library(earlycross)) # v0.1

# Work from the sample-split output directory
setwd(dir = "/cluster/projects/pughlab/projects/cancer_scrna_integration/crescent/samplesplit-javier/")

# Directory holding the integrated Seurat objects, and the uncorrected
# ("NoBatchCorrection") objects found in it.
# Note: the variable name file.path is the same as the base R function
# file.path(); the cells below read it as a plain string.
file.path <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/integration/"
files <- list.files(path = file.path, pattern = "NoBatchCorrection_seurat.rds")

#### Bi-RCC

In [None]:
# Study-specific settings
file <- "Bi-RCC_NoBatchCorrection_seurat.rds"   # Seurat object to read from file.path
newname <- "Bi_CancerCell_2021"                 # output directory name for this study
sampleCol <- "SampleID"                         # metadata column used to split by sample

# Root of the export tree; counts and metadata go into sub-directories of it
outRoot <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/crescent/samplesplit-javier"

# load data
dat <- readRDS(paste0(file.path, "/", file))


########### COUNTS ################

# make a project dir (recursive, so missing parents are created; an already
# existing directory is not treated as a problem)
countsDir <- paste0(outRoot, "/ORIGINAL_AUTHOR_CELL_RANGER_COUNT/", newname)
dir.create(countsDir, recursive = TRUE, showWarnings = FALSE)

# split seurat object by sample
seurats <- SplitObject(dat, split.by = sampleCol)

# write out 10x matrices for each sample into own dir
# seq_along() yields no iterations for an empty list, unlike 1:length()
for (i in seq_along(seurats)){

    out <- paste0(countsDir, "/", names(seurats)[i])
    dir.create(out, recursive = TRUE, showWarnings = FALSE)
    # stop with a clear message if the directory could not be created
    if (!dir.exists(out)) {
        stop("Could not create output directory: ", out, call. = FALSE)
    }
    Write10X(seurats[[i]], dir = out)

}


########### CELL META ################

# add first column of Barcode
meta <- data.frame(dat@meta.data)
Barcode <- rownames(meta)
meta <- cbind(Barcode, meta)

# make a project dir
metaDir <- paste0(outRoot, "/CELL_LEVEL_METADATA/", newname)
dir.create(metaDir, recursive = TRUE, showWarnings = FALSE)
if (!dir.exists(metaDir)) {
    stop("Could not create output directory: ", metaDir, call. = FALSE)
}

# write out metadata as a gzip-compressed, tab-separated file
out <- paste0(metaDir, "/", newname, "_metadata.tsv.gz")

write.table(meta,
            file = gzfile(out),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE
            )

#### Caron-ALL

In [None]:
# Study-specific settings
file <- "Caron-ALL_NoBatchCorrection_seurat.rds"   # Seurat object to read from file.path
newname <- "Caron_ScientificReports_2020"          # output directory name for this study
sampleCol <- "SampleID"                            # metadata column used to split by sample

# Root of the export tree; counts and metadata go into sub-directories of it
outRoot <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/crescent/samplesplit-javier"

# load data
dat <- readRDS(paste0(file.path, "/", file))


########### COUNTS ################

# make a project dir (recursive, so missing parents are created; an already
# existing directory is not treated as a problem)
countsDir <- paste0(outRoot, "/ORIGINAL_AUTHOR_CELL_RANGER_COUNT/", newname)
dir.create(countsDir, recursive = TRUE, showWarnings = FALSE)

# split seurat object by sample
seurats <- SplitObject(dat, split.by = sampleCol)

# write out 10x matrices for each sample into own dir
# seq_along() yields no iterations for an empty list, unlike 1:length()
for (i in seq_along(seurats)){

    out <- paste0(countsDir, "/", names(seurats)[i])
    dir.create(out, recursive = TRUE, showWarnings = FALSE)
    # stop with a clear message if the directory could not be created
    if (!dir.exists(out)) {
        stop("Could not create output directory: ", out, call. = FALSE)
    }
    Write10X(seurats[[i]], dir = out)

}


########### CELL META ################

# add first column of Barcode
meta <- data.frame(dat@meta.data)
Barcode <- rownames(meta)
meta <- cbind(Barcode, meta)

# make a project dir
metaDir <- paste0(outRoot, "/CELL_LEVEL_METADATA/", newname)
dir.create(metaDir, recursive = TRUE, showWarnings = FALSE)
if (!dir.exists(metaDir)) {
    stop("Could not create output directory: ", metaDir, call. = FALSE)
}

# write out metadata as a gzip-compressed, tab-separated file
out <- paste0(metaDir, "/", newname, "_metadata.tsv.gz")

write.table(meta,
            file = gzfile(out),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE
            )

#### Ma-LIHC

In [None]:
# Study-specific settings
file <- "Ma-LIHC_NoBatchCorrection_seurat.rds"   # Seurat object to read from file.path
newname <- "Ma_CancerCell_2019"                  # output directory name for this study
sampleCol <- "Sample"                            # metadata column used to split by sample

# Root of the export tree; counts and metadata go into sub-directories of it
outRoot <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/crescent/samplesplit-javier"

# load data
dat <- readRDS(paste0(file.path, "/", file))


########### COUNTS ################

# make a project dir (recursive, so missing parents are created; an already
# existing directory is not treated as a problem)
countsDir <- paste0(outRoot, "/ORIGINAL_AUTHOR_CELL_RANGER_COUNT/", newname)
dir.create(countsDir, recursive = TRUE, showWarnings = FALSE)

# split seurat object by sample
seurats <- SplitObject(dat, split.by = sampleCol)

# write out 10x matrices for each sample into own dir
# seq_along() yields no iterations for an empty list, unlike 1:length()
for (i in seq_along(seurats)){

    out <- paste0(countsDir, "/", names(seurats)[i])
    dir.create(out, recursive = TRUE, showWarnings = FALSE)
    # stop with a clear message if the directory could not be created
    if (!dir.exists(out)) {
        stop("Could not create output directory: ", out, call. = FALSE)
    }
    Write10X(seurats[[i]], dir = out)

}


########### CELL META ################

# add first column of Barcode
meta <- data.frame(dat@meta.data)
Barcode <- rownames(meta)
meta <- cbind(Barcode, meta)

# make a project dir
metaDir <- paste0(outRoot, "/CELL_LEVEL_METADATA/", newname)
dir.create(metaDir, recursive = TRUE, showWarnings = FALSE)
if (!dir.exists(metaDir)) {
    stop("Could not create output directory: ", metaDir, call. = FALSE)
}

# write out metadata as a gzip-compressed, tab-separated file
out <- paste0(metaDir, "/", newname, "_metadata.tsv.gz")

write.table(meta,
            file = gzfile(out),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE
            )

#### Yost-BCC

In [None]:
# Study-specific settings
file <- "Yost-BCC_NoBatchCorrection_seurat.rds"   # Seurat object to read from file.path
newname <- "Yost_NatureMedicine_2019"             # output directory name for this study
sampleCol <- "SampleID"                           # metadata column used to split by sample

# Root of the export tree; counts and metadata go into sub-directories of it
outRoot <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/crescent/samplesplit-javier"

# load data
dat <- readRDS(paste0(file.path, "/", file))


########### COUNTS ################

# make a project dir (recursive, so missing parents are created; an already
# existing directory is not treated as a problem)
countsDir <- paste0(outRoot, "/ORIGINAL_AUTHOR_CELL_RANGER_COUNT/", newname)
dir.create(countsDir, recursive = TRUE, showWarnings = FALSE)

# split seurat object by sample
seurats <- SplitObject(dat, split.by = sampleCol)

# write out 10x matrices for each sample into own dir
# seq_along() yields no iterations for an empty list, unlike 1:length()
for (i in seq_along(seurats)){

    out <- paste0(countsDir, "/", names(seurats)[i])
    dir.create(out, recursive = TRUE, showWarnings = FALSE)
    # stop with a clear message if the directory could not be created
    if (!dir.exists(out)) {
        stop("Could not create output directory: ", out, call. = FALSE)
    }
    Write10X(seurats[[i]], dir = out)

}


########### CELL META ################

# add first column of Barcode
meta <- data.frame(dat@meta.data)
Barcode <- rownames(meta)
meta <- cbind(Barcode, meta)

# make a project dir
metaDir <- paste0(outRoot, "/CELL_LEVEL_METADATA/", newname)
dir.create(metaDir, recursive = TRUE, showWarnings = FALSE)
if (!dir.exists(metaDir)) {
    stop("Could not create output directory: ", metaDir, call. = FALSE)
}

# write out metadata as a gzip-compressed, tab-separated file
out <- paste0(metaDir, "/", newname, "_metadata.tsv.gz")

write.table(meta,
            file = gzfile(out),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE
            )