In [None]:
## Notebook env: archr (R kernel)
## This notebook requires alignment_and_quantification.ipynb to be run first

In [1]:
suppressMessages(suppressWarnings(library(ArchR)))
set.seed(1)
suppressMessages(suppressWarnings(library('BSgenome.Hsapiens.UCSC.hg38')))

## Load data ##

In [2]:
savepath = '/khayyam/ssobti/projects/heterogeneity_brian/output_data/scATACseq/091724_BW/archr_analysis'
setwd(savepath)
addArchRThreads(threads = 35)
addArchRGenome("hg38")
addArchRLocking(locking = TRUE)

folder = '/khayyam/ssobti/projects/heterogeneity_brian/output_data/scATACseq/091724_BW/align'
files = list.files(folder)
files = files[endsWith(files, '.tsv.gz')]
names(files) = unlist(lapply(files, function(x) strsplit(x, '\\.')[[1]][1]))

Setting default number of Parallel threads to 35.

Setting default genome to Hg38.

Setting ArchRLocking to TRUE.



In [None]:
ArrowFiles <- createArrowFiles(
  inputFiles = paste0(folder, '/', files),
  sampleNames = names(files),
  minTSS = 4, #Dont set this too high because you can always increase later
  minFrags = 1000, 
  addTileMat = TRUE,
  addGeneScoreMat = TRUE
)

Using GeneAnnotation set by addArchRGenome(Hg38)!



Using GeneAnnotation set by addArchRGenome(Hg38)!

ArchR logging to : ArchRLogs/ArchR-createArrows-ca8d478cd8584-Date-2024-11-30_Time-17-10-23.880557.log
If there is an issue, please report to github with logFile!

Cleaning Temporary Files

subThreading Disabled since ArchRLocking is TRUE see `addArchRLocking`

2024-11-30 17:10:27.342567 : Batch Execution w/ safelapply!, 0 mins elapsed.



In [None]:
arrow_files = list.files(paste0(savepath), include.dirs = F, full.names = T)
arrow_files = arrow_files[endsWith(arrow_files, '.arrow')]
ArrowFiles = arrow_files
names(ArrowFiles) = gsub(pattern = '.arrow', replacement = '', x = sapply(stringr::str_split(arrow_files, '/'), '[[', 10))

## Preprocessing ##

In [None]:
doubScores <- addDoubletScores(
  input = ArrowFiles,
  k = 10, #Refers to how many cells near a "pseudo-doublet" to count.
  knnMethod = "UMAP", #Refers to the embedding to use for nearest neighbor search.
  LSIMethod = 1
)

proj <- ArchRProject(
  ArrowFiles = ArrowFiles, 
  outputDirectory = savepath,
  copyArrows = TRUE #This is recommened so that you maintain an unaltered copy for later usage.
)

proj <- filterDoublets(ArchRProj = proj)
proj <- saveArchRProject(ArchRProj = proj)

ArchR logging to : ArchRLogs/ArchR-addDoubletScores-bd44736d2b2be-Date-2024-11-30_Time-20-19-54.143091.log
If there is an issue, please report to github with logFile!

2024-11-30 20:19:54.471117 : Batch Execution w/ safelapply!, 0 mins elapsed.

2024-11-30 20:19:54.486039 : MIS18A-Ca-1 (1 of 12) :  Computing Doublet Statistics, 0 mins elapsed.

“Data is of class matrix. Coercing to dgCMatrix.”
Filtering 1 dims correlated > 0.75 to log10(depth + 1)

Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

Also defined by ‘spam’

Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

Also defined by ‘spam’

MIS18A-Ca-1 (1 of 12) : UMAP Projection R^2 = 0.97957

MIS18A-Ca-1 (1 of 12) : UMAP Projection R^2 = 0.97957

2024-11-30 20:22:50.857612 : MIS18A-Ca-2 (2 of 12) :  Computing Doublet Statistics, 2.94 mins elapsed.

Filtering 1 dims correlated > 0.75 to log10(depth + 1)

“Data is of class matrix. Coercing to dgCMatrix

In [None]:
getAvailableMatrices(proj)

## Obtaining matrices ##

In [None]:
proj <- addIterativeLSI(ArchRProj = proj, useMatrix = "TileMatrix", name = "IterativeLSI")
proj <- addClusters(input = proj, reducedDims = "IterativeLSI")
proj <- addGroupCoverages(proj, force = F, groupBy = 'Sample', threads = 1)
proj <- addReproduciblePeakSet(proj, peakMethod = "macs2", groupBy = 'Sample')
#head(as.data.frame(proj@cellColData))
txnhet_archr_macs2_project <- addPeakMatrix(proj, threads = 1)
txnhet_archr_macs2_project <- saveArchRProject(ArchRProj = txnhet_archr_macs2_project)

In [3]:
txnhet_archr_macs2_project <- loadArchRProject(savepath)

Successfully loaded ArchRProject!


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
       \               '        ##_______ _____ ,--,__,=##,__   ///
        ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
        -,____,---'       \\####\\________________,--\\_##,/
           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _ 

#### Global peak matrix ####

In [None]:
## currently the MACS2 peaks for ALL regions laoded in the peak matrix
txnhet_archr_macs2_peaks = getMatrixFromProject(txnhet_archr_macs2_project, useMatrix = 'PeakMatrix')
txnhet_archr_macs2_peak_mtx = assays(txnhet_archr_macs2_peaks)$PeakMatrix
txnhet_archr_macs2_peak_meta = as.data.frame(colData(txnhet_archr_macs2_peaks))
saveRDS(txnhet_archr_macs2_peak_mtx, file = paste0(savepath,'/txnhet_archr_macs2_peak_mtx.rds'))
saveRDS(txnhet_archr_macs2_peak_meta, file = paste0(savepath,'/txnhet_archr_macs2_peak_meta.rds'))

ArchR logging to : ArchRLogs/ArchR-getMatrixFromProject-50d8c2dfbaac-Date-2024-12-03_Time-14-54-17.07112.log
If there is an issue, please report to github with logFile!

2024-12-03 14:56:06.052969 : Organizing colData, 1.816 mins elapsed.

2024-12-03 14:56:07.548004 : Organizing rowData, 1.841 mins elapsed.

2024-12-03 14:56:07.597262 : Organizing rowRanges, 1.842 mins elapsed.

2024-12-03 14:56:07.666852 : Organizing Assays (1 of 1), 1.843 mins elapsed.

2024-12-03 14:56:58.489257 : Constructing SummarizedExperiment, 2.69 mins elapsed.

2024-12-03 15:00:29.880607 : Finished Matrix Creation, 6.214 mins elapsed.



#### TSS peak matrix ####

In [None]:
TSS_bed_path = '/khayyam/ssobti/projects/heterogeneity_brian/uploaded_data/scATACseq/genome/GRCh38_transcriptsOnly.tss.bed'
TSS_granges_obj = genomation::readBed(TSS_bed_path, track.line = FALSE, remove.unusual = FALSE, zero.based = TRUE)
seqlevelsStyle(TSS_granges_obj) <- "UCSC"

txnhet_archr_macs2_TSS_project <- addPeakSet(ArchRProj = txnhet_archr_macs2_project, peakSet = TSS_granges_obj, force = TRUE)
txnhet_archr_macs2_TSS_project <- addPeakMatrix(txnhet_archr_macs2_TSS_project)
txnhet_archr_macs2_TSS_peaks = getMatrixFromProject(txnhet_archr_macs2_TSS_project, useMatrix = 'PeakMatrix')
txnhet_archr_macs2_peak_TSS_mtx = assays(txnhet_archr_macs2_TSS_peaks)$PeakMatrix
txnhet_archr_macs2_peak_TSS_meta = as.data.frame(colData(txnhet_archr_macs2_TSS_peaks))
rownames(txnhet_archr_macs2_peak_TSS_mtx) = TSS_granges_obj$name
saveRDS(txnhet_archr_macs2_peak_TSS_mtx, file = paste0(savepath,'/txnhet_archr_macs2_peak_TSS_mtx.rds'))
saveRDS(txnhet_archr_macs2_peak_TSS_meta, file = paste0(savepath,'/txnhet_archr_macs2_peak_TSS_meta.rds'))

#### Gene score matrix ####

In [None]:
txnhet_archr_macs2_gene_scores = getMatrixFromProject(txnhet_archr_macs2_project, useMatrix = 'GeneScoreMatrix')
txnhet_archr_macs2_gene_scores_mtx = assays(txnhet_archr_macs2_gene_scores)$GeneScoreMatrix
txnhet_archr_macs2_gene_scores_meta = as.data.frame(colData(txnhet_archr_macs2_gene_scores))
rownames(txnhet_archr_macs2_gene_scores_mtx) = rowData(txnhet_archr_macs2_gene_mtx)$name
saveRDS(txnhet_archr_macs2_gene_scores_mtx, file = paste0(savepath,'/txnhet_archr_macs2_gene_scores_mtx.rds'))
saveRDS(txnhet_archr_macs2_gene_scores_meta, file = paste0(savepath,'/txnhet_archr_macs2_gene_scores_meta.rds'))

ArchR logging to : ArchRLogs/ArchR-getMatrixFromProject-363f2e7879c1fd-Date-2024-12-03_Time-23-56-41.352873.log
If there is an issue, please report to github with logFile!

2024-12-03 23:58:29.625284 : Organizing colData, 1.805 mins elapsed.

2024-12-03 23:58:30.341722 : Organizing rowData, 1.817 mins elapsed.

2024-12-03 23:58:30.3545 : Organizing rowRanges, 1.817 mins elapsed.

2024-12-03 23:58:30.367208 : Organizing Assays (1 of 1), 1.817 mins elapsed.

2024-12-03 23:59:40.731705 : Constructing SummarizedExperiment, 2.99 mins elapsed.

2024-12-03 23:59:43.586056 : Finished Matrix Creation, 3.037 mins elapsed.



In [None]:
#head(as.data.frame(txnhet_archr_macs2_project@cellColData))