# Section one, cell clustering and visualization

In [None]:
# Load all packages.
# library() is used instead of require(): library() stops with an error when a
# package is missing, whereas require() only warns and returns FALSE, so the
# notebook would fail later with a less obvious "could not find function".
library(Seurat) # single cell transcriptome analysis pipeline, ref: https://satijalab.org/seurat/
library(Yano) # for ReadPISA and alternative expression analysis, ref: https://github.com/shiquan/Yano
library(dplyr) # data pipeline %>%

In [None]:
# Read the gene count files (MEX format) from the given directory into `count`;
# `count` is the input of CreateSeuratObject in the next cell.
count <- ReadPISA("/course/bgi23/quan/exp/")

In [None]:
# Build the Seurat object from the count matrix.
# min.features is raised to 5000 here to reduce memory usage; the commented-out
# call below keeps the 1000 setting for reference.
# obj <- CreateSeuratObject(count, min.features = 1000, min.cells = 10)
obj <- CreateSeuratObject(
  counts = count,
  min.cells = 10,
  min.features = 5000
)

In [None]:
# Display the Seurat object that was just created
obj

In [None]:
# Per-cell QC metrics stored in the object's metadata:
#   percent.mt - percentage of counts from features matching "^MT-"
#                (mitochondrial genes)
#   percent.hg - percentage of counts from features matching the HB* pattern
#                (hemoglobin genes, a sign of red blood cell contamination)
obj[["percent.mt"]] <- PercentageFeatureSet(object = obj, pattern = "^MT-")
obj[["percent.hg"]] <- PercentageFeatureSet(
  object = obj,
  pattern = "^HB[ABDEGQZ12]+$"
)

In [None]:
# Visualize the per-cell QC metrics as violin plots, four panels in one row
VlnPlot(
  object = obj,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.hg"),
  ncol = 4
)

In [None]:
# Remove outlier cells: keep only cells with nFeature_RNA below 9000 and
# percent.mt below 20, then display the filtered object
obj <- subset(x = obj, subset = percent.mt < 20 & nFeature_RNA < 9000)
obj

In [None]:
# Normalize the gene expression values (usually log-scaled) to stabilize the variance.
# For a comparison of different normalization methods, see:
# Ahlmann-Eltze, C., Huber, W. Comparison of transformations for single-cell RNA-seq data.
# Nat Methods 20, 665–672 (2023). https://doi.org/10.1038/s41592-023-01814-1

obj <- NormalizeData(object = obj)

In [None]:
# Select the 2000 most highly variable genes with the "vst" method
obj <- FindVariableFeatures(
  object = obj,
  nfeatures = 2000,
  selection.method = "vst"
)

In [None]:
# Scale the variable features and run PCA on them, then build the neighbor
# graph on dimensions 1:10 and cluster the cells at resolution 0.5
obj <- obj %>%
  ScaleData(features = VariableFeatures(obj)) %>%
  RunPCA(features = VariableFeatures(obj)) %>%
  FindNeighbors(dims = 1:10) %>%
  FindClusters(resolution = 0.5)

In [None]:
# Plot the cells in the PCA reduction
DimPlot(object = obj, reduction = "pca")

In [None]:
# Embed the cells in 2D with UMAP (computed from dimensions 1:10)
# and plot the resulting embedding
obj <- RunUMAP(object = obj, dims = 1:10)
DimPlot(object = obj, reduction = "umap")

In [None]:
# Same cluster plot, with each cluster labelled in a boxed, size-10 label
DimPlot(
  object = obj,
  label = TRUE,
  label.box = TRUE,
  label.size = 10
)

In [None]:
# Find marker genes for every cell group: positive markers only,
# with min.pct = 0.25 and logfc.threshold = 0.25
markers <- FindAllMarkers(
  object = obj,
  logfc.threshold = 0.25,
  min.pct = 0.25,
  only.pos = TRUE
)

In [None]:
# Show the top 5 markers of each cluster, ranked by avg_log2FC
markers %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 5)

In [None]:
# Collect the top-ranked marker gene (by avg_log2FC) of each cell group
# into `sel`, dropping duplicated gene names
sel <- markers %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 1) %>%
  pull(gene) %>%
  unique()

In [None]:
# Enlarge the figure, then plot the expression level of the marker genes
# in `sel` on the UMAP plot, four panels per row
options(repr.plot.height = 10, repr.plot.width = 15)
FeaturePlot(object = obj, features = sel, ncol = 4)

In [None]:
# Merge the gene expression (counts slot of the RNA assay) of the cells in the
# same cell group to generate a gene-by-group expression matrix.
DefaultAssay(obj) <- "RNA"
mat <- AggregateExpression(obj, assays = "RNA", slot = "counts")
# Keep the top 100 markers (by avg_log2FC) of every cluster, without duplicates.
# NOTE: this overwrites the `sel` vector that held the top-1 markers used for
# the FeaturePlot earlier in the notebook.
sel <- markers %>%
  group_by(cluster) %>%
  slice_max(n = 100, order_by = avg_log2FC) %>%
  pull(gene) %>%
  unique()
# drop = FALSE keeps `mat` two-dimensional even when only one gene or one cell
# group is left; without it the result collapses to a plain vector and
# dim(mat) becomes NULL.
mat <- mat$RNA[sel, , drop = FALSE]
dim(mat)

In [None]:
# Enrichment analysis
# library() (rather than require()) makes a missing package fail right here
# instead of later with "could not find function".
library(msigdbr) # to load database
library(GSVA)
library(ComplexHeatmap)

# KEGG gene sets for human from MSigDB (category C2, subcategory CP:KEGG),
# reshaped into a named list: gene set name -> gene symbols
kegg.dat <- msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:KEGG")
kegg.genes <- split(kegg.dat$gene_symbol, kegg.dat$gs_name)
# Score every KEGG gene set for each column of `mat` (the aggregated count
# matrix, hence kcdf = "Poisson").
# NOTE(review): this is the classic gsva() argument interface; newer GSVA
# releases may require a parameter object instead - confirm against the
# installed version.
gsva.kegg.result <- gsva(expr = mat, gset.idx.list = kegg.genes, kcdf = "Poisson", verbose = FALSE, parallel.sz = 16, mx.diff = 1)
options(repr.plot.width = 10, repr.plot.height = 20) # change the size of figure
Heatmap(gsva.kegg.result)

# Optional: the same analysis with the GO gene sets (left disabled)
#go.dat <- msigdbr(species = "Homo sapiens", category = "C5") %>% filter(gs_subcat != "HPO")
#go.genes <- split(go.dat$gene_symbol, go.dat$gs_name)

#gsva.go.result <- gsva(expr=mat, gset.idx.list=go.genes, kcdf="Poisson", verbose=FALSE, parallel.sz = 16, mx.diff=1)
#Heatmap(gsva.go.result)

# Section two, alternative expression analysis

In [None]:
# Read the exon expression files, keep only the columns (cells) present in
# `obj`, and store the exon-cell matrix as a new "EXON" assay (min.cells = 30)
exon <- ReadPISA("/course/bgi23/quan/exon/")
obj[["EXON"]] <- CreateAssayObject(
  counts = exon[, colnames(obj)],
  min.cells = 30
)

In [None]:
# Remove the standalone exon matrix from the workspace; its data was already
# stored in the object's "EXON" assay
rm(exon)
# A new EXON assay has now been created; list the names of the object's components
names(obj)

In [None]:
# Inspect the feature-level metadata of the RNA assay. Try to understand the
# difference between meta.features and obj@meta.data
obj[["RNA"]]@meta.features %>% head()

In [None]:
# Set the default assay to "EXON" so that the following steps, which do not
# name an assay explicitly, work on the exon data
DefaultAssay(obj) <- "EXON"

In [None]:
# Feature-level metadata of the EXON assay before the exon names are parsed
obj[["EXON"]]@meta.features %>% head()

In [None]:
# Parse the exon feature names and update the object.
# NOTE(review): presumably this fills annotation columns such as `gene_name`
# (used later as block.name) in the EXON meta.features - confirm in the Yano docs.
obj <- ParseExonName(obj)

In [None]:
# Look at the EXON feature metadata again to see the updated exon annotation
obj[["EXON"]]@meta.features %>% head()

In [None]:
# Calculate the autocorrelation score (Moran's I is used here) for all exon
# features, using 8 threads
obj <- RunAutoCorr(obj, threads = 8)

In [None]:
# Select the spatially autocorrelated features for downstream analysis.
# NOTE(review): moransi.min = 0.1 presumably is the minimum Moran's I a feature
# must reach to be selected - confirm in the Yano docs.
obj <- SetAutoCorrFeatures(obj, moransi.min = 0.1)

In [None]:
# This step may take a while: depending on the number of features and cells,
# the runtime ranges from seconds to hours.
# The default number of permutations is 1000, which is probably more than
# needed here, so perm is lowered to 100 to save time.
obj <- RunBlockCorr(
  obj,
  block.assay = "RNA",
  block.name = "gene_name",
  perm = 100,
  threads = 8
)

In [None]:
# Use a wide figure for the genome-wide feature binding test plot
options(repr.plot.height = 5, repr.plot.width = 15)
# Plot the values stored in "gene_name.pval"
FbtPlot(
  obj,
  val = "gene_name.pval"
)

In [None]:
# List the exons whose gene_name.pval is below 0.001
filter(obj[["EXON"]]@meta.features, gene_name.pval < 0.001)

In [None]:
# Normalize the exon expression so that it can be shown with FeaturePlot
obj <- NormalizeData(object = obj)

In [None]:
# Resize the figure, then visualize one picked exon and its related gene
# (TPM4) on all cells
options(repr.plot.height = 5, repr.plot.width = 10)
FeaturePlot(
  object = obj,
  features = c("chr19:16095264-16095357/+/TPM4", "TPM4"),
  pt.size = 1,
  order = TRUE
)

In [None]:
# Prepare the GTF annotation database for the track plots; `db` is passed to
# plotTracks() in the following cells
db <- gtf2db("/course/bgi23/quan/gencode.v44.annotation.gtf.gz")

In [None]:
# Enlarge the figure for the alignment tracks
options(repr.plot.height = 10, repr.plot.width = 15)
# Plot the alignment tracks of TPM4 per cell group (the current identities),
# highlighting the exon region 16095264-16095357
plotTracks(
  bamfile = "/course/bgi23/quan/Parent_SC3v3_Human_Glioblastoma_possorted_genome_bam.bam",
  db = db,
  gene = "TPM4",
  cell.group = Idents(obj),
  highlight = c(16095264, 16095357)
)

In [None]:
# Plot the same TPM4 tracks again, this time with max.depth set to 1000
plotTracks(
  bamfile = "/course/bgi23/quan/Parent_SC3v3_Human_Glioblastoma_possorted_genome_bam.bam",
  max.depth = 1000,
  db = db,
  gene = "TPM4",
  cell.group = Idents(obj),
  highlight = c(16095264, 16095357)
)

## Task 1: Try to explain the biased coverage of gene expression with the knowledge you learned today.

## Task 2:

* plot more alternatively expressed exons and their genes on UMAP
* plot alignment tracks of these genes