In [None]:
# preliminaries

# load the expression matrix (genes in rows, cells in columns)
data.dir <- "/home/rcortini/work/CRG/projects/sc_hiv/data"
matrix.fname <- file.path(data.dir, "matrices", "exprMatrix.tsv")
exprMatrix <- read.table(matrix.fname, header = TRUE, row.names = 1,
                         sep = "\t", check.names = FALSE)

# load the sample sheet (row names are taken from its first column)
sample.sheet.fname <- file.path(data.dir, "metadata", "sampleSheet.tsv")
sampleSheet <- read.delim(sample.sheet.fname, header = TRUE, row.names = 1)

# load gene annotations file and keep only the genes present in the matrix
gene.annotations <- file.path(data.dir, "matrices", "gene_annotations.tsv")
gene.data <- read.delim(gene.annotations, header = TRUE, sep = "\t",
                        row.names = 1, stringsAsFactors = FALSE)
gene.data <- subset(gene.data, rownames(gene.data) %in% rownames(exprMatrix))

# remove genes with (almost) no expression: keep only the rows whose total
# over all cells is greater than 1
norm.exprMatrix <- exprMatrix[rowSums(exprMatrix) > 1, ]

# normalize every cell (column) by its total expression, so that each column
# of the normalized matrix sums to 1
total <- colSums(norm.exprMatrix)
norm.exprMatrix <- sweep(as.matrix(norm.exprMatrix), 2, total, "/")

# do the PCA with cells as observations and genes as unit-variance variables;
# `scale.` is spelled out in full instead of relying on partial matching
exprMatrix.pca <- prcomp(t(norm.exprMatrix), scale. = TRUE)
pca <- as.data.frame(exprMatrix.pca$x)

# filter out the dead cells from the sample sheet and from the matrix
# NOTE(review): the sign of a principal component is arbitrary, so the
# criterion PC1 < 0 presumably was checked by eye for this data set --
# confirm it still selects the live cells whenever the input changes.
alive.cells <- rownames(pca)[pca$PC1 < 0]
exprMatrix <- exprMatrix[, alive.cells, drop = FALSE]
sampleSheet <- subset(sampleSheet, rownames(sampleSheet) %in% alive.cells)

In [None]:
library(scatterpie)

# 2019-03-15 Clustering again
After the miserable failure of the differential expression analysis approach, we return to the question of whether we can extract some useful information from the data set by clustering the gene expression patterns.

There are many, many, many packages out there that allow you to do clustering.

Here, I'll try to use some of the packages and find out what they (don't) tell us.

## CountClust

github.com/kkdey/CountClust

Based on Grade of Membership Models.

### Reproducing results of the tutorial

In [None]:
library(CountClust)

In [None]:
# Tutorial data set: the Deng2014MouseESC object from the data package.
library(singleCellRNASeqMouseDeng2014)

# expression values, with the gene identifiers kept separately
deng.counts <- exprs(Deng2014MouseESC)
deng.gene_names <- dimnames(deng.counts)[[1]]

# per-sample metadata
deng.meta_data <- pData(Deng2014MouseESC)

In [None]:
# Fit a grade-of-membership model with K = 3 clusters on the tutorial data.
# FitGoM receives the transposed matrix; the fit is also written to path_rda.
MouseDeng2014.FitGoM.results <- FitGoM(
  t(deng.counts),
  path_rda = "MouseDeng2014.FitGoM.rda",
  K = 3
)

In [None]:
# Load the pre-computed tutorial fits and take the membership matrix (omega)
# stored under `clust_6`.
data("MouseDeng2014.FitGoM")
names(MouseDeng2014.FitGoM)
omega <- MouseDeng2014.FitGoM$clust_6$omega

# The row names of omega carry the developmental stage of every sample. Turn
# them into an ordered factor, and give every sample a unique identifier.
annotation <- data.frame(
  sample_id = paste0("X", seq_len(NROW(omega))),
  tissue_label = factor(rownames(omega),
                        levels = rev(c("zy", "early2cell",
                                       "mid2cell", "late2cell",
                                       "4cell", "8cell", "16cell",
                                       "earlyblast", "midblast",
                                       "lateblast"))))

rownames(omega) <- annotation$sample_id

# Samples are grouped by developmental stage, so the y axis is labelled
# accordingly (the previous label, "Amplification batch", did not describe
# what is plotted).
StructureGGplot(omega = omega,
                annotation = annotation,
                palette = RColorBrewer::brewer.pal(8, "Accent"),
                yaxis_label = "Development Phase",
                order_sample = TRUE,
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 7,
                                 axis_label_face = "bold"))

In [None]:
library(RColorBrewer)

In [None]:
# Pie charts of the cluster memberships in omega, drawn on a t-SNE embedding
# of the tutorial data (no voom transformation).
StructurePie(
  t(deng.counts),
  omega = omega,
  input_type = "apply_tsne",
  use_voom = FALSE,
  main = "STRUCTURE K=6 pie on tSNE",
  xlab = "TSNE1",
  ylab = "TSNE2",
  control = list(bg = "lightcyan")
)

In [None]:
# Pie charts of the cluster memberships in omega, drawn on a PCA projection
# of the tutorial data (voom transformation enabled).
StructurePie(
  t(deng.counts),
  omega = omega,
  input_type = "apply_pca",
  use_voom = TRUE,
  main = "STRUCTURE K=6 pie on PCA",
  xlab = "PCA1",
  ylab = "PCA2",
  control = list(bg = "lightcyan")
)

### Our data

The first step is to invoke the clustering function.

In [None]:
# Number of clusters (modules) for the grade-of-membership fit on our data.
K <- 15

# FitGoM receives the transposed expression matrix; the fit is also written
# to the file given in path_rda.
myresults <- FitGoM(
  t(exprMatrix),
  path_rda = "CountClust.FitGoM.rda",
  K = K
)
# Alternative kept for reference: the same fit without the FILIONG01 row.
# myresults <- FitGoM(t(exprMatrix[-which(rownames(exprMatrix) == "FILIONG01"), ]), K=K, path_rda="CountClust.FitGoM.rda")

We then arrange the output results according to our needs.

In [None]:
# get the results into variables that can be managed more easily:
# theta and omega are the two matrices of the fit, converted to data frames
fit <- myresults$fit
theta <- as.data.frame(fit$theta)
omega <- as.data.frame(fit$omega)

# prepare the "annotation" data frame: one row per row of omega, carrying the
# label that the sample sheet assigns to that cell
cell.types <- as.character(unique(sampleSheet$label))
match.idx <- match(rownames(omega), rownames(sampleSheet))
cell.ids <- sampleSheet$label[match.idx]
# tissue_label is converted to a factor explicitly: the sample sheet column is
# a plain character vector when read with R >= 4.0, while StructureGGplot
# documents tissue_label as a factor. factor() also drops the levels of the
# cells that were filtered out.
annotation <- data.frame(
  sample_id = rownames(omega),
  tissue_label = factor(cell.ids))

# use the function from the CountClust package to visualize the results;
# the palette is interpolated so that it provides exactly K colours
options(repr.plot.width = 5, repr.plot.height = 7)
StructureGGplot(omega = omega,
                annotation = annotation,
                palette = colorRampPalette(RColorBrewer::brewer.pal(8, "PRGn"))(K),
                yaxis_label = "Cell type",
                order_sample = TRUE,
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 10,
                                 axis_label_face = "bold"))

Looking at this graph, we can see that cells that underwent the same treatment show similar patterns. This is a good sign.

Let's try to have a look at the results for the SAHA-treated cells.

In [None]:
# Keep the rows of omega whose sample-sheet label is "J-LatA2+SAHA".
# %in% never yields NA, so cells without a label are simply left out.
jlat.SAHA.omega <- omega[
  sampleSheet[rownames(omega), "label"] %in% "J-LatA2+SAHA", ,
  drop = FALSE
]
colnames(jlat.SAHA.omega) <- paste0("module", seq_len(K))

# Add the expression of the FILIONG01 row (the HIV readout) for the same
# cells. It is stored as a plain numeric vector: t() of a one-row data frame
# would put a one-column matrix inside the data frame column.
jlat.SAHA.omega$hiv <- as.numeric(
  unlist(exprMatrix["FILIONG01", rownames(jlat.SAHA.omega)])
)

Let's play around with the correlation between module score and HIV expression levels.

In [None]:
# For every module, regress HIV expression on the module membership and keep
# the p-value of the slope. The result is a numeric vector of length K
# (instead of a list grown inside a loop), so which.min() works on it directly.
pvals <- vapply(seq_len(K), function(i) {
    m <- lm(reformulate(paste0("module", i), response = "hiv"),
            data = jlat.SAHA.omega)
    coefs <- summary(m)$coefficients
    # when the slope cannot be estimated (e.g. constant module column) the
    # coefficient table has no second row: report NA instead of failing
    if (nrow(coefs) < 2) NA_real_ else coefs[2, 4]
}, numeric(1))

# index of the module with the smallest p-value (NA entries are ignored)
which.min(pvals)

In [None]:
# Scatter plot of HIV expression against the membership in module 11 for the
# SAHA-treated cells.
options(repr.plot.width = 3.5, repr.plot.height = 3)
ggplot(jlat.SAHA.omega, aes(x = module11, y = hiv)) +
  geom_point()

It seems that module 11 has something to tell us.

In [None]:
# Row order of theta by decreasing value in column "11"; show the top rows.
module.11.order <- order(theta[["11"]], decreasing = TRUE)
head(theta[module.11.order, ])

In [None]:
library(biomaRt)

In [None]:
# Connect to the Ensembl BioMart, human gene data set.
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

In [None]:
# The five row names of theta that rank highest in module 11.
mygenes <- rownames(theta)[module.11.order[1:5]]

# Look up the UniProt/Swiss-Prot accessions of these versioned Ensembl gene IDs.
mygenes.uniprot <- getBM(
  mart = mart,
  filters = "ensembl_gene_id_version",
  values = mygenes,
  attributes = c("ensembl_gene_id_version", "uniprotswissprot")
)

# Discard missing or empty accessions and print the rest, one per line.
ids <- with(
  mygenes.uniprot,
  uniprotswissprot[!is.na(uniprotswissprot) & nzchar(uniprotswissprot)]
)
cat(paste(ids, collapse = "\n"))

In [None]:
# List the mart attributes whose name starts with "uniprot".
subset(listAttributes(mart), startsWith(name, "uniprot"))