In [None]:
# ggplot stuff: load the plotting libraries and make the black-and-white
# theme the default for every ggplot2 plot drawn afterwards
library(ggplot2)
library(RColorBrewer)
theme_set(theme_bw())

# 2019-03-12 Combinations of Differential Expression Analysis

So far I have done the differential expression analysis (DEA) only between the separate groups of responders and non-responders. What happens if I do the DEA between non-treated cells and responders, or between non-treated cells and non-responders?

In [None]:
# read the expression matrix: the first column holds the row names, and the
# column names are kept exactly as they appear in the file
data.dir <- "/home/rcortini/work/CRG/projects/sc_hiv/data"
matrix.fname <- sprintf('%s/matrices/exprMatrix.tsv', data.dir)
exprMatrix <- read.table(
    file = matrix.fname,
    sep = "\t",
    header = TRUE,
    row.names = 1,
    check.names = FALSE
)

In [None]:
# read the sample sheet; its first column provides the row names
sample.sheet.fname <- sprintf("%s/metadata/sampleSheet.tsv", data.dir)
sampleSheet <- read.delim(
    file = sample.sheet.fname,
    row.names = 1,
    header = TRUE
)

In [None]:
# read the gene annotations (first column = row names, strings kept as
# character)
gene.annotations <- sprintf("%s/matrices/gene_annotations.tsv", data.dir)
gene.data <- read.delim(
    file = gene.annotations,
    sep = "\t",
    header = TRUE,
    row.names = 1,
    stringsAsFactors = FALSE
)

# keep only the annotations of the genes that are in the expression matrix
gene.data <- gene.data[rownames(gene.data) %in% rownames(exprMatrix), ,
                       drop = FALSE]

## Preliminaries

In [None]:
# DESeq
library(DESeq)

In [None]:
# Run a DESeq differential expression analysis between two groups of cells.
#
# Arguments:
#   expr.matrix: counts, one row per gene and one column per cell
#   groups:      condition of each cell, in the column order of expr.matrix
#   gene.data:   gene annotations, with the gene ids as row names
#   g1, g2:      the two conditions in `groups` to compare
#   method:      "per-condition" (default) or "per-gene"; selects how the
#                dispersions are estimated
#
# Returns the data frame produced by nbinomTest, with an extra `symbol`
# column looked up in gene.data. Stops with "Invalid method" if `method` is
# not one of the two values above.
do.DEA <- function(expr.matrix, groups, gene.data,
                   g1, g2, method = "per-condition") {

    # fail early on inconsistent input, before the expensive fitting steps
    if (length(method) != 1 || !(method %in% c("per-gene", "per-condition"))) {
        stop("Invalid method")
    }
    if (length(groups) != ncol(expr.matrix)) {
        stop("groups must have one entry per column of expr.matrix")
    }
    if (!all(c(g1, g2) %in% as.character(groups))) {
        stop("g1 and g2 must both be present in groups")
    }

    # cast the expression matrix to integer, otherwise DESeq will complain.
    # Going through a matrix keeps the row names and the cell names untouched
    # (as.data.frame on a list rewrites names such as "a-b" into "a.b").
    count.matrix <- as.matrix(expr.matrix)
    storage.mode(count.matrix) <- "integer"

    # this is the basic data structure that DESeq understands
    cds <- newCountDataSet(count.matrix, groups)

    # estimate size factors
    cds <- estimateSizeFactors(cds)

    # estimate dispersion
    if (method == "per-gene") {
        cds <- estimateDispersions(cds, sharingMode = "gene-est-only")
    } else {
        cds <- estimateDispersions(cds, method = "per-condition",
                                   fitType = "local")
    }

    # do the differential expression analysis
    de.test <- nbinomTest(cds, g1, g2)

    # attach the gene annotation to the results; the ids are forced to
    # character so that the lookup is by row name even if `id` is a factor.
    # NOTE(review): this yields a plain vector only if gene.data has a single
    # column -- confirm against the annotation file.
    de.test$symbol <- gene.data[as.character(de.test$id), ]

    # return
    de.test
}

In [None]:
# Filter and sort the results of a differential expression analysis.
#
# Arguments:
#   de.result: data frame with (at least) the columns padj, pval and
#              log2FoldChange
#   alpha:     threshold on the FDR-adjusted p-value (default 0.05)
#
# Returns the rows of de.result with padj < alpha and a log2FoldChange that is
# neither infinite nor NaN, ordered by increasing pval.
find.significant.genes <- function(de.result, alpha = 0.05) {

  # which() discards the rows for which the test evaluates to NA (e.g. genes
  # without an adjusted p-value); indexing with the raw logical vector would
  # turn each of them into a row full of NAs
  keep <- which((de.result$padj < alpha) &
                !is.infinite(de.result$log2FoldChange) &
                !is.nan(de.result$log2FoldChange))
  filtered <- de.result[keep, , drop = FALSE]

  # order by p-value; returned visibly so that the result also prints when
  # the function is called without assigning it
  filtered[order(filtered$pval), , drop = FALSE]
}

In [None]:
# identify the alive cells from a PCA of the per-cell normalised expression

# discard the genes whose total count over all the cells is at most one
norm.exprMatrix <- exprMatrix[rowSums(exprMatrix) > 1, ]

# total count of each cell, used to turn the counts into fractions
total <- colSums(norm.exprMatrix)
norm.exprMatrix <- sweep(as.matrix(norm.exprMatrix), 2, total, "/")

# PCA with the cells as observations and the genes as scaled variables
exprMatrix.pca <- prcomp(t(norm.exprMatrix), scale. = TRUE)
pca <- as.data.frame(exprMatrix.pca$x)

# the cells with a negative first principal component are taken as alive
# NOTE(review): the sign of a principal component is arbitrary, so this cutoff
# should be re-checked whenever the input data changes
alive.cells <- rownames(pca)[pca[["PC1"]] < 0]

In [None]:
# keep only the alive cells, both in the sample sheet (rows) and in the
# expression matrix (columns)
sampleSheet <- sampleSheet[rownames(sampleSheet) %in% alive.cells, ,
                           drop = FALSE]
exprMatrix <- exprMatrix[, alive.cells]

In the following blocks, I'll give a label to all the cells, so that I don't have to prepare an expression matrix for each different test that I want to make.

In [None]:
# display the sample sheet to see which labels are available
sampleSheet

In [None]:
# Label the cells by treatment and by response. which() is used for the row
# selections so that a missing label cannot produce an NA cell name.

# treated cells
SAHA.treated <- rownames(sampleSheet)[which(sampleSheet$label == "J-LatA2+SAHA")]
PMA.treated <- rownames(sampleSheet)[which(sampleSheet$label == "J-LatA2+PMA")]
all.treated <- union(SAHA.treated, PMA.treated)

# non-treated cells
# (the row names must be taken from the sample sheet and then subset:
# rownames() of the logical comparison itself is always NULL)
jlat.DMSO.treated <- rownames(sampleSheet)[which(sampleSheet$label == "J-LatA2+DMSO")]
jkt.DMSO.treated <- rownames(sampleSheet)[which(sampleSheet$label == "Jurkat+DMSO")]
all.nontreated <- union(jlat.DMSO.treated, jkt.DMSO.treated)

# responders and nonresponders: treated cells with, respectively without,
# counts for FILIONG01
responders <- all.treated[which(unlist(exprMatrix["FILIONG01", all.treated]) > 0)]
nonresponders <- all.treated[which(unlist(exprMatrix["FILIONG01", all.treated]) == 0)]

# intersections
SAHA.responders <- intersect(SAHA.treated, responders)
SAHA.nonresponders <- intersect(SAHA.treated, nonresponders)
PMA.responders <- intersect(PMA.treated, responders)
PMA.nonresponders <- intersect(PMA.treated, nonresponders)

In [None]:
# define the groups: one label per column of the expression matrix
# NOTE(review): every cell that is in neither SAHA.treated nor PMA.treated
# ends up in "DMSO-treated", whatever its label (the file also refers to
# labels such as "Jurkat+SAHA") -- confirm that this is intended
mylevels <- c("DMSO-treated", "SAHA-treated", "PMA-treated")
groups <- factor(
    ifelse(colnames(exprMatrix) %in% PMA.treated, "PMA-treated",
           ifelse(colnames(exprMatrix) %in% SAHA.treated, "SAHA-treated",
                  "DMSO-treated")),
    levels = mylevels
)

In [None]:
# first few column names of the expression matrix
head(colnames(exprMatrix))

In [None]:
# first few group labels, to compare with the column names above
head(groups)

In [None]:
# first few column names of the expression matrix
head(colnames(exprMatrix))

In [None]:
# sample sheet entry of one specific cell
sampleSheet['P2771_N715-S506', ]

In [None]:
# DMSO-treated versus SAHA-treated cells, then keep the genes with an
# adjusted p-value below 0.1
de.test <- do.DEA(expr.matrix = exprMatrix,
                  groups = groups,
                  gene.data = gene.data,
                  g1 = "DMSO-treated",
                  g2 = "SAHA-treated",
                  method = "per-condition")
de.genes <- find.significant.genes(de.result = de.test, alpha = 0.1)

In [None]:
# number of rows (genes retained by the filter) and columns of the result
dim(de.genes)

Responders versus non-responders given the same drug.

### SAHA

In [None]:
# SAHA responders and non-responders; every other cell is labelled "none"
mylevels <- c("none", "SAHA-responders", "SAHA-nonresponders")
groups <- factor(
    ifelse(colnames(exprMatrix) %in% SAHA.nonresponders, "SAHA-nonresponders",
           ifelse(colnames(exprMatrix) %in% SAHA.responders, "SAHA-responders",
                  "none")),
    levels = mylevels
)

In [None]:
# SAHA responders versus SAHA non-responders, then keep the genes with an
# adjusted p-value below 0.1
de.test <- do.DEA(expr.matrix = exprMatrix,
                  groups = groups,
                  gene.data = gene.data,
                  g1 = "SAHA-responders",
                  g2 = "SAHA-nonresponders",
                  method = "per-condition")
de.genes <- find.significant.genes(de.result = de.test, alpha = 0.1)

In [None]:
# gene id of PUS10; it has to be defined here because the cell that defines
# it further down has not run yet when the notebook is executed in order
PUS10 <- "ENSG00000162927.13"

# result of the test for PUS10
de.test[de.test$id == PUS10, ]

### PMA

In [None]:
# PMA responders and non-responders; every other cell is labelled "none"
mylevels <- c("none", "PMA-responders", "PMA-nonresponders")
groups <- factor(
    ifelse(colnames(exprMatrix) %in% PMA.nonresponders, "PMA-nonresponders",
           ifelse(colnames(exprMatrix) %in% PMA.responders, "PMA-responders",
                  "none")),
    levels = mylevels
)

In [None]:
# PMA responders versus PMA non-responders, then keep the genes with an
# adjusted p-value below 0.1
de.test <- do.DEA(expr.matrix = exprMatrix,
                  groups = groups,
                  gene.data = gene.data,
                  g1 = "PMA-responders",
                  g2 = "PMA-nonresponders",
                  method = "per-condition")
de.genes <- find.significant.genes(de.result = de.test, alpha = 0.1)

In [None]:
# display the genes retained by the filter, ordered by p-value
de.genes

## Some other thoughts
The genes that appear as significant in the differential expression analysis do not seem to have much of a biological connection to HIV. This means that something here is going wrong.

Let's have a look at some descriptive statistics.

In [None]:
# one name per cell (column of the expression matrix)
cell.names <- colnames(exprMatrix)

# batch of each cell: "new" when the name starts with P2769, P2770 or P2771,
# "old" otherwise
batch <- factor(rep_len("old", length(cell.names)), levels = c("old", "new"))
batch[substr(cell.names, 1, 5) %in% c("P2769", "P2770", "P2771")] <- "new"

In [None]:
# position of every cell in the sample sheet, so that the sample sheet
# columns can be read in the order of the expression matrix columns
match.order <- match(cell.names, rownames(sampleSheet))

# treatment of each cell: SAHA or PMA according to its label, "nontreated"
# for everything else
treatment <- factor(rep_len("nontreated", length(cell.names)),
                    levels = c("nontreated", "SAHA", "PMA"))
treatment[(sampleSheet$label %in%
           c("J-LatA2+SAHA", "Jurkat+SAHA"))[match.order]] <- "SAHA"
treatment[(sampleSheet$label %in%
           c("J-LatA2+PMA", "Jurkat+PMA"))[match.order]] <- "PMA"

In [None]:
# responder status of each cell: "responder" when the FILIONG01 count is
# greater than 2, "nonresponder" otherwise
responder <- factor(rep_len("nonresponder", length(cell.names)),
                    levels = c("nonresponder", "responder"))
responder[unlist(exprMatrix["FILIONG01", ]) > 2] <- "responder"

In [None]:
# infection status of each cell: "infected" when its sample sheet label
# starts with "J-LatA2", "noninfected" otherwise
infected <- factor(rep_len("noninfected", length(cell.names)),
                   levels = c("noninfected", "infected"))
infected[grepl("^J-LatA2", as.character(sampleSheet$label))[match.order]] <- "infected"

In [None]:
# gather the per-cell variables in one data frame, one row per cell
celldata <- data.frame(batch, treatment, responder, infected,
                       row.names = cell.names)

Now we make a table to see whether things make sense.

In [None]:
# cross-tabulate all the per-cell variables.
# The former `exclude = "FILIONG01"` argument was dropped: `exclude` removes
# factor levels, and no column of celldata has a level with that name.
table(celldata)

Okay, so we have that about 50% of the cells are responders in all cases.

Let's now add the information about PUS10.

In [None]:
# PUS10 status of each cell: "PUS10-active" when the count of the gene is
# greater than 2, "PUS10-inactive" otherwise
PUS10 <- "ENSG00000162927.13"
PUS10.active <- factor(rep_len("PUS10-inactive", length(cell.names)),
                       levels = c("PUS10-inactive", "PUS10-active"))
PUS10.active[unlist(exprMatrix[PUS10, ]) > 2] <- "PUS10-active"

# add it to the per-cell data
celldata[["PUS10"]] <- PUS10.active

In [None]:
# keep only the infected cells that were treated with SAHA
my.celldata <- celldata[which(celldata$infected == "infected" &
                              celldata$treatment == "SAHA"), , drop = FALSE]
# my.celldata <- subset(my.celldata, my.celldata$batch == "new")

# drop the columns that were used for the selection, plus the batch. They are
# removed by name: if one of them is missing, the others are still kept
# (a negative which() index would instead drop every column).
my.celldata <- my.celldata[, !(colnames(my.celldata) %in%
                               c("infected", "treatment", "batch")),
                           drop = FALSE]
my.celldata
# table(celldata)

In [None]:
# Fisher's exact test on the contingency table of the remaining columns
# (expected to be `responder` and `PUS10` at this point -- TODO confirm)
fisher.test(table(my.celldata))