# 2018-11-28 Modules again
We identified a gene module whose genes are, collectively, significantly associated with HIV expression.

By redefining the maximally varying genes with a more robust criterion (based on a negative binomial distribution instead of a very rough cutoff), we discovered one candidate gene that is significantly associated with the difference between responders and non-responders to the SAHA treatment: PUS10.

The disappointing news is that PUS10 does not belong to any of the significant gene modules that we identified earlier. A quick analysis suggests that the problem lies in the criterion used to decide which genes enter the analysis and which ones stay out: that choice has a large impact on the results. Therefore, here I want to use the criterion adopted for the differential expression analysis, based on negative binomial distribution probabilities, to establish whether or not a gene participates in the definition of the modules.

In [None]:
# load WGCNA library
library(WGCNA)
allowWGCNAThreads()

# biomaRt for obtaining information on genes
library(biomaRt)

# ggplot stuff
library(ggplot2)
library(RColorBrewer)
theme_set(theme_bw())

# DESeq
library(DESeq)

# extra goodies
library(Rfast)

In [None]:
# load the data: expression matrix with genes as rows and cells as columns
matrices.dir <- "/home/rcortini/work/CRG/projects/sc_hiv/data/matrices"
merged <- read.table(sprintf('%s/exprMatrix.csv', matrices.dir),
                     header = TRUE, row.names = 1,
                     sep = "\t", check.names = FALSE)

# load sample sheet
sampleSheet <- read.table(sprintf('%s/samplesheet.csv', matrices.dir),
                          header = TRUE,
                          row.names = 1)

# The rest of the analysis applies the same positional masks to the rows of the
# sample sheet and to the columns of the expression matrix, so both must
# describe the same cells in the same order.
stopifnot(nrow(sampleSheet) == ncol(merged))

# remove dead cells from the sample sheet AND from the expression matrix, so
# that the two stay aligned (%in% never yields NA, unlike `!=`)
alive <- !(sampleSheet$status %in% "dead")
sampleSheet <- sampleSheet[alive, ]
merged <- merged[, alive, drop = FALSE]

In [None]:
# read the gene annotation table; its first column provides the row names
gene.annotations <- sprintf("%s/gene_annotations.tsv", matrices.dir)
gene.data <- read.delim(gene.annotations, header = TRUE, sep = "\t",
                        row.names = 1, stringsAsFactors = FALSE)
# keep only the annotated genes that are also rows of the expression matrix
gene.data <- gene.data[rownames(gene.data) %in% rownames(merged), ,
                       drop = FALSE]

In [None]:
# load script
source("/home/rcortini/work/CRG/projects/sc_hiv/scripts/GeneExpressionClustering.R")

In [None]:
# normalize: divide every column of the expression matrix by its column total
totalExpression <- colSums(merged)
# transposing first makes the division recycle totalExpression along the
# original columns; the second t() restores the genes x cells orientation
merged.normalized <- t(t(merged)/totalExpression)

So now, instead of using the naive criterion to define the maximally varying genes, I'll use a new approach. First, prepare the data for interfacing with DESeq.

In [None]:
# pick out the cells of the sample we want: those whose column name in the
# expression matrix starts with "P2449"
P2449.cells <- startsWith(colnames(merged), "P2449")
# the same mask is applied to the sample sheet rows and to the matrix columns
P2449.sampleSheet <- sampleSheet[P2449.cells, ]
P2449 <- merged[, P2449.cells]

In [None]:
# group the cell types together as factors; labels that are not one of the
# three listed levels become NA
groups <- factor(P2449.sampleSheet$label,
                 levels = c("Jurkat", "J-Lat+DMSO", "J-Lat+SAHA"))
table(groups)

# cast to integer, column by column (as.integer truncates any fractional part).
# Assigning into `[]` keeps the data frame as is, so gene names (rows) and cell
# names (columns) are preserved verbatim; as.data.frame(lapply(...)) would
# rewrite the cell names into syntactically valid R names.
P2449.int <- P2449
P2449.int[] <- lapply(P2449, as.integer)

# this is the basic data structure that DESeq understands
cds <- newCountDataSet(P2449.int, groups)

# estimate size factors
cds <- estimateSizeFactors(cds)

# estimate dispersion, using the per-gene estimates only
cds <- estimateDispersions(cds, sharingMode = "gene-est-only")

Now, let's do the differential expression analysis between the treated and non-treated cells.

In [None]:
# DESeq negative binomial test between the two J-Lat conditions: "J-Lat+DMSO"
# is passed as the first condition and "J-Lat+SAHA" as the second
de.test <- nbinomTest(cds, "J-Lat+DMSO", "J-Lat+SAHA")

In [None]:
# Filter and sort the results of a differential expression analysis.
#
# Arguments:
#   de.result  data frame of per-gene test results; the columns `padj`, `pval`
#              and `log2FoldChange` are used
#   alpha      significance threshold applied to the adjusted p-values
#
# Returns the rows of `de.result` with `padj < alpha` and a finite
# `log2FoldChange`, ordered by increasing `pval`.
find.significant.genes <- function(de.result, alpha = 0.05) {

  # filter out significant genes based on FDR adjusted p-values. When padj is
  # NA a bare `padj < alpha` mask contains NA, and indexing rows with NA yields
  # all-NA rows, so missing values are excluded explicitly.
  significant <- !is.na(de.result$padj) & de.result$padj < alpha

  # is.finite() rejects Inf, -Inf, NaN and NA fold changes in one go
  well.defined <- is.finite(de.result$log2FoldChange)

  filtered <- de.result[significant & well.defined, ]

  # order by p-value; returned as the (visible) value of the function
  filtered[order(filtered$pval), ]
}

In [None]:
# perform the filtering and sorting here, with the default threshold
# (alpha = 0.05) on the adjusted p-values
de.genes <- find.significant.genes(de.test)

In [None]:
# get the names of the genes: the `id` column of the filtered test results
genes <- de.genes$id

In [None]:
# now proceed with the clustering, restricted to the differentially expressed
# genes collected in `genes`
# NOTE(review): PrepareDataForClustering is presumably defined in the sourced
# GeneExpressionClustering.R; the meaning of `cut` is not visible from here --
# confirm against that script
P2449.datExpr <- PrepareDataForClustering(P2449, P2449.sampleSheet,
                                          genes = genes,
                                          cut = 10000)

In [None]:
# NOTE(review): PrepareClustering is not defined in this notebook (presumably
# it comes from GeneExpressionClustering.R); it looks like the diagnostic step
# used to choose the `power` passed to blockwiseModules -- confirm
PrepareClustering(P2449.datExpr)

In [None]:
# build the co-expression network and detect gene modules with WGCNA.
# The minimum module size must be spelled `minModuleSize`: a misspelled name
# does not match any formal argument, so the function's default would be used
# instead of the intended 30.
P2449.net <- blockwiseModules(P2449.datExpr,
                        power             = 7,
                        TOMType           = "unsigned",
                        minModuleSize     = 30,
                        reassignThreshold = 0,
                        mergeCutHeight    = 0.25,
                        numericLabels     = TRUE,
                        pamRespectsDendro = FALSE,
                        verbose           = 0)

In [None]:
# display the result of the module detection
# NOTE(review): VisualizeClustering is not defined in this notebook (presumably
# it comes from GeneExpressionClustering.R) -- confirm what exactly it plots
VisualizeClustering(P2449.net)

In [None]:
# NOTE(review): GeneColors and ModuleAnalysis are not defined in this notebook
# (presumably they come from GeneExpressionClustering.R); presumably they map
# genes to module colors and compute per-module statistics -- confirm
P2449.colors <- GeneColors(P2449.datExpr, P2449.net)
P2449.modules <- ModuleAnalysis(P2449.colors, P2449, P2449.sampleSheet)
# show the `stats` element of the module analysis
P2449.modules$stats

In [None]:
# gene identifier (with version suffix) used for PUS10 in this notebook
PUS10 <- "ENSG00000162927.13"
# the row lookup for PUS10 is commented out, so the whole `MMP` element is shown
P2449.modules$MMP #[PUS10,]

In [None]:
# dimensions of the full expression matrix: columns are cells, rows are genes
nSamples <- ncol(merged)
nGenes <- nrow(merged)

In [None]:
# a gene is considered active when the fraction of cells in which its count is
# exactly zero is below `threshold`
# NOTE(review): the fraction is computed over all columns of `merged`, not only
# over the P2449 cells that are clustered afterwards -- confirm this is intended
threshold <- 0.5
percent.zero <- rowSums(merged == 0)/nSamples
active.genes <- rownames(merged)[percent.zero<threshold]

In [None]:
# now proceed with the clustering, this time restricted to the active genes
# NOTE(review): PrepareDataForClustering is presumably defined in the sourced
# GeneExpressionClustering.R; the meaning of `cut` (14000 here, 10000 in the
# previous run) is not visible from here -- confirm against that script
P2449.datExpr <- PrepareDataForClustering(P2449, P2449.sampleSheet,
                                          genes = active.genes,
                                          cut = 14000)

In [None]:
# NOTE(review): PrepareClustering is not defined in this notebook (presumably
# it comes from GeneExpressionClustering.R); it looks like the diagnostic step
# used to choose the `power` passed to blockwiseModules -- confirm
PrepareClustering(P2449.datExpr)

In [None]:
# build the co-expression network and detect gene modules with WGCNA, now on
# the active genes.
# The minimum module size must be spelled `minModuleSize`: a misspelled name
# does not match any formal argument, so the function's default would be used
# instead of the intended 30.
P2449.net <- blockwiseModules(P2449.datExpr,
                        power             = 7,
                        TOMType           = "unsigned",
                        minModuleSize     = 30,
                        reassignThreshold = 0,
                        mergeCutHeight    = 0.25,
                        numericLabels     = TRUE,
                        pamRespectsDendro = FALSE,
                        verbose           = 0)

In [None]:
# display the result of the module detection on the active genes
# NOTE(review): VisualizeClustering is not defined in this notebook (presumably
# it comes from GeneExpressionClustering.R) -- confirm what exactly it plots
VisualizeClustering(P2449.net)

In [None]:
# NOTE(review): GeneColors and ModuleAnalysis are not defined in this notebook
# (presumably they come from GeneExpressionClustering.R); presumably they map
# genes to module colors and compute per-module statistics -- confirm
P2449.colors <- GeneColors(P2449.datExpr, P2449.net)
P2449.modules <- ModuleAnalysis(P2449.colors, P2449, P2449.sampleSheet)
# show the `stats` element of the module analysis
P2449.modules$stats