# 2018-11-21 New modules
In the last few notes we redefined dead and alive cells. Let's now recalculate the gene expression modules with this new definition.

I'll copy and paste the functions from the scripts "sc_hiv.R" and "GeneExpressionClustering.R" because we need to adapt them to the new definition of alive vs dead.

In [None]:
# load useful libraries
library(WGCNA)
allowWGCNAThreads()
library(biomaRt)
library(ggplot2)
library(RColorBrewer)
theme_set(theme_bw())

In [None]:
MostVariableGenesNaive <- function(X, ngenes, sampleSheet) {
    # Return the names of the genes that vary the most across untreated cells,
    # ranked by the variance/mean ratio of their expression.
    #
    # Args:
    #   X:           expression table with genes in rows and cells in columns.
    #   ngenes:      maximum number of gene names to return.
    #   sampleSheet: data frame with a `status` column; its rows are used
    #                positionally as a mask on the columns of `X`.
    #
    # Returns:
    #   Character vector with at most `ngenes` row names of `X`, the most
    #   variable gene first.

    # select the untreated J-Lat cells; drop = FALSE keeps a two-dimensional
    # table even when a single cell matches
    untreated <- as.matrix(X[, sampleSheet$status == 'nontreated', drop = FALSE])

    # simple dispersion criterion: variance divided by mean, gene by gene
    gene.variability <- apply(untreated, 1, var) / apply(untreated, 1, mean)

    # order() places NA/NaN ratios (e.g. genes with zero mean) last
    ranked <- order(gene.variability, decreasing = TRUE)

    # head() caps the selection at the number of available genes, so asking
    # for more genes than exist (or for zero genes) never produces NA or
    # spurious entries
    rownames(untreated)[head(ranked, ngenes)]
}

In [None]:
PrepareDataForClustering <- function(X, sampleSheet, genes, cut) {
    # Build the samples-by-genes table used for module detection from the
    # untreated cells, plot the sample dendrogram with the chosen cut height,
    # and drop the samples that fall outside the main cluster.
    #
    # Args:
    #   X:           expression table with genes in rows and cells in columns.
    #   sampleSheet: data frame with a `status` column; its rows are used
    #                positionally as a mask on the columns of `X`.
    #   genes:       genes (rows of `X`) to keep.
    #   cut:         height at which the sample dendrogram is cut.
    #
    # Returns:
    #   Data frame with the retained cells in rows and `genes` in columns.
    #
    # Side effects:
    #   Draws a plot and sets the `repr.plot.*` options. Those options are
    #   deliberately left in place; the graphical parameters are restored.

    # fail early on unknown gene names instead of carrying NA rows forward
    if (is.character(genes)) {
        missingGenes <- setdiff(genes, rownames(X))
        if (length(missingGenes) > 0) {
            stop(sprintf("%d requested gene(s) are not rows of X (first: '%s')",
                         length(missingGenes), missingGenes[1]))
        }
    }

    # select the untreated J-Lat cells
    jlat.untreated <- X[, sampleSheet$status == 'nontreated', drop = FALSE]

    # cells in rows, selected genes in columns
    datExpr0 <- as.data.frame(t(jlat.untreated[genes, , drop = FALSE]))

    # do quality control
    gsg <- goodSamplesGenes(datExpr0, verbose = 3)
    if (!gsg$allOK) {
        stop("Do proper quality control on genes!")
    }

    # extract the hierarchical clustering tree of the samples
    sampleTree <- hclust(dist(datExpr0), method = "average")

    # plot size
    options(repr.plot.width = 10, repr.plot.height = 6)

    # set the graphical parameters for this plot only, and put the previous
    # values back when the function exits so later plots are not affected
    oldPar <- par(cex = 0.6, mar = c(0, 4, 2, 0))
    on.exit(par(oldPar), add = TRUE)

    # detect outliers
    plot(sampleTree,
         main     = "Sample clustering to detect outliers",
         sub      = "",
         xlab     = "",
         cex.lab  = 1.5,
         cex.axis = 1.5,
         cex.main = 2)

    # plot a line to show the cut
    abline(h = cut, col = "red")

    # cut the tree at the user-supplied height; the samples labelled as
    # cluster 1 are the ones we keep
    clust <- cutreeStatic(sampleTree, cutHeight = cut, minSize = 10)
    keepSamples <- (clust == 1)

    # drop = FALSE keeps a data frame even if a single gene was requested
    datExpr0[keepSamples, , drop = FALSE]
}

In [None]:
# this function outputs a plot that allows to choose the best value of the
# soft thresholding power
PrepareClustering <- function (datExpr) {
    # Args:
    #   datExpr: expression table passed as-is to pickSoftThreshold().
    #
    # Returns:
    #   Invisibly, the object returned by pickSoftThreshold(), so that the
    #   fit table can be inspected after looking at the plots.
    #
    # Side effects:
    #   Draws two side-by-side panels and sets the `repr.plot.*` options.

    # candidate soft-thresholding powers
    powers <- c(1:10, seq(from = 12, to = 20, by = 2))

    # call the network topology analysis function
    sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
    fit <- sft$fitIndices

    # quantity shown on the first panel: column 2 of the fit table, with the
    # sign flipped according to column 3
    signedFit <- -sign(fit[, 3]) * fit[, 2]

    # plot size and layout; the layout is restored on exit so that later
    # plots are not squeezed into a half-width panel
    options(repr.plot.width = 10, repr.plot.height = 6)
    oldPar <- par(mfrow = c(1, 2))
    on.exit(par(oldPar), add = TRUE)
    labelCex <- 0.9

    # scale-free topology fit index as a function of the soft-thresholding power
    plot(fit[, 1], signedFit,
         xlab = "Soft Threshold (power)",
         ylab = "Scale Free Topology Model Fit,signed R^2",
         type = "n",
         main = paste("Scale independence"))

    text(fit[, 1], signedFit,
         labels = powers,
         cex    = labelCex,
         col    = "red")

    # reference line at a fit index of 0.90
    abline(h = 0.90, col = "red")

    # mean connectivity as a function of the soft-thresholding power
    plot(fit[, 1], fit[, 5],
         xlab = "Soft Threshold (power)",
         ylab = "Mean Connectivity",
         type = "n",
         main = paste("Mean connectivity"))

    text(fit[, 1], fit[, 5],
         labels = powers,
         cex    = labelCex,
         col    = "red")

    invisible(sft)
}

In [None]:
VisualizeClustering <- function (net) {
    # Draw the gene dendrogram of the first block of `net`, with a band of
    # module colors underneath.

    # the notebook plot size
    options(repr.plot.height = 6, repr.plot.width = 10)

    # only the first block of the network is shown
    block <- 1

    # module labels translated into color names, restricted to the genes
    # that belong to the block
    moduleColors <- labels2colors(net$colors)
    blockColors <- moduleColors[net$blockGenes[[block]]]

    plotDendroAndColors(dendro       = net$dendrograms[[block]],
                        colors       = blockColors,
                        groupLabels  = "Module colors",
                        dendroLabels = FALSE,
                        hang         = 0.03,
                        addGuide     = TRUE,
                        guideHang    = 0.05)
}

In [None]:
GeneColors <- function (datExpr, net) {
    # Build a one-column data frame (`color`) giving the module color of each
    # gene; the row names are the column names of `datExpr`.
    moduleColor <- labels2colors(net$colors)
    data.frame(color = moduleColor, row.names = colnames(datExpr))
}

In [None]:
ModuleAnalysis <- function (colors, exprMatrix, sampleSheet,
                            hivGene = "FILIONG01") {
    # Relate the gene modules to the expression of the HIV reporter in the
    # treated cells.
    #
    # Args:
    #   colors:      data frame whose row names are the genes used to identify
    #                the modules, with a column `color` giving the module of
    #                each gene.
    #   exprMatrix:  expression table with genes in rows and cells in columns.
    #   sampleSheet: data frame with a `status` column; its rows are used
    #                positionally as a mask on the columns of `exprMatrix`.
    #   hivGene:     row name of `exprMatrix` holding the HIV reporter
    #                expression.
    #
    # Returns:
    #   A list with the elements
    #     MEs:   module eigengenes per treated cell, plus a `hiv` column;
    #     stats: correlation of each eigengene with HIV (`cor`) and its
    #            p-value (`p`);
    #     MM:    gene-to-eigengene correlations (module membership);
    #     MMP:   p-values associated to `MM`;
    #     GS:    gene-to-HIV correlation (`GS`) and its p-value (`GSP`).

    # fail early with a clear message rather than propagating NA rows
    if (!(hivGene %in% rownames(exprMatrix))) {
        stop(sprintf("'%s' is not a row of the expression matrix", hivGene))
    }
    missingGenes <- setdiff(rownames(colors), rownames(exprMatrix))
    if (length(missingGenes) > 0) {
        stop(sprintf("%d module gene(s) are not rows of the expression matrix (first: '%s')",
                     length(missingGenes), missingGenes[1]))
    }

    # select only the module genes, in the treated cells, and transpose the
    # table to the cells-by-genes layout given to WGCNA
    myExprMatrix <- t(exprMatrix[rownames(colors), sampleSheet$status == "treated"])

    # get the module eigengenes of the *new* data set: that is, the treated
    # cells are summarized using the gene modules that were passed in
    MEs <- moduleEigengenes(myExprMatrix, colors$color)$eigengenes
    MEs <- orderMEs(MEs)

    # names of the selected cells, and the HIV profile of those cells
    myCells <- rownames(myExprMatrix)
    hiv <- t(exprMatrix[hivGene, myCells])

    # number of cells entering the correlations
    nSamples <- nrow(myExprMatrix)

    # correlate the module eigengenes to the HIV expression pattern, and
    # calculate the corresponding p-value
    moduleHivCor <- cor(MEs, hiv, use = "p")
    moduleHivPvalue <- corPvalueStudent(moduleHivCor, nSamples)

    # prepare the return data structure
    modules <- list()

    # module eigengenes, together with the HIV expression of each cell
    modules[["MEs"]] <- data.frame(MEs)
    rownames(modules[["MEs"]]) <- myCells
    modules[["MEs"]]$hiv <- hiv

    # statistics of the eigengene-to-HIV correlation
    modules[["stats"]] <- data.frame(cor = moduleHivCor, p = moduleHivPvalue)
    names(modules[["stats"]]) <- c("cor", "p")

    # strip the two-character prefix from the eigengene names (MEgrey -> grey)
    modNames <- substring(names(MEs), 3)

    # gene module membership: correlation of every gene with every module
    # eigengene (genes in rows, modules in columns), and matching p-values
    geneModuleMembership <- cor(myExprMatrix, MEs, use = "p")
    MMPvalue <- corPvalueStudent(as.matrix(geneModuleMembership), nSamples)
    colnames(geneModuleMembership) <- modNames
    colnames(MMPvalue) <- modNames
    modules[["MM"]] <- as.data.frame(geneModuleMembership)
    modules[["MMP"]] <- as.data.frame(MMPvalue)

    # gene correlation to HIV expression, along with its p-value
    geneTraitSignificance <- cor(myExprMatrix, hiv, use = "p")
    GSPvalue <- corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)
    GS <- data.frame(TS = geneTraitSignificance, TSP = GSPvalue)
    colnames(GS) <- c("GS", "GSP")
    modules[["GS"]] <- GS

    # return
    modules
}

## Load data

In [None]:
# load the data
matrices.dir <- "/home/rcortini/work/CRG/projects/sc_hiv/data/matrices"
merged <- read.table(file.path(matrices.dir, "exprMatrix.csv"),
                     header = TRUE, row.names = 1,
                     sep = "\t", check.names = FALSE)

# load sample sheet
sampleSheet <- read.table(file.path(matrices.dir, "samplesheet.csv"),
                          header = TRUE,
                          row.names = 1)

# remove dead cells. Every later step uses `sampleSheet$status` as a
# positional mask on the columns of the expression matrix, so the two tables
# must keep the same cells in the same order.
# NOTE(review): this assumes the sample sheet rows follow the column order of
# the expression matrix -- confirm against the files.
alive <- !(sampleSheet$status %in% "dead")
if (ncol(merged) == nrow(sampleSheet)) {
    # the matrix still contains the dead cells: drop the same positions
    merged <- merged[, alive, drop = FALSE]
} else if (ncol(merged) != sum(alive)) {
    stop(sprintf("expression matrix has %d cells but the sample sheet has %d (%d alive)",
                 ncol(merged), nrow(sampleSheet), sum(alive)))
}
sampleSheet <- sampleSheet[alive, ]

## Two plates merged
After loading the data, we have a choice of how to treat it. We can either keep the two samples separate, or merge them and do the clustering on the whole data set. Let's start with the latter.

The thing is that it will be impossible to do clustering on the two merged samples without removing the batch effects first. The simplest possible approach to remove batch effects, without doing the complicated `mnnCorrect` procedure, is to normalize by the total expression.

In [None]:
# normalize: divide every column (cell) by its total expression
totalExpression <- colSums(merged)
merged.normalized <- sweep(as.matrix(merged), 2, totalExpression, "/")

In [None]:
# pick the 5000 most variable genes, then build the clustering input
# NOTE(review): the genes are ranked on the raw counts (`merged`) while the
# clustering input uses the normalized values -- confirm this is intended
merged.most.variable.genes.naive <- MostVariableGenesNaive(X           = merged,
                                                           ngenes      = 5000,
                                                           sampleSheet = sampleSheet)
merged.datExpr <- PrepareDataForClustering(X           = merged.normalized,
                                           sampleSheet = sampleSheet,
                                           genes       = merged.most.variable.genes.naive,
                                           cut         = 0.02)

In [None]:
# diagnostic plots to choose the soft-thresholding power
PrepareClustering(datExpr = merged.datExpr)

In [None]:
# detect the gene modules on the merged data set
# (`minModuleSize` was misspelled `inModuleSize`, so the requested minimum
# module size was not applied)
merged.net <- blockwiseModules(merged.datExpr,
                        power             = 5,
                        TOMType           = "unsigned",
                        minModuleSize     = 30,
                        reassignThreshold = 0,
                        mergeCutHeight    = 0.25,
                        numericLabels     = TRUE,
                        pamRespectsDendro = FALSE,
                        verbose           = 0)

In [None]:
# dendrogram and module colors of the merged data set
VisualizeClustering(net = merged.net)

So here the clustering does not look very encouraging. Let's associate the clustering with the HIV expression.

In [None]:
# module color of every gene, then the module-to-HIV statistics
merged.colors <- GeneColors(datExpr = merged.datExpr, net = merged.net)
merged.modules <- ModuleAnalysis(colors      = merged.colors,
                                 exprMatrix  = merged,
                                 sampleSheet = sampleSheet)

In [None]:
# eigengene-to-HIV correlation and p-value of each module
merged.modules[["stats"]]

So, as we already saw in the previous analysis, there is no significant correlation between the modules here and the HIV expression values.

## Two separate samples

Now let's go back and try to do the analysis with the two samples separated.

### P2449

In [None]:
# select the cells whose name starts with the plate identifier, and take the
# matching columns of the matrix and rows of the sample sheet
P2449.cells <- startsWith(colnames(merged), "P2449")
P2449.sampleSheet <- sampleSheet[P2449.cells, ]
P2449 <- merged[, P2449.cells]

In [None]:
# pick the 5000 most variable genes of this plate, then build the clustering
# input
P2449.most.variable.genes.naive <- MostVariableGenesNaive(X           = P2449,
                                                          ngenes      = 5000,
                                                          sampleSheet = P2449.sampleSheet)
P2449.datExpr <- PrepareDataForClustering(X           = P2449,
                                          sampleSheet = P2449.sampleSheet,
                                          genes       = P2449.most.variable.genes.naive,
                                          cut         = 14000)

In [None]:
# diagnostic plots to choose the soft-thresholding power
PrepareClustering(datExpr = P2449.datExpr)

In [None]:
# detect the gene modules on the P2449 plate
# (`minModuleSize` was misspelled `inModuleSize`, so the requested minimum
# module size was not applied)
P2449.net <- blockwiseModules(P2449.datExpr,
                        power             = 5,
                        TOMType           = "unsigned",
                        minModuleSize     = 30,
                        reassignThreshold = 0,
                        mergeCutHeight    = 0.25,
                        numericLabels     = TRUE,
                        pamRespectsDendro = FALSE,
                        verbose           = 0)

In [None]:
# dendrogram and module colors of the P2449 plate
VisualizeClustering(net = P2449.net)

In [None]:
# module color of every gene, then the module-to-HIV statistics
P2449.colors <- GeneColors(datExpr = P2449.datExpr, net = P2449.net)
P2449.modules <- ModuleAnalysis(colors      = P2449.colors,
                                exprMatrix  = P2449,
                                sampleSheet = P2449.sampleSheet)

In [None]:
# eigengene-to-HIV correlation and p-value of each module
P2449.modules[["stats"]]

So in this case it's interesting because we obtain a result that is different from the one that we obtained before.

In [None]:
# names of the genes whose module color is "grey"
isGrey <- which(P2449.colors$color == "grey")
greyGenes <- rownames(P2449.colors)[isGrey]

In [None]:
# module membership (MM) and gene significance (GS) of the grey genes, one
# row per gene
greyMM <- P2449.modules[["MM"]][greyGenes, "grey"]
greyGS <- P2449.modules[["GS"]][greyGenes, "GS"]
grey <- data.frame(MM = greyMM, GS = greyGS, row.names = greyGenes)

In [None]:
# gene significance against module membership for the grey genes, with a
# linear fit; the plot title is set through `title` (`main` is a base
# graphics argument that ggplot2 does not display)
options(repr.plot.width = 3, repr.plot.height = 3)
ggplot(grey, aes(x = MM, y = GS)) +
       geom_point(col="darkgreen") +
       geom_smooth(method="lm", col="black") +
       labs(title = "Grey module",
            x = "Module Membership",
            y = "Gene Significance")

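This analysis brings out an interesting module that was not picked up before. This "grey" module is full of genes that have a high correlation to HIV expression. Note, however, that with numeric labels WGCNA reserves label 0 (which `labels2colors` renders as "grey") for genes that were not assigned to any module, so "grey" is a catch-all set rather than a proper co-expression module. Let's pick the highest performing genes and make plots of their correlation to HIV expression.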

In [None]:
# names of the grey genes, from the highest to the lowest gene significance
bySignificance <- order(grey$GS, decreasing = TRUE)
grey.ordered <- rownames(grey)[bySignificance]

In [None]:
# one row per treated cell: the expression of each grey gene (columns in
# decreasing order of gene significance) followed by the HIV expression
treated <- P2449.sampleSheet$status=="treated"
greyExpr <- t(P2449[grey.ordered, treated])
hivExpr <- t(P2449["FILIONG01", treated])
X <- setNames(data.frame(expr = greyExpr, hiv = hivExpr),
              c(grey.ordered, "hiv"))

In [None]:
# plot HIV expression against the expression of the most significant grey
# genes (the first columns of X), at most ten of them
options(repr.plot.width = 2.5, repr.plot.height = 2)
for (i in seq_len(min(10, length(grey.ordered)))) {
    mygene <- grey.ordered[[i]]

    # correlation to HIV (GS) and its p-value (GSP) for this gene
    geneStats <- P2449.modules[["GS"]][mygene, ]

    # the gene name is back-quoted because aes_string() parses its arguments
    # as R expressions, which breaks on names such as "HLA-A"
    gg <- ggplot(X, aes_string(x = sprintf("`%s`", mygene), y = "hiv")) +
        geom_point() +
        labs(x = mygene, y = "GFP expression",
             title = sprintf("r = %.3f, p = %.3g",
                             geneStats[["GS"]], geneStats[["GSP"]]))
    print(gg)
}

At this point I could also do the analysis for the P2458 sample, but I would actually like to stop and rethink how I defined the significant genes in the first place. These plots show that some of the genes that were dubbed maximally varying in the untreated cells are actually stably repressed in the treated cells. So it would perhaps be better to define the maximally varying genes differently, probably in terms of maximal variability *between the groups* rather than *within the groups*. This way we would avoid the tricky problem of a very high correlation between the HIV expression and the gene expression being driven entirely by one outlier.