# 2018-11-09 Genes in significant modules

I was asked for the list of genes that belong to the modules significantly associated with HIV. To produce it, I have to rebuild the gene networks, recompute the module eigengenes, and add annotation information for the genes. Finally, I save the resulting tables to files in a compact format.

In [None]:
# load our lovely script
source("/home/rcortini/work/CRG/projects/sc_hiv/scripts/GeneExpressionClustering.R")
library(biomaRt)

In [None]:
# where the expression matrices live, and which samples we want to read
matrices.dir <- "/home/rcortini/work/CRG/projects/sc_hiv/data/matrices"
sample.names <- c("P2449", "P2458")

# per-sample containers, keyed by sample name and filled in by the loop below
exprMatrices <- list()
sampleSheets <- list()

# read the expression matrix and the sample sheet of every sample
for (sample.name in sample.names) {

    # the expression matrix sits directly in `matrices.dir`, the sample sheet
    # in its "monocle" subdirectory
    matrix.fname <- file.path(matrices.dir, paste0(sample.name, ".tsv.gz"))
    sampleSheet.fname <- file.path(matrices.dir, "monocle",
                                   paste0(sample.name, ".pd.tsv"))

    # both files are tab-separated with a header line, and their first column
    # provides the row names; check.names = FALSE keeps the column names of
    # the matrix exactly as they are written in the file
    exprMatrices[[sample.name]] <- read.table(matrix.fname, sep = "\t",
                                              header = TRUE, row.names = 1,
                                              check.names = FALSE)
    sampleSheets[[sample.name]] <- read.delim(sampleSheet.fname, row.names = 1)
}

# gene annotations table: tab-separated, first column used as row names
gene.annotations <- file.path(matrices.dir, "gene_annotations.tsv")
gene.data <- read.delim(gene.annotations, row.names = 1)

In [None]:
# select our sample: every following cell works on this single sample, and
# `sample.name` is also read later when naming the output files
sample.name <- "P2449"
exprMatrix <- exprMatrices[[sample.name]]
sampleSheet <- sampleSheets[[sample.name]]

# filter the expression data
# PrepareDatExpr comes from the sourced GeneExpressionClustering.R script.
# NOTE(review): presumably `ngenes` is the number of genes retained and `cut`
# a filtering threshold -- confirm against the script. Later cells use the
# column names of `datExpr` as the selected gene identifiers.
datExpr <- PrepareDatExpr(exprMatrix, sampleSheet, ngenes = 5000, cut = 14000)

In [None]:
# reconstruct the network
# ClusterGenes comes from the sourced GeneExpressionClustering.R script; the
# later cells only rely on `net$colors`, the per-gene module labels.
# NOTE(review): `softThresholdPower` is presumably the WGCNA soft-thresholding
# power -- confirm against the script.
net <- ClusterGenes(datExpr, softThresholdPower = 5)

# associate the network modules to HIV expression patterns
# NOTE(review): `aliveThreshold` carries the same value (100000) that is used
# further down to select alive cells by total counts -- keep the two in sync.
modules <- AssociateClustersToHIV(datExpr, exprMatrix, sampleSheet, net,
                                  aliveThreshold = 100000)

Now we go on and calculate the module membership (MM) and gene significance (GS) for the individual modules.

In [None]:
# Rebuild the cells x genes matrix on which module membership and trait
# significance are computed, together with the matching HIV profile.

# the genes retained by the earlier filtering step
myGenes <- colnames(datExpr)

# keep those genes (rows) and only the J-Lat treated cells (columns)
# NOTE(review): the label mask is positional -- this assumes that the rows of
# the sample sheet are in the same order as the columns of the expression
# matrix; confirm.
myExprMatrix <- exprMatrices[[sample.name]][
    myGenes,
    sampleSheets[[sample.name]]$label == "J-Lat+SAHA"
]

# keep only the alive cells, i.e. those whose counts add up to more than
# 100000; the sum runs over the selected genes only
myExprMatrix <- myExprMatrix[, colSums(myExprMatrix) > 100000]

# WGCNA wants cells on the rows and genes on the columns
myExprMatrix <- t(myExprMatrix)

# names of the surviving cells, and the "FILIONG01" row of the full expression
# matrix restricted to those cells (the HIV profile), as a column
myCells <- rownames(myExprMatrix)
hiv <- t(exprMatrices[[sample.name]]["FILIONG01", myCells])

In [None]:
# parameters of our data set
# `myExprMatrix` was transposed above, so its columns are genes and its rows
# are cells; `nSamples` is passed to corPvalueStudent further down
nGenes <- ncol(myExprMatrix)
nSamples <- nrow(myExprMatrix)

In [None]:
# colour label of every gene, and the module eigengenes computed on the
# selected cells (reordered by orderMEs)
moduleColors <- labels2colors(net$colors)
MEs <- orderMEs(moduleEigengenes(myExprMatrix, moduleColors)$eigengenes)

# module names: the eigengene column names without their first two characters
modNames <- substring(colnames(MEs), 3)

# gene module membership (MM): correlation of every gene with every module
# eigengene, plus the p-values that corPvalueStudent derives from them
geneModuleMembership <- as.data.frame(cor(myExprMatrix, MEs, use = "p"))
MMPvalue <- as.data.frame(
    corPvalueStudent(as.matrix(geneModuleMembership), nSamples)
)
colnames(geneModuleMembership) <- modNames
colnames(MMPvalue) <- paste0("p.", modNames)

# gene trait significance (GS): correlation of every gene with the HIV
# profile, plus the corresponding p-values
geneTraitSignificance <- as.data.frame(cor(myExprMatrix, hiv, use = "p"))
GSPvalue <- as.data.frame(
    corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)
)
colnames(geneTraitSignificance) <- "GS.HIV"
colnames(GSPvalue) <- "p.GS.HIV"

The next step is to get the information for the genes in the significant modules. I'll encapsulate the code that generates the table in a single function, so that I can invoke the same piece of code for every module I want to study. This function fetches information from the Ensembl database through the `biomaRt` R package, which does the magic.

In [None]:
# Build, save and return the annotation table of the genes of one module.
#
# Arguments:
#   module.color           colour name of the module of interest, as produced
#                          by labels2colors(net$colors)
#   net                    network object; only `net$colors` is read
#   datExpr                expression data whose column names are the gene
#                          identifiers, in the same order as `net$colors`
#   mart                   BioMart object handed over to getBM
#   geneModuleMembership   genes x modules table, columns named by module
#                          colour
#   MMPvalue               genes x modules table, columns named "p.<colour>"
#   geneTraitSignificance  per-gene trait significance, rows named by gene
#   GSPvalue               per-gene p-value of the trait significance
#   out.dir                directory the CSV file is written to; defaults to
#                          the global `matrices.dir`
#   sample.label           prefix of the CSV file name; defaults to the global
#                          `sample.name`
#
# Returns a data frame with one row per gene found in the Ensembl database
# (row names are the versioned Ensembl gene IDs) and the columns hgnc_symbol,
# description, MM, MMP, GS and GSP. As a side effect the same table is written
# to "<out.dir>/<sample.label>-<module.color>.csv".
#
# Stops with an error if no gene is assigned to `module.color`.
GenesOfModuleTable <- function(module.color, net, datExpr, mart,
                              geneModuleMembership, MMPvalue,
                              geneTraitSignificance, GSPvalue,
                              out.dir = matrices.dir,
                              sample.label = sample.name) {

    # get the list of colors
    geneColors <- labels2colors(net$colors)

    # select the genes from that module
    genes.idx <- which(geneColors == module.color)
    if (length(genes.idx) == 0) {
        stop(sprintf("no genes are assigned to module '%s'", module.color),
             call. = FALSE)
    }

    # index the column names rather than the data: `datExpr[, genes.idx]`
    # collapses to a bare vector (whose colnames are NULL) when the module
    # contains a single gene
    genes <- colnames(datExpr)[genes.idx]

    # get information on the genes
    info <- getBM(attributes = c("ensembl_gene_id_version", "hgnc_symbol", "description"),
                  filters = "ensembl_gene_id_version",
                  values = genes,
                  mart = mart)

    # the query can return several rows for the same gene (e.g. a gene with
    # more than one symbol); keep the first one, otherwise the row names below
    # would not be unique
    info <- info[!duplicated(info$ensembl_gene_id_version), , drop = FALSE]

    # use the "ensembl_gene_id_version" column as row names of the data frame
    rownames(info) <- info$ensembl_gene_id_version
    info$ensembl_gene_id_version <- NULL

    # Not all the genes that we asked for are necessarily found, so from here
    # on the genes of interest are the ones that the query returned.
    genes <- rownames(info)

    # put everything together
    info <- cbind(info,
                  MM  = geneModuleMembership[genes, module.color],
                  MMP = MMPvalue[genes, paste0("p.", module.color)],
                  GS  = geneTraitSignificance[genes, ],
                  GSP = GSPvalue[genes, ])

    # save to file; col.names = NA writes an empty header cell above the
    # row-name column
    write.table(info,
            file = sprintf("%s/%s-%s.csv", out.dir, sample.label, module.color),
            sep = ",", col.names = NA)

    # return
    as.data.frame(info)
}

In [None]:
# load the data corresponding to human genome in the ENSEMBL Mart
# (this contacts the Ensembl BioMart service, so it needs network access);
# the resulting object is handed to GenesOfModuleTable for its getBM query
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

In [None]:
# here I select the modules that are significantly associated to HIV expression
# each call returns the annotation table of one module and, as a side effect,
# writes it to a CSV file named after the sample and the module colour
darkGreen.info <- GenesOfModuleTable("darkgreen", net, datExpr, mart,
                              geneModuleMembership, MMPvalue,
                              geneTraitSignificance, GSPvalue)

darkTurquoise.info <- GenesOfModuleTable("darkturquoise", net, datExpr, mart,
                              geneModuleMembership, MMPvalue,
                              geneTraitSignificance, GSPvalue)