# 2018-11-09 Other analyses part 2
To avoid namespace collisions I open a new notebook.

## Pooling samples together

I want to try pooling the samples together before performing the clustering analysis. Let's see what comes out.

In [None]:
source("/home/rcortini/work/CRG/projects/sc_hiv/scripts/sc_hiv.R")

In [None]:
# Input location and the samples to load.
matrices.dir <- "/home/rcortini/work/CRG/projects/sc_hiv/data/matrices"
sample.names <- c("P2449", "P2458")

# Per-sample containers, keyed by sample name.
exprMatrices <- list()
sampleSheets <- list()

# Read the expression matrix and the sample sheet of every sample.
# NOTE: this is deliberately a `for` loop: `sample.name` stays defined after
# the loop (holding the last element of `sample.names`) and later cells read it.
for (sample.name in sample.names) {

    # per-sample input paths
    matrix.fname <- file.path(matrices.dir, paste0(sample.name, ".tsv.gz"))
    sampleSheet.fname <- file.path(matrices.dir, "monocle",
                                   paste0(sample.name, ".pd.tsv"))

    # expression matrix: the first column provides the row names, and the
    # column names are kept verbatim (check.names = FALSE)
    exprMatrices[[sample.name]] <- read.table(
        matrix.fname,
        header = TRUE, row.names = 1, sep = "\t", check.names = FALSE
    )

    # sample sheet: tab-separated with a header (the read.delim defaults)
    sampleSheets[[sample.name]] <- read.delim(sampleSheet.fname, row.names = 1)
}

# Gene annotations table, again tab-separated with a header.
gene.annotations <- file.path(matrices.dir, "gene_annotations.tsv")
gene.data <- read.delim(gene.annotations, row.names = 1)

In [None]:
# Pool every loaded sample into one expression matrix (genes in rows, cells in
# columns) and one sample sheet (one row per cell).
#
# cbind() on data frames pairs rows by position, not by name, so first make
# sure that all samples list the same genes in the same order; otherwise the
# pooled matrix would silently mix up genes.
reference.genes <- rownames(exprMatrices[[sample.names[1]]])
for (pooled.name in sample.names) {
    if (!identical(rownames(exprMatrices[[pooled.name]]), reference.genes)) {
        stop("Gene rows of sample ", pooled.name, " do not match those of ",
             sample.names[1], ": cannot pool the expression matrices",
             call. = FALSE)
    }
}

# unname() prevents the list names (the sample names) from being prepended to
# the column / row names of the pooled objects
exprMatrix <- do.call(cbind, unname(exprMatrices[sample.names]))
sampleSheet <- do.call(rbind, unname(sampleSheets[sample.names]))

In [None]:
# Build the clustering input from the pooled expression matrix and sample
# sheet. PrepareDataForClustering is defined outside this notebook (presumably
# in the sourced sc_hiv.R -- TODO confirm).
# NOTE(review): `cut` and `ngenes` look like a per-cell count threshold and the
# number of genes to retain; verify against the helper's definition.
datExpr <- PrepareDataForClustering(exprMatrix, sampleSheet,
                                    cut = 18000,
                                    ngenes = 5000)

In [None]:
# Pre-clustering step on the prepared data. PrepareClustering is a project
# helper not shown here (presumably diagnostics for choosing the
# soft-thresholding power -- TODO confirm).
PrepareClustering(datExpr)

In [None]:
# Detect co-expression modules on the pooled data with WGCNA.
#
# Fix: the minimum module size was passed as `inModuleSize`, which is not a
# parameter of blockwiseModules. R only partial-matches argument *prefixes*,
# so that spelling never reached `minModuleSize` and the requested minimum of
# 30 genes per module was not applied.
net <- blockwiseModules(datExpr,
                        power             = 5,          # soft-thresholding power
                        TOMType           = "unsigned",
                        minModuleSize     = 30,         # smallest module allowed
                        reassignThreshold = 0,
                        mergeCutHeight    = 0.25,       # height for merging modules
                        numericLabels     = TRUE,       # modules as numbers, not colors
                        pamRespectsDendro = FALSE,
                        verbose           = 0)

In [None]:
# Display the clustering result. VisualizeClustering is a project helper not
# shown here (presumably plots the gene dendrogram with the module colors --
# TODO confirm).
VisualizeClustering(net)

In [None]:
# names of the genes that went into the clustering (the columns of datExpr)
myGenes <- colnames(datExpr)

# module assignment stored in the network, as returned by blockwiseModules and
# translated into color names
moduleLabels <- net$colors
moduleColors <- labels2colors(moduleLabels)

In [None]:
# Build the expression matrix on which the module eigengenes are evaluated.
#
# NOTE(review): `sample.name` is not set in this cell; it is whatever the
# data-loading loop left behind, i.e. the last entry of `sample.names`. Only
# that single sample is used here, not the pooled matrix -- confirm that this
# is intended.
myExprMatrix <- exprMatrices[[sample.name]]

# select only the genes that were used for the clustering
myExprMatrix <- myExprMatrix[myGenes, , drop = FALSE]

# select only J-Lat treated cells. which() skips cells whose label is NA, and
# drop = FALSE keeps a data frame even when a single cell is selected.
# (assumes the sample sheet rows are in the same order as the matrix columns
# -- TODO confirm)
myExprMatrix <- myExprMatrix[, which(sampleSheets[[sample.name]]$label == "J-Lat+SAHA"),
                             drop = FALSE]

# select only alive cells: total count over the selected genes above 100000
myExprMatrix <- myExprMatrix[, colSums(myExprMatrix) > 100000, drop = FALSE]

# finally, transpose (cells in rows, genes in columns) to be interfaced to WGCNA
myExprMatrix <- t(myExprMatrix)

In [None]:
# Module eigengenes of the *new* data set: the expression profiles of the
# selected cells are summarized per gene module, using the module assignment
# obtained from the clustering (the original note says those modules come from
# untreated cells; that selection is not visible in this notebook). The result
# is then reordered with orderMEs().
MEs <- orderMEs(moduleEigengenes(myExprMatrix, moduleColors)$eigengenes)

In [None]:
# cells that passed the selection (the rows of the transposed matrix)
myCells <- rownames(myExprMatrix)

# HIV profile of those cells: the "FILIONG01" row of the same sample's
# expression matrix, transposed so that cells are in rows
hiv <- exprMatrices[[sample.name]]["FILIONG01", myCells]
hiv <- t(hiv)

In [None]:
# size of the data set: rows of myExprMatrix are the samples (cells), columns
# are the genes
nSamples <- dim(myExprMatrix)[1L]
nGenes <- dim(myExprMatrix)[2L]

In [None]:
# Correlation between every module eigengene and the HIV expression profile
# (use = "p" is presumably the abbreviation of "pairwise.complete.obs" --
# TODO confirm), followed by the Student p-value of each correlation for
# nSamples observations.
moduleHivCor <- cor(x = MEs, y = hiv, use = "p")
moduleHivPvalue <- corPvalueStudent(cor = moduleHivCor, nSamples = nSamples)

In [None]:
# Module statistics side by side: one row per module eigengene, with its
# correlation to HIV expression and the corresponding p-value. The columns are
# renamed explicitly because data.frame() may keep the matrices' own column
# names instead of the argument names.
moduleStats <- setNames(
    data.frame(correlation = moduleHivCor, pvalue = moduleHivPvalue),
    c("correlation", "p")
)
moduleStats

None of these modules is even remotely as significant as the ones identified when considering the P2449 samples alone.