# 2018-11-16 No dead cells PCA cyclone
Here I want to repeat the analysis done for the PCA and the cell cycle, this time without dead cells.

## Data preprocessing

In [None]:
# load useful libraries
library(ggplot2)
library(RColorBrewer)
theme_set(theme_bw())

In [None]:
# basic data: directory that holds the matrices, and the samples to load
matrices.dir <- "/home/rcortini/work/CRG/projects/sc_hiv/data/matrices"
sample.names <- c("P2449", "P2458")

# load one expression matrix per sample into a list named by sample.
# The first column of each file becomes the row names, and
# check.names = FALSE keeps the column names exactly as written in the header.
exprMatrices <- lapply(sample.names, function(sample.name) {
    matrix.fname <- sprintf("%s/%s.tsv.gz", matrices.dir, sample.name)
    read.table(matrix.fname, header = TRUE, row.names = 1,
               sep = "\t", check.names = FALSE)
})
names(exprMatrices) <- sample.names

# load sample sheet (first column becomes the row names)
# NOTE(review): despite the .csv extension this is read with read.table's
# default whitespace separator -- confirm that matches the file format
sampleSheet <- read.table(sprintf('%s/samplesheet.csv', matrices.dir),
                          header = TRUE,
                          row.names = 1)

In [None]:
# merge the two samples side by side.
# NOTE(review): cbind does not match rows by name, so this assumes both
# matrices list the same genes in the same order -- confirm
exprMatrix <- cbind(exprMatrices[["P2449"]], exprMatrices[["P2458"]])

# remove genes with (almost) no expression: only genes whose total count
# over all cells is greater than 1 are kept
geneExpression <- rowSums(exprMatrix)
exprMatrix <- exprMatrix[geneExpression > 1, ]

# put the sample sheet in the same order as the columns of the expression
# matrix, so that masks computed on the sheet can be applied to the columns
missing.cells <- setdiff(colnames(exprMatrix), rownames(sampleSheet))
if (length(missing.cells) > 0) {
    stop(sprintf("%d cell(s) of the expression matrix are missing from the sample sheet",
                 length(missing.cells)),
         call. = FALSE)
}
sampleSheet <- sampleSheet[colnames(exprMatrix), , drop = FALSE]

# remove dead cells from both the expression matrix and the sample sheet
alive <- sampleSheet$status != "dead"
exprMatrix <- exprMatrix[, alive]
sampleSheet <- sampleSheet[alive, , drop = FALSE]

# get the names of the cells and of the genes that survived the filters
cells <- colnames(exprMatrix)
genes <- rownames(exprMatrix)

In [None]:
# read the gene annotation table (tab-separated, first column used as row
# names) and keep only the rows whose name is among the retained genes
gene.annotations <- sprintf("%s/gene_annotations.tsv", matrices.dir)
gene.data <- read.delim(file = gene.annotations,
                        sep = "\t",
                        header = TRUE,
                        stringsAsFactors = FALSE,
                        row.names = 1)
gene.data <- gene.data[is.element(rownames(gene.data), genes), , drop = FALSE]

In [None]:
# cache the filtered expression matrix and the matching gene annotations on
# disk, so that the preprocessing does not have to be redone every time.
# Both files are written tab-separated, even though they are named *.csv.
write.table(exprMatrix,
            file = sprintf("%s/exprMatrix.csv", matrices.dir),
            sep = "\t", quote = FALSE, row.names = TRUE)

write.table(gene.data,
            file = sprintf("%s/genedata.csv", matrices.dir),
            sep = "\t", quote = FALSE, row.names = TRUE)

In [None]:
# keep the raw (unnormalized) counts of the FILIONG01 row, i.e. the HIV reads
HIV <- exprMatrix["FILIONG01", ]

# normalize by total expression of the cells: every column (cell) is divided
# by the sum of its counts
totalExpression <- colSums(exprMatrix)
normalized <- sweep(as.matrix(exprMatrix), 2, totalExpression, FUN = "/")

# PCA
Let's do the PCA on the treated cells.

In [None]:
# logical mask over the columns of exprMatrix: TRUE for cells whose status in
# the sample sheet is "treated" (despite its name this is a mask, not names)
treated.names <- sampleSheet[colnames(exprMatrix), "status"] == "treated"
treated <- exprMatrix[, treated.names]

# drop the genes that have no counts at all among the treated cells
geneExpression.treated <- rowSums(treated)
treated <- treated[geneExpression.treated > 0, ]

In [None]:
# do the PCA: cells are the observations (rows after transposing) and genes
# the variables, each scaled to unit variance.
# The argument is spelled `scale.` in prcomp; `scale` only worked through
# partial argument matching.
treated.pca <- prcomp(t(treated), scale. = TRUE)

In [None]:
# prepare data for plotting: one row per treated cell with its PC coordinates
pca <- data.frame(treated.pca$x)

# log of the raw HIV counts of the treated cells, stored as a plain numeric
# vector (a one-column matrix inside a data frame is fragile downstream).
# Cells with zero HIV counts get -Inf.
pca$HIV <- log(as.numeric(unlist(HIV[treated.names])))

In [None]:
# scatter plot of the first two principal components, coloured by log HIV
options(repr.plot.width = 4, repr.plot.height = 3)
ggplot(pca, aes(x = PC1, y = PC2, color = HIV)) +
    geom_point() +
    scale_colour_gradient(low = "blue", high = "red") +
    theme_bw()

So, again no interesting information here.

## Cell cycle

In [None]:
# load libraries needed for the analysis
suppressMessages(library(scran))
suppressMessages(library(scater))
suppressMessages(library(SingleCellExperiment))

In [None]:
# build the SingleCellExperiment object with two assays (raw counts and
# per-cell normalized values), gene annotations as row data and the sample
# sheet as column data
sce <- SingleCellExperiment(
    assays = list(counts = as.matrix(exprMatrix),
                  normalized = as.matrix(normalized)),
    colData = DataFrame(sampleSheet),
    rowData = DataFrame(gene.data)
)

In [None]:
# load the list of pairs of genes shipped with scran
markers.path <- system.file("exdata", "human_cycle_markers.rds", package = "scran")
hs.pairs <- readRDS(markers.path)

# prepare the data for cyclone: strip everything from the first dot onwards
# in the gene names (presumably a version suffix -- confirm), keep only the
# first occurrence of each stripped name, and rename the rows accordingly
genes.shortNames <- sub("\\..*", "", rownames(gene.data))
nonDuplicated <- !duplicated(genes.shortNames)
mygenes <- rownames(gene.data)[nonDuplicated]
sce.nonDuplicated <- sce[mygenes, ]
rownames(sce.nonDuplicated) <- genes.shortNames[nonDuplicated]

In [None]:
# run cyclone on the de-duplicated object and store the phase and the scores
# of every cell in the column data of the full object
assignments <- cyclone(sce.nonDuplicated, hs.pairs)
colData(sce)[["phases"]] <- assignments[["phases"]]
colData(sce)[["scores"]] <- assignments[["scores"]]

Now that we have the assignment, we can plot the expression of the normalized or unnormalized GFP levels in the various groups.

In [None]:
# prepare the data for plotting.
# The treated-cell mask is looked up by cell name, in the column order of
# exprMatrix (which is also the column order of HIV, normalized and sce), so
# it cannot be misaligned with the data it is applied to.
treated.cells <- sampleSheet[colnames(exprMatrix), "status"] == "treated"

# one row per treated cell. The raw counts column ends up named "FILIONG01"
# (data.frame keeps the column name of the one-column matrix instead of the
# `raw` tag); the plotting code relies on that name.
hiv.cycle <- data.frame(raw = t(HIV[treated.cells]),
                        normalized = normalized["FILIONG01", treated.cells])
hiv.cycle$phases <- sce$phases[treated.cells]

In [None]:
# box plots of the raw and of the normalized FILIONG01 signal per phase
options(repr.plot.width = 2, repr.plot.height = 2)
ggplot(hiv.cycle, aes(phases, FILIONG01)) +
    geom_boxplot() +
    labs(x = "Phase", y = "GFP")
ggplot(hiv.cycle, aes(phases, normalized)) +
    geom_boxplot() +
    labs(x = "Phase", y = "Normalized GFP")

So this part of the analysis still holds. Let's check that the numbers of cells in the two groups are roughly similar.

In [None]:
# contingency table for the treated cells: assigned phase (rows) against
# whether any FILIONG01 count was detected (columns)
table(sce$phases[treated.cells],
      as.logical(treated["FILIONG01", ] > 0),
      deparse.level = 0)

No they aren't. So let's check that in the case of the untreated cells we have the same number of cells in G1 and G2 (they are growing cells so it should be the case).

In [None]:
# mask of the non-treated cells, looked up by cell name in the column order
# of exprMatrix (the order sce was built in), consistent with how the
# treated cells are selected for the PCA
nontreated.cells <- sampleSheet[colnames(exprMatrix), "status"] == "nontreated"
nontreated.phases <- sce$phases[nontreated.cells]

# number of non-treated cells assigned to each phase; na.rm = TRUE keeps the
# counts defined even if some cells have no phase assigned
print(sum(nontreated.phases == 'G1', na.rm = TRUE))
print(sum(nontreated.phases == 'G2M', na.rm = TRUE))
print(sum(nontreated.phases == 'S', na.rm = TRUE))

So no, there is something fishy going on here. It cannot be that only 1 out of 70 cells is in G2M. So the cell cycle analysis done by `cyclone` is in the end not correct and should be performed differently.