In [None]:
library(DESeq)
library(ggplot2)

In [None]:
# Load the merged expression matrix. The file is read as tab-separated
# despite its .csv extension; check.names = FALSE keeps the column names
# exactly as they are written in the file.
matrices.dir <- "/home/rcortini/work/CRG/projects/sc_hiv/data/matrices"
merged <- read.table(sprintf('%s/exprMatrix.csv', matrices.dir),
                     header = TRUE, row.names = 1,
                     sep = "\t", check.names = FALSE)

# Load the sample sheet; its first column becomes the row names.
# NOTE(review): no `sep` is given, so fields are split on whitespace --
# confirm that this matches the layout of samplesheet.csv.
sampleSheet <- read.table(sprintf('%s/samplesheet.csv', matrices.dir),
                          header = TRUE,
                          row.names = 1)

# Remove dead cells. `%in%` never returns NA, so a sample with a missing
# status keeps its row intact instead of being turned into an all-NA row,
# which is what indexing with `status != "dead"` would do.
# NOTE(review): `merged` is not filtered here, yet later cells index its
# columns with masks built from sampleSheet -- presumably exprMatrix.csv
# already excludes the dead cells; confirm.
sampleSheet <- sampleSheet[!(sampleSheet$status %in% "dead"), ]

In [None]:
# Read the gene annotation table. read.delim() already defaults to a header
# row and tab-separated fields; the first column supplies the row names.
gene.annotations <- sprintf("%s/gene_annotations.tsv", matrices.dir)
gene.data <- read.delim(gene.annotations, row.names = 1,
                        stringsAsFactors = FALSE)

# Keep only the annotation rows whose id is also a row of `merged`.
gene.data <- gene.data[rownames(gene.data) %in% rownames(merged), ,
                       drop = FALSE]

In [None]:
# Fetch the ENSEMBL gene id of PUS10 from the annotation table.
# `%in%` is used instead of `==` so that rows with a missing gene symbol are
# skipped rather than contributing NA ids.
PUS10 <- rownames(gene.data)[gene.data$gene_symbol %in% "PUS10"]

# The cells below use PUS10 as a single row index; fail early with a clear
# message if the symbol is absent or ambiguous.
if (length(PUS10) != 1L) {
  stop("expected exactly one ENSEMBL id for PUS10, found ", length(PUS10),
       call. = FALSE)
}

In [None]:
# One row per column of `merged`: the PUS10 value and the sample label.
# NOTE(review): this pairs the columns of `merged` with the rows of
# `sampleSheet` by position -- confirm both are in the same order.
p <- data.frame(PUS10 = t(merged[PUS10, ]), type = sampleSheet$label)

# Group the rows by sample type in a fixed order; labels outside this list
# get NA from match() and are sorted last.
p <- p[order(match(p$type, c("Jurkat", "J-Lat+DMSO", "J-Lat+SAHA"))), ]

# Running index used as the x coordinate. It is sized from `p` itself, so it
# always matches the frame it is added to (and is empty for an empty frame).
p$x <- seq_len(nrow(p))

# Set readable column names; the transposed slice may carry the ENSEMBL id
# as the name of the first column.
colnames(p) <- c("PUS10", "type", "x")

In [None]:
# Inline notebook rendering size, same numbers as the saved figure.
options(repr.plot.width = 6, repr.plot.height = 2)

# PUS10 value against the running index in `p`, coloured by sample type.
ggplot(data = p, mapping = aes(x = x, y = PUS10)) +
  geom_point(mapping = aes(color = type)) +
  theme_bw() +
  labs(x = "")

# No `plot =` is given, so ggsave() writes the most recent plot.
ggsave(filename = "../figures/PUS10_distribution.png", width = 6, height = 2)

In [None]:
# PUS10 and "FILIONG01" (labelled GFP in this script) for the treated
# samples only, one row per sample.
# `%in%` keeps the column mask free of NA when a status is missing, and
# drop = FALSE keeps each slice a data frame even if only one sample is
# treated.
pus10.gfp <- data.frame(
  PUS10 = t(merged[PUS10, sampleSheet$status %in% "treated", drop = FALSE]),
  GFP   = t(merged["FILIONG01", sampleSheet$status %in% "treated",
                   drop = FALSE])
)
colnames(pus10.gfp) <- c("PUS10", "GFP")

In [None]:
# Inline notebook rendering size, same numbers as the saved figure.
options(repr.plot.width = 3, repr.plot.height = 3)

# Scatter of GFP against PUS10 for the rows of `pus10.gfp`.
ggplot(data = pus10.gfp, mapping = aes(x = PUS10, y = GFP)) +
  geom_point() +
  theme_bw()

# No `plot =` is given, so ggsave() writes the most recent plot.
ggsave(filename = "../figures/PUS10_GFP.png", width = 3, height = 3)

In [None]:
# File names of the two per-plate matrices.
P2449.matrix.fname <- sprintf("%s/%s.tsv.gz", matrices.dir, "P2449")
P2458.matrix.fname <- sprintf("%s/%s.tsv.gz", matrices.dir, "P2458")

# Read both matrices (tab-separated, header row, first column as row names);
# check.names = FALSE keeps the column names exactly as written in the files.
P2449 <- read.table(P2449.matrix.fname,
                    header = TRUE, row.names = 1, sep = "\t", check.names = FALSE)
P2458 <- read.table(P2458.matrix.fname,
                    header = TRUE, row.names = 1, sep = "\t", check.names = FALSE)

# cbind() pairs rows purely by position, so make sure both files list the
# same row names in the same order before joining them side by side.
if (!identical(rownames(P2449), rownames(P2458))) {
  stop("P2449 and P2458 do not have identical row names in the same order",
       call. = FALSE)
}

full.matrix <- cbind(P2449, P2458)

In [None]:
# Labels for one block of 96 samples: 6 Jurkat, 30 J-Lat+DMSO and
# 60 J-Lat+SAHA, in that order.
labels <- rep(c("Jurkat", "J-Lat+DMSO", "J-Lat+SAHA"), times = c(6, 30, 60))

# The same layout is repeated a second time, giving 192 labels in total.
labels <- rep(labels, times = 2)

In [None]:
# Column totals of the full matrix (plotted below as "Total library size").
sizes <- colSums(full.matrix)

# Plotting frame: running index, sample label and column total. The index is
# derived from `sizes` rather than hard-coded to 192, so it follows the
# actual number of columns in `full.matrix`.
p1 <- data.frame(x = seq_along(sizes), labels = labels, sizes = sizes)

In [None]:
# Inline notebook rendering size, same numbers as the saved figure.
options(repr.plot.width = 10, repr.plot.height = 3)

# One bar per row of `p1`, bar height = column total, filled by label.
ggplot(data = p1, mapping = aes(x = x, y = sizes, fill = labels)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  labs(x = "Cells", y = "Total library size")

# No `plot =` is given, so ggsave() writes the most recent plot.
ggsave(filename = "../figures/library_sizes-labels.png", width = 10, height = 3)

In [None]:
# Keep only the rows (genes) whose total over all columns is greater than 1.
exprMatrix <- full.matrix[rowSums(full.matrix) > 1, ]

# Column totals of the filtered matrix.
total <- colSums(exprMatrix)

# Divide every column by its own total. The division is done on the
# transpose, where those totals are row sums and recycle along the rows,
# and the result is transposed back to the original orientation.
exprMatrix <- t(exprMatrix)
exprMatrix <- t(exprMatrix / rowSums(exprMatrix))

In [None]:
# PCA with one observation per column of `exprMatrix` (hence the transpose)
# and every variable scaled to unit variance. The argument is spelled
# `scale.` in full; `scale = TRUE` only works through partial matching.
exprMatrix.pca <- prcomp(t(exprMatrix), scale. = TRUE)

In [None]:
# PCA coordinates (the `x` component of the prcomp result) as a data frame.
pca <- as.data.frame(exprMatrix.pca[["x"]])

In [None]:
# Cells whose first principal component exceeds 10 are flagged as dead.
# NOTE(review): the threshold 10 and the direction of PC1 are specific to
# this data set -- confirm if the input changes.
dead.cells <- rownames(pca)[pca$PC1 > 10]

# Everything else is alive. Taking the complement (instead of `PC1 < 10`)
# guarantees that a cell sitting exactly on the threshold is not left out of
# both groups.
alive.cells <- setdiff(rownames(pca), dead.cells)

In [None]:
# Plotting frame with one row per entry of `sizes`.
p2 <- as.data.frame(sizes)

# Running index, sized from the frame itself rather than hard-coded to 192.
p2$x <- seq_len(nrow(p2))

# Everything starts as "alive"; rows named in `dead.cells` are then marked.
p2$type <- "alive"
p2[dead.cells, "type"] <- "dead"

In [None]:
# Inline notebook rendering size, same numbers as the saved figure.
options(repr.plot.width = 10, repr.plot.height = 3)

# One bar per row of `p2`, bar height = column total, filled by alive/dead.
# The fill colours are named so that each status keeps its colour even when
# one of the two levels is absent from the data.
ggplot(p2, aes(x = x, y = sizes, fill = type)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  labs(x = "Cells", y = "Total library size") +
  scale_fill_manual(values = c(alive = "green", dead = "black"))

# No `plot =` is given, so ggsave() writes the most recent plot.
ggsave("../figures/library_sizes-dead.png", width = 10, height = 3)

In [None]:
# Attach the sample labels to the PCA coordinates, matched by position.
pca[["label"]] <- labels

In [None]:
# Inline notebook rendering size.
# NOTE(review): the inline height is 2.5 while the saved file uses 3 --
# confirm the difference is intended.
options(repr.plot.width = 4, repr.plot.height = 2.5)

# First two principal components, coloured by sample label.
ggplot(data = pca, mapping = aes(x = PC1, y = PC2)) +
  geom_point(mapping = aes(color = label)) +
  theme_bw()

# No `plot =` is given, so ggsave() writes the most recent plot.
ggsave(filename = "../figures/PCA_clustering.png", width = 4, height = 3)

In [None]:
# Columns of `merged` whose sample-sheet status is "treated".
# `%in%` keeps the mask free of NA when a status is missing, and
# drop = FALSE keeps the result a data frame even if only one column matches.
treated <- merged[, sampleSheet$status %in% "treated", drop = FALSE]

In [None]:
# Keep only the rows (genes) whose total over the treated columns is
# greater than 1, and switch to the transposed orientation.
treated <- t(treated[rowSums(treated) > 1, ])

# In the transposed matrix each row is one original column; divide it by its
# own total and transpose back to the original orientation.
treated <- t(treated / rowSums(treated))

In [None]:
# PCA with one observation per column of `treated` (hence the transpose)
# and every variable scaled to unit variance. The argument is spelled
# `scale.` in full; `scale = TRUE` only works through partial matching.
treated.pca <- prcomp(t(treated), scale. = TRUE)

# PCA coordinates as a data frame.
t.pca <- as.data.frame(treated.pca$x)

In [None]:
# Responders are the columns of `treated` with a positive "FILIONG01" value.
responder.cells <- colnames(treated)[treated["FILIONG01", ] > 0]

# Tag every row of the PCA frame by whether its name is among the responders.
t.pca$status <- ifelse(rownames(t.pca) %in% responder.cells,
                       "responder", "non-responder")

In [None]:
# Inline notebook rendering size.
# NOTE(review): the inline height is 2.5 while the saved file uses 3 --
# confirm the difference is intended.
options(repr.plot.width = 4, repr.plot.height = 2.5)

# First two principal components of the treated samples, coloured by
# responder status.
ggplot(data = t.pca, mapping = aes(x = PC1, y = PC2)) +
  geom_point(mapping = aes(color = status)) +
  theme_bw()

# No `plot =` is given, so ggsave() writes the most recent plot.
ggsave(filename = "../figures/PCA_responders.png", width = 4, height = 3)

In [None]:
# Normalised "FILIONG01" value per treated sample, with a running index for
# the x axis. seq_len() yields an empty index for zero columns, unlike
# seq(1:n).
gfp <- data.frame(x = seq_len(ncol(treated)), gfp = treated["FILIONG01", ])

In [None]:
# One bar per row of `gfp`, bar height = normalised value.
ggplot(data = gfp, mapping = aes(x = x, y = gfp)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  labs(x = "Cell", y = "Normalized GFP expression")

# No `plot =` is given, so ggsave() writes the most recent plot.
ggsave(filename = "../figures/GFP_treated.png", width = 4, height = 2.5)

In [None]:
# Show the PUS10 row of the full matrix for the cells flagged as dead; the
# result is only displayed, not stored.
full.matrix[PUS10, dead.cells]

In [None]:
# ENSEMBL id used for ERVW1 (hard-coded here rather than looked up in
# gene.data).
ERVW1 <- "ENSG00000242950.6"

# Total of the ERVW1 row over the cells kept as alive. rowSums() computes
# row totals; rowsum() is a different function that sums rows by a mandatory
# `group` argument and fails when called without it. drop = FALSE keeps the
# slice two-dimensional even if only one cell is alive.
rowSums(full.matrix[ERVW1, alive.cells, drop = FALSE])