### HeLa enDR3 transcriptome analysis

Differential expression analysis of the RNA-seq data is performed with DESeq2.



In [None]:
library('DESeq2')
library('ggplot2')
library('biomaRt')
library('Cairo')
library('RUVSeq')

Read in the counts data file (featureCounts output)

In [None]:
# Ask the user for the featureCounts output file.
counts_file <- file.choose()

# Tab-delimited table; the first line of the file is skipped, the next line
# is used as the header, and the first column supplies the row names.
counts_df <- read.delim(counts_file, header = TRUE, skip = 1, row.names = 1)

# Show the first rows to confirm the import looks sensible.
head(counts_df)

<BR>
Remove the unnecessary annotation columns 1-5 (Chr, Start, End, Strand, Length). The row names (gene IDs) are not counted as a column.

In [None]:
# Drop the annotation columns by name rather than by position, so that
# re-running this cell cannot accidentally remove sample columns.
# `drop = FALSE` keeps the result a data frame even if one column remains.
counts_df <- counts_df[
  ,
  setdiff(colnames(counts_df), c("Chr", "Start", "End", "Strand", "Length")),
  drop = FALSE
]

In [None]:
# Preview the first six rows of the count table.
head(counts_df, n = 6L)

Rename columns

In [None]:
# Sample labels, in the order the sample columns appear in the count table.
# NOTE(review): assumes the three enDR3 columns come before the three Hela
# columns in the input file - confirm against the featureCounts header.
col_names <- c("enDR3_1", "enDR3_2", "enDR3_3", "Hela_1", "Hela_2", "Hela_3")

# Fail loudly if the table does not have exactly one column per label,
# instead of silently mislabelling samples.
stopifnot(ncol(counts_df) == length(col_names))

colnames(counts_df) <- col_names

Get data summary

In [None]:
# Number of genes (rows) in the count table.
dim(counts_df)[1L]

# Per-sample distribution of counts.
summary(counts_df)

# First six rows of the table.
head(counts_df, n = 6L)

<BR>
Filter out non-expressed genes. Keep only genes that have at least 1 read in at least 4 samples.

In [None]:
# Logical vector with one entry per gene: TRUE when at least 4 samples have
# a count greater than zero. `rowSums()` on the logical matrix is vectorised
# and avoids a per-row R function call; `na.rm = TRUE` makes a missing count
# contribute nothing instead of being counted as expressed.
filter <- rowSums(counts_df > 0, na.rm = TRUE) >= 4

# Keep only the genes that pass the expression filter.
filtered_counts_df <- counts_df[filter, , drop = FALSE]

In [None]:
# Report how many genes passed the expression filter.
sprintf(
  "Number of genes meeting the criteria: %d",
  dim(filtered_counts_df)[1L]
)

# Inspect both ends of the filtered table.
head(filtered_counts_df, n = 6L)
tail(filtered_counts_df, n = 6L)

Defining experiment design

In [None]:
# One condition label per sample column: three enDR3 followed by three Hela.
condition <- factor(rep(c("enDR3", "Hela"), each = 3))

condition

# Sample sheet: rows are named after the count-table columns and carry the
# condition of each sample.
coldata <- data.frame(
  condition = condition,
  row.names = colnames(filtered_counts_df)
)

Creating DESeq data set

In [None]:
# Bundle the filtered counts and the sample sheet into a DESeqDataSet,
# modelling expression as a function of condition only.
dds_p1 <- DESeqDataSetFromMatrix(
  countData = filtered_counts_df,
  colData = coldata,
  design = ~ condition
)

# Print a summary of the data set.
dds_p1

In [None]:
# Run the DESeq2 pipeline on the data set and keep the fitted object.
dds_p1 <- DESeq(object = dds_p1)

Get differential expression results

In [None]:
# Extract the enDR3-versus-Hela comparison, with alpha set to 0.05.
results_enDR3 <- results(
  object = dds_p1,
  contrast = c("condition", "enDR3", "Hela"),
  alpha = 0.05
)

# Tabulate how many genes fall below / at-or-above the 0.05 adjusted
# p-value cut-off.
table(results_enDR3$padj < 0.05)

In [None]:
# Sort genes by ascending adjusted p-value; genes with a missing padj are
# placed at the end.
results_enDR3 <- results_enDR3[
  order(results_enDR3$padj, na.last = TRUE, decreasing = FALSE),
]

In [None]:
# Plain data frame copy of the results, used for base-graphics plotting.
topT <- as.data.frame(results_enDR3)

# Margins and text scaling for the plot.
par(mar = c(5, 5, 5, 5), cex = 1.0, cex.main = 1.4, cex.axis = 1.4, cex.lab = 1.4)

# Volcano plot: log2 fold change against -log10 of the adjusted p-value
# (FDR Q value) for every gene.
plot(
  x = topT$log2FoldChange,
  y = -log10(topT$padj),
  pch = 20,
  main = "Volcano plot",
  cex = 1.0,
  xlab = bquote(~Log[2]~fold~change),
  ylab = bquote(~-log[10]~Q~value)
)

# Overlay in red the genes with padj <= 0.05 and |log2 fold change| > 1.
# `which()` drops rows where the condition is NA, as `subset()` would.
with(
  topT[which(topT$padj <= 0.05 & abs(topT$log2FoldChange) > 1), ],
  points(log2FoldChange, -log10(padj), pch = 20, col = "red", cex = 0.5)
)

Merge with normalized count data

In [None]:
# Join the differential expression statistics with the normalized counts of
# every sample, matching on row names (gene IDs) without sorting the result
# by the key.
full_results_enDR3 <- merge(
  x = as.data.frame(results_enDR3),
  y = as.data.frame(counts(dds_p1, normalized = TRUE)),
  by = "row.names",
  sort = FALSE
)

# The merge puts the row names into the first column; label it "Gene".
colnames(full_results_enDR3)[1] <- "Gene"

# Preview the first 20 rows.
head(full_results_enDR3, n = 20)

Write results to a file

In [None]:
# Save the merged table as CSV to a user-selected file, without writing a
# row-name column.
write.csv(
  x = full_results_enDR3,
  file = file.choose(),
  row.names = FALSE
)

Plot quality-control metrics

In [None]:
# Regularized-log transform of the counts, used for the PCA plots
# (blind = TRUE is the function's default).
r_log_dds_p1 <- rlog(dds_p1, blind = TRUE)

In [None]:
# PCA of the rlog-transformed samples, grouped by the "condition" column
# (the default grouping).
plotPCA(r_log_dds_p1, intgroup = "condition")

In [None]:
# Write the PCA plot to a user-selected PDF file.
pdf(file = file.choose())

# plotPCA() returns a ggplot object, which is only drawn when printed.
# Print it explicitly so the PDF is not left empty when this code runs
# without top-level auto-printing (e.g. via source() or inside a function).
print(plotPCA(r_log_dds_p1))

# Close the PDF device to flush the file to disk.
dev.off()

In [None]:
# Plot the dispersion estimates of the fitted DESeq2 model.
plotDispEsts(object = dds_p1)