forked from friedue/course_RNA-seq2015
-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_Alignment_visualizeSTARresults.Rmd
249 lines (199 loc) · 11.1 KB
/
01_Alignment_visualizeSTARresults.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
---
title: "Visualizing the alignment stats of STAR using R"
output: pdf_document
---
This report was generated with \textsf{R} and the help of the \texttt{knitR} package.
The data set used here contains mRNA from 48 replicates of two \textit{S. cerevisiae} populations: wildtype and \textit{snf2} knock-out mutants.
All 96 samples were sequenced on one flow cell (Illumina HiSeq 2000).
The SRA accession number for the entire data set (consisting of 2 x 48 samples) is ERP004763.
\pagenumbering{gobble}
```{r settingUp, message=FALSE, warning=FALSE}
library(knitr) # for turning the original script into a nicely formatted pdf
opts_chunk$set(echo = TRUE, message = FALSE) # knitr options
opts_knit$set(self.contained = FALSE) # knitr options
library(ggplot2)
library(gridExtra) # for composite plotting of ggplots
options(stringsAsFactors = FALSE)
```
```{r definingFunctions, echo = FALSE}
# ################################################################################
# The functions of this script are not very generalized.
# Most of them expect a long data frame with exactly the following format:
# head(align.results.df)
# V1 V2 replicate sample
# SNF2_1.1 Mapping speed, Million of reads per hour 844.70 1 SNF2
# SNF2_1.2 Number of input reads 11966526.00 1 SNF2
# SNF2_1.3 Average input read length 51.00 1 SNF2
# SNF2_1.4 Uniquely mapped reads number 10289736.00 1 SNF2
# SNF2_1.5 Uniquely mapped reads % 85.99 1 SNF2
# SNF2_1.6 Average mapped length 50.73 1 SNF2
##################################################################################
PlottingCorrelation <- function(DF, Var1, Var2, Var1.Label, Var2.Label){
# Convenience function for the simple plot() function that allows for separate
# definition of labels and columns that should be compared against each other
# usage: PlottingCorrelation(DF=aligned.reads.df,
# Var1="Number of input reads", Var2="Uniquely mapped reads %",
# Var1.Label = "InputReads", Var2.Label="UniquelyMappedFraction")
m <- matrix(data = c(DF$V2[which(DF$V1 == Var1)],
DF$V2[which(DF$V1 == Var2)]),
ncol = 2)
colnames(m) <- c(Var1.Label,Var2.Label)
plot(m)
}
PlottingAlignmentResults <- function(Filter, DF, Legend=TRUE, PlotMedian = TRUE){
# this function extracts those lines that correspond to the value stored in Filter and generates a bar plot where each replicate is shown with a different color
library(ggplot2)
library(grid) # for unit() function
filtered.df <- DF[which(DF$V1 == Filter),]
medians <- as.data.frame(aggregate(V2~sample, data=filtered.df, FUN=median))
filtered.df <- merge(filtered.df, medians, by.x = "sample", by.y = "sample", all.x=TRUE)
p <- ggplot(data=filtered.df, aes(fill=replicate, y=V2.x, x=sample)) +
geom_bar(stat="identity",position=position_dodge()) +
theme_bw(base_size = 16) +
theme(legend.position="bottom",
legend.text = element_text(size = 6),
legend.key.size = unit(0.3, "cm"),
legend.title=element_blank()) +
coord_flip() + ylab("") + ggtitle(Filter)
if(PlotMedian){
p <- p + geom_errorbar(aes(y=V2.y, ymax=V2.y, ymin=V2.y), linetype="dashed")
}
if(!Legend){
p <- p + theme(legend.position="none")
}
return(p)
}
ExtractLegend <- function(Plot){
library(ggplot2)
G <- ggplotGrob(Plot)$grobs
Legend <- G[[which(sapply(G, function(x) x$name) == "guide-box")]]
Lheight <- sum(Legend$height)
return(list(legend = Legend, lheight=Lheight))
}
Extract.Histo.Info <- function(InList, # output from hist(...,plot=FALSE)
Percentage = TRUE
){
# this function uses the information from hist() that are stored in lists
# to make a data frame that's suitable for bar plots
ll <- length(InList$breaks)
out.df <- data.frame(breaks1 = InList$breaks[c(1:ll-1)],
breaks2 = InList$breaks[c(2:ll)],
counts = InList$counts)
if(Percentage){
out.df <- transform(out.df,
breaks = paste(out.df$breaks1*100, "-", out.df$breaks2*100, sep = ""),
breaks1 = NULL, breaks2 = NULL)
}else{
out.df <- transform(out.df,
breaks = paste(out.df$breaks1, "-", out.df$breaks2, sep = ""),
breaks1 = NULL, breaks2 = NULL)
}
out.df$breaks <- factor(out.df$breaks, levels = unique(as.character(out.df$breaks)), ordered = TRUE)
return(out.df)
}
```
## Reading in data
```{r dataIn}
# listing files to be read in
infiles <- list.files(path="01_STAR_logs/", # folder with log files
pattern="Log.final.out",
full.names = TRUE)
# iterating over the file list with a function to read in the log files
# generates a __list of data frames__
align.results <- lapply(infiles, function(x)
read.table(x, sep="|",
strip.white=TRUE,
stringsAsFactor=FALSE,
skip=3, fill = TRUE, header = FALSE) )
typeof(align.results)
head(align.results[[1]]) # peek into one of the data.frames within the list
# removing "%" from some of the values to keep just the numeric parts
align.results <- lapply(align.results, function(x)
transform(x, V2 = as.numeric(gsub("%", "", x$V2) )))
# some cosmetics of each data frame's name - this is specific for the sample names
# of the files used here!
names(align.results) <- gsub(".*(SNF2|WT)(\\_[0-9]*).*", "\\1\\2", infiles)
```
## Generating a long data frame for ggplot2-based plotting
```{r longDF}
# concatenating all data frames of align.results
align.results.df <- as.data.frame(do.call(rbind, align.results))
# removing lines without values
align.results.df <- align.results.df[complete.cases(align.results.df),]
# adding additional columns with information about sample and replicate ID,
# using the information from the row names (which are, in turn, based on the
# names of the individual data frames that were stored in the original
# list, align.results)
align.results.df$sample <- gsub("(.*)\\_.*", "\\1", row.names(align.results.df))
align.results.df$replicate <- as.factor(as.numeric(
gsub(".*\\_([0-9]*)\\.[0-9]*", "\\1", row.names(align.results.df))
))
# check the result - we should have a data frame with 4 columns
head(align.results.df)
```
## Making plots
Numbers are usually more easily digestible and comparable if they are represented visually, especially if one has many samples (although you will probably rarely encounter 48 samples per replicate as for this example data used here).
However, we don't need to visualize every entry from the \texttt{STAR} output since some are redundant, and some even not applicable in our case:
```{r}
unique(align.results.df$V1)
```
Thus, we first define those QC entries that we are interested in:
```{r filters}
filters = c("Number of input reads", "Uniquely mapped reads %",
"Number of splices: Total", "Number of splices: Non-canonical")
```
Now, let's generate some bar plots.
The following chunk of code is optimized for returning a composite image with four plots and one legend.
Note that the functions \texttt{PlottingAligmentResults} and \texttt{ExtractLegend} are part of the script that we sourced in the beginning.
```{r barPlots, fig.width = 13, fig.height = 10}
# for each entry in "filters", generate a bar chart
plots <- lapply(filters, function(x)
PlottingAlignmentResults(x, align.results.df, Legend = FALSE))
# getting the legend is a bit tricky (kudos to Luce for figuring it out)
my.legend <- ExtractLegend(PlottingAlignmentResults(align.results.df,
Filter = filters[1],
Legend=TRUE))
# combining plots and legend
grid.arrange(arrangeGrob(plots[[1]], plots[[2]], plots[[3]], plots[[4]], nrow=2),
my.legend$legend, nrow=2,
heights= unit.c(unit(1, "npc") - my.legend$lheight, my.legend$lheight)
)
```
The dashed black line is the median value across all replicates of one condition.
There is quite a bit of variation for all four stats between the replicates.
Although the median sequencing depth of the _snf2_ ko samples is slightly higher than for the WT samples, the median number of splice sites shows a reverse trend.
This is counter-intuitive as the number of splice sites should positively correlate with the number of mapped reads.
Let's check whether that relationship is simply confounded by just looking at the median values.
\clearpage
__WT__
```{r corPlots_WT, fig.width=10, fig.height=10, echo=FALSE}
par(mfrow=c(2,2))
sampletype="WT"
df = subset(align.results.df, grepl(sampletype,row.names(align.results.df)))
PlottingCorrelation(DF = df, Var1 = "Uniquely mapped reads number", Var2 = filters[2], Var1.Label = paste("No. of uniquely mapped reads (",sampletype,")",sep=""), Var2.Label = paste(filters[2]," (",sampletype,")",sep=""))
PlottingCorrelation(DF = df, Var1 = filters[1], Var2 = filters[3], Var1.Label = paste("No. of uniquely mapped reads (",sampletype,")",sep=""), Var2.Label = "Total splice number")
PlottingCorrelation(DF = df, Var1 = "Uniquely mapped reads number", Var2 = filters[4], Var1.Label = paste("No. of uniquely mapped reads (",sampletype,")",sep=""), Var2.Label = "Non-canonical splice events")
PlottingCorrelation(DF = df, Var1 = filters[3], Var2 = filters[4], Var1.Label = paste("Total splice number (",sampletype,")", sep=""), Var2.Label = "Non-canonical splice events")
```
\clearpage
__SNF2__
```{r corPlots_SNF2, fig.width=10, fig.height=13, echo=FALSE}
par(mfrow=c(2,2))
sampletype="SNF2"
df = subset(align.results.df, grepl(sampletype,row.names(align.results.df)))
PlottingCorrelation(DF = df, Var1 = filters[1], Var2 = filters[2], Var1.Label = paste(filters[1], " (",sampletype,")",sep=""), Var2.Label = paste(filters[2]," (",sampletype,")",sep=""))
PlottingCorrelation(DF = df, Var1 = filters[1], Var2 = filters[3], Var1.Label = paste(filters[1]," (",sampletype,")",sep=""), Var2.Label = "Total splice number")
PlottingCorrelation(DF = df, Var1 = filters[1], Var2 = filters[4], Var1.Label = paste(filters[1]," (",sampletype,")",sep=""), Var2.Label = "Non-canonical splice events")
PlottingCorrelation(DF = df, Var1 = filters[3], Var2 = filters[4], Var1.Label = paste("Total splice number (",sampletype,")", sep=""), Var2.Label = "Non-canonical splice events")
```
In both sample types, the number of detected splice sites indeed seems to correlate with the number of input reads.
Apart from that, the samples almost all aligned well (70-80% alignment rate is typically seen for normal RNA-seq experiments).
For future references, let's get those samples that seem to deviate the most:
* samples with low mapping rate (<75%)
```{r}
subset(align.results.df, V1 == filters[2] & V2 < 75)
```
* samples with high numbers of non-canonical splice events (>1,000)
```{r}
subset(align.results.df, V1 == filters[4] & V2 > 1000)
```