/
supplementary.Rmd
209 lines (162 loc) · 5.16 KB
/
supplementary.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
---
title: "Supplementary Material"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Supplementary}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r message=FALSE, warning=FALSE}
# load libraries
library(tibble)
library(dplyr)
library(tidyr)
library(readr)
library(stringr)
library(ggplot2)
library(tidyHeatmap)
library(tidybulk)
library(ggrepel)
library(airway)
```
## How to start from tables
```{r}
# create some example tables to use
data(airway)
# counts table
counts <- assay(airway) %>%
as_tibble(rownames = "geneID")
# sample information table
sampleinfo <- colData(airway) %>%
as_tibble(rownames = "sample")
# data preprocessing
counts_tt <-
# convert to tidy format
pivot_longer(counts, cols = starts_with("SRR"), names_to = "sample", values_to = "counts") %>%
# get gene symbols
ensembl_to_symbol(geneID) %>%
# order the columns for tidybulk
select(sample, geneID, counts, transcript) %>%
# add the sample info
left_join(sampleinfo) %>%
# shorten sample name
mutate(sample=str_remove(sample, "SRR1039")) %>%
# convert to tidybulk tibble
tidybulk(.sample=sample, .transcript=geneID, .abundance=counts)
```
## How to count reads per sample
```{r}
counts_tt %>%
group_by(sample) %>%
summarise(total_reads=sum(counts))
```
We can also check how many counts we have for each sample by making a bar plot. This helps us see whether there are any major discrepancies between the samples more easily.
```{r out.width = "70%"}
ggplot(counts_tt, aes(x=sample, weight=counts, fill=sample)) +
geom_bar() +
theme_bw()
```
As we are using ggplot2, we can also easily view by any other variable that's a column in our dataset, such as cell line, simply by changing `fill`.
We can colour by dex treatment.
```{r out.width = "70%"}
ggplot(counts_tt, aes(x=sample, weight=counts, fill=dex)) +
geom_bar() +
theme_bw()
```
We can colour by cell line.
```{r out.width = "70%"}
ggplot(counts_tt, aes(x=sample, weight=counts, fill=cell)) +
geom_bar() +
theme_bw()
```
## How to examine normalised counts with boxplots
```{r out.width = "70%"}
# scale counts
counts_scaled <- counts_tt %>% scale_abundance(factor_of_interest = dex)
# create box plots
counts_scaled %>%
filter(!lowly_abundant) %>%
pivot_longer(cols = c("counts", "counts_scaled"), names_to = "source", values_to = "abundance") %>%
ggplot(aes(x=sample, y=abundance + 1, fill=dex)) +
geom_boxplot() +
geom_hline(aes(yintercept = median(abundance + 1)), colour="red") +
facet_wrap(~source) +
scale_y_log10() +
theme_bw()
```
## How to create MDS plot
```{r out.width = "70%"}
airway %>%
tidybulk() %>%
scale_abundance(factor_of_interest=dex) %>%
reduce_dimensions(method="MDS", scale = F) %>%
pivot_sample() %>%
ggplot(aes(Dim1, Dim2, color = dex)) +
geom_point()
```
## How to create MA plot
MA plots enable us to visualise amount of expression (logCPM) versus logFC. Highly expressed genes are towards the right of the plot. We can also colour significant genes (e.g. genes with FDR < 0.05)
```{r out.width = "70%"}
# perform differential testing
counts_de <-
counts_tt %>%
test_differential_abundance(
.formula = ~ 0 + dex + cell,
.contrasts = c("dextrt - dexuntrt"),
omit_contrast_in_colnames = TRUE
)
# maplot, minimal
counts_de %>%
pivot_transcript() %>%
filter(!lowly_abundant) %>%
ggplot(aes(x=logCPM, y=-logFC, colour=significant)) +
geom_point()+
theme_bw()
```
A more informative MA plot, integrating some of the packages in tidyverse.
```{r out.width = "70%", warning=FALSE}
counts_de %>%
pivot_transcript() %>%
# Subset data
filter(!lowly_abundant) %>%
mutate(significant = FDR<0.05 & abs(logFC) >=2) %>%
mutate(transcript = ifelse(abs(logFC) >=5, as.character(transcript), "")) %>%
# Plot
ggplot(aes(x = logCPM, y = logFC, label=transcript)) +
geom_point(aes(color = significant, size = significant, alpha=significant)) +
geom_text_repel() +
scale_color_manual(values=c("black", "#e11f28")) +
scale_size_discrete(range = c(0, 2)) +
theme_bw()
```
## How to perform gene enrichment analysis
To run below you'll need the `clusterProfiler` and `org.Hs.eg.db` packages. This is just one suggestion, adapted from [here](https://simon-anders.github.io/data_analysis_course/lecture9.html). If you have other suggestions for how to do a 'tidy' pathway analysis feel free to [let us know](https://github.com/stemangiola/bioc_2020_tidytranscriptomics/blob/master/CONTRIBUTING.md).
```{r eval=FALSE}
library(clusterProfiler)
library(org.Hs.eg.db)
# extract all genes tested for DE
res <- counts_de %>%
pivot_transcript() %>%
filter(!lowly_abundant)
# GO terms
egoCC <- res %>%
filter(FDR < 0.1 & logFC > 0 ) %>%
pull( "transcript" ) %>%
enrichGO(
OrgDb = org.Hs.eg.db,
keyType = 'SYMBOL',
ont = "BP",
universe = (res %>% pull( "transcript" ) ) )
dotplot(egoCC)
goplot(egoCC)
emapplot(egoCC)
# MSigDB Hallmark
gmtH <- read.gmt( "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/6.2/h.all.v6.2.symbols.gmt" )
enrH <- enricher(
gene = ( res %>% filter(FDR < 0.1 & logFC > 0) %>%
pull( "transcript" ) ),
TERM2GENE = gmtH,
universe = ( res %>% pull( "transcript" ) ) )
dotplot( enrH )
emapplot(enrH)
```