-
Notifications
You must be signed in to change notification settings - Fork 0
/
20191102_anno.Rmd
229 lines (161 loc) · 11.2 KB
/
20191102_anno.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
---
title: "Nov2GeoduckDMR functional analysis"
author: "Shelly Trigg"
date: "11/02/2019"
output: html_document
---
load libraries
```{r}
library(gplots)
library(ggplot2)
library(dplyr)
library(plyr)
library(reshape2)
```
read in data
```{r}
##added quote = "", fill =FALSE to read table argument because it was not reading in all lines of input files and gave the error "EOF within quoted string". I found this site which gave that tip:
#https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/
amb_GO <- read.table("amb_AllTimes.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill =FALSE)
day10_GO <- read.table("day10_AllpH.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill =FALSE)
day135_GO <- read.table("day135_AllpH.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE,quote = "", fill =FALSE)
day145_GO <- read.table("day145_AllpH.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE,quote = "", fill =FALSE)
#read in non-significant + significant DMRs
day10_GO_nosig <- read.table("../20191106_anno/day10_AllpH_filtrd_0928GFF3.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = FALSE)
day135_GO_nosig <- read.table("../20191106_anno/day135_AllpH_filtrd_0928GFF3.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = FALSE)
day145_GO_nosig <- read.table("../20191106_anno/day145_AllpH_filtrd_0928GFF3.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = FALSE)
amb_GO_nosig <- read.table("../20191106_anno/amb_AllTimes_filtrd_0928GFF3.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = FALSE)
#remove extra columns
day10_GO_nosig <- day10_GO_nosig[,c(1:3,7)]
day135_GO_nosig <- day135_GO_nosig[,c(1:3,7)]
day145_GO_nosig <- day145_GO_nosig[,c(1:3,7)]
amb_GO_nosig <- amb_GO_nosig[,c(1:3,7)]
#############
#read in bkgd DMRs
#############
#with Gannet mounted
day10_GO_bkgd <- read.table("/Volumes/web/metacarcinus/Pgenerosa/analyses/20191109/all_day10_bkgd.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = TRUE)
day135_GO_bkgd <- read.table("/Volumes/web/metacarcinus/Pgenerosa/analyses/20191109/all_day135_bkgd.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = TRUE)
day145_GO_bkgd <- read.table("/Volumes/web/metacarcinus/Pgenerosa/analyses/20191109/all_day145_bkgd.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = TRUE)
amb_GO_bkgd <- read.table("/Volumes/web/metacarcinus/Pgenerosa/analyses/20191109/all_amb_bkgd.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = TRUE)
```
add header to data frames
```{r}
colnames(amb_GO) <- c("chr", "start", "end", "num.DMS","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")
colnames(day10_GO) <- c("chr", "start", "end", "num.DMS","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")
colnames(day135_GO) <- c("chr", "start", "end", "num.DMS","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")
colnames(day145_GO) <- c("chr", "start", "end", "num.DMS","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")
colnames(day10_GO_nosig) <- c("chr", "start", "end","feature")
colnames(day135_GO_nosig) <- c("chr", "start", "end","feature")
colnames(day145_GO_nosig) <- c("chr", "start", "end","feature")
colnames(amb_GO_nosig) <- c("chr", "start", "end","feature")
colnames(day10_GO_bkgd) <- c("chr", "start", "end","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")
colnames(day135_GO_bkgd) <- c("chr", "start", "end","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")
colnames(day145_GO_bkgd) <- c("chr", "start", "end","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")
colnames(amb_GO_bkgd) <- c("chr", "start", "end","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")
#remove extra columns
day10_feat_bkgd <- day10_GO_bkgd[,c(1:4)]
day135_feat_bkgd <- day135_GO_bkgd[,c(1:4)]
day145_feat_bkgd <- day145_GO_bkgd[,c(1:4)]
amb_feat_bkgd <- amb_GO_bkgd[,c(1:4)]
```
add column to each GO data frame so they can be combined
```{r}
amb_GO$group <- "amb"
day10_GO$group <- "day10"
day135_GO$group <- "day135"
day145_GO$group <- "day145"
amb_GO_nosig$group <- "amb"
day10_GO_nosig$group <- "day10"
day135_GO_nosig$group <- "day135"
day145_GO_nosig$group <- "day145"
amb_feat_bkgd$group <- "amb"
day10_feat_bkgd$group <- "day10"
day135_feat_bkgd$group <- "day135"
day145_feat_bkgd$group <- "day145"
```
combine GO data frames
```{r}
all_comparisons <- rbind(amb_GO,day10_GO, day135_GO, day145_GO)
#20191109 I'm commenting this out because I've updated the background
#all_comparisons_bkgrd <- rbind(amb_GO_nosig, day10_GO_nosig, day135_GO_nosig, day145_GO_nosig)
all_comparisons_bkgrd <- rbind(amb_feat_bkgd, day10_feat_bkgd, day135_feat_bkgd, day145_feat_bkgd)
all_comparisons_bkgrd$sig <- "BkgdCGs5xCov3idv"
all_comparisons$sig <- "Significant DMRs"
all_comparisons_bkgrd <- rbind(all_comparisons_bkgrd,all_comparisons[,c(1:3,5,12:13)])
all_comparisons_bkgrd$group.sig <- paste(all_comparisons_bkgrd$sig,all_comparisons_bkgrd$group,sep = ".")
```
plot DMR feature frequencies
```{r}
jpeg("features_histograms.jpg", width = 800, height = 800)
#facet by feature
ggplot(all_comparisons) + geom_histogram(aes(group, color = group, fill = group), stat = "count") + facet_wrap(~feature) + xlab("comparison") + ylab("number of features") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))
dev.off()
ggplot(all_comparisons) + geom_bar(aes(group, color = group, fill = group), stat = "count") + facet_wrap(~feature) + xlab("comparison") + ylab("number of features") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))
#facet by exp. group
jpeg("features_histograms_group_facet.jpg", width = 800, height = 800)
ggplot(all_comparisons) + geom_histogram(aes(feature, color = feature, fill = feature), stat = "count") + facet_wrap(~group)
dev.off()
#stacked bar plot of features for each group
jpeg("features_stacked_bar_plot.jpg", width = 800, height = 800)
ggplot(all_comparisons, aes(x = factor(group), group = feature, fill = feature)) + geom_bar(position = "fill") + ylab("fraction of all features") + xlab("comparison")
dev.off()
#plot proportion percentages for features represented in all DMRs vs. significant DMRs for each comparison
#found these sites helpful:
#https://sebastiansauer.github.io/percentage_plot_ggplot2_V2/
#https://www.biostars.org/p/302349/
jpeg("AllvSigDMRsxfeatures_PropBarplot_GroupFacet.jpg", width = 8, height = 8,units = "in", res = 300)
ggplot(all_comparisons_bkgrd, aes(x= feature, group=group.sig)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.5, size = 2.5) + labs(y = "Percent", fill="feature") + facet_grid(group~sig) + scale_y_continuous(labels = scales::percent,limits = c(0,0.35)) + scale_fill_manual(name="Features", values=c("#F8766D","#BB9D00","#00B81F","#00C0B8","#00A5FF","#E76BF3","#FF6C90"), labels=c("CDS", "exon", "gene","mRNA","repeat_region","rRNA","tRNA")) + theme_bw() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
dev.off()
#plot proportion percentages for features represented in 20191109 background sites vs. significant DMRs for each comparison
jpeg("BkgdvSigDMRsxfeatures_PropBarplot_GroupFacet.jpg", width = 8, height = 8,units = "in", res = 300)
ggplot(all_comparisons_bkgrd, aes(x= feature, group=group.sig)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.5, size = 2.5) + labs(y = "Percent", fill="feature") + facet_grid(group~sig) + scale_y_continuous(labels = scales::percent,limits = c(0,0.35)) + scale_fill_manual(name="Features", values=c("#F8766D","#BB9D00","#00B81F","#00C0B8","#00A5FF","#E76BF3","#FF6C90"), labels=c("CDS", "exon", "gene","mRNA","repeat_region","rRNA","tRNA")) + theme_bw() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
dev.off()
```
GO annotations
```{r}
#Since GO IDs are listed all in one column, spread them across multiple columns (1 column /term)
day10_sig_pro_GOid <- data.frame()
for (i in 1:nrow(day10_GO)){
day10_sig_pro_GOid_row <- data.frame(t(data.frame(strsplit(as.character(day10_GO$GO.IDs[i]),'; ', fixed = TRUE))))
day10_sig_pro_GOid <- rbind.fill(day10_sig_pro_GOid,day10_sig_pro_GOid_row)
}
#add chr and position info back to GO IDs
day10_sig_pro_GOid <- cbind(day10_GO[,c("chr","start","end")], day10_sig_pro_GOid[,-1])
#this results in a table with "chr","start","end" listed and each next column contains a GO ID that was listed with the protein in Uniprot DB.
#convert all factors to characters
#silo39_sig_pro_GOid_term <- data.frame(lapply(silo39_sig_pro_GOid_term, as.character), stringsAsFactors = FALSE)
#proteins that don't have GO IDs?
#reshape data so that all GO ID columns are gathered in one column called "GO"
STACKED_day10_sig_pro_GOid <- tidyr::gather(day10_sig_pro_GOid,"col","GO", 4:ncol(day10_sig_pro_GOid))
#exlude middle column which just contains the string "gene" in each row
STACKED_day10_sig_pro_GOid <- STACKED_day10_sig_pro_GOid[,-4]
#remove duplicate rows
STACKED_day10_sig_pro_GOid <- unique(STACKED_day10_sig_pro_GOid)
#remove any rows where GO column has NA value.
STACKED_day10_sig_pro_GOid <- STACKED_day10_sig_pro_GOid[which(!is.na(STACKED_day10_sig_pro_GOid$GO)),]
#this resulting data frame has two columns "gene" and "GO"
#creating background gene list:
day10_bkgd_pro_GOid <- data.frame()
for (i in 1:nrow(day10_GO_bkgd[which(substr(day10_GO_bkgd$GO.IDs,1,2)=="GO"),])){
day10_bkgd_pro_GOid_row <- data.frame(t(data.frame(strsplit(as.character(day10_GO_bkgd[which(substr(day10_GO_bkgd$GO.IDs,1,2)=="GO"),"GO.IDs"][i]),'; ', fixed = TRUE))))
day10_bkgd_pro_GOid <- rbind.fill(day10_bkgd_pro_GOid,day10_bkgd_pro_GOid_row)
}
#add chr and position info back to GO IDs
day10_bkgd_pro_GOid <- cbind(day10_GO_bkgd[which(substr(day10_GO_bkgd$GO.IDs,1,2)=="GO"),c("chr","start","end")], day10_bkgd_pro_GOid[,-1])
#this results in a table with "chr","start","end" listed and each next column contains a GO ID that was listed with the protein in Uniprot DB.
#convert all factors to characters
#silo39_sig_pro_GOid_term <- data.frame(lapply(silo39_sig_pro_GOid_term, as.character), stringsAsFactors = FALSE)
#proteins that don't have GO IDs?
#convert factor columns to character
class(day10_bkgd_pro_GOid[,4:ncol(day10_bkgd_pro_GOid)]) <- "character"
#reshape data so that all GO ID columns are gathered in one column called "GO"
STACKED_day10_bkgd_pro_GOid <- tidyr::gather(day10_bkgd_pro_GOid,"col","GO", 4:ncol(day10_bkgd_pro_GOid))
#exlude middle column which just contains the string "gene" in each row
STACKED_day10_sig_pro_GOid <- STACKED_day10_sig_pro_GOid[,-4]
#remove duplicate rows
STACKED_day10_sig_pro_GOid <- unique(STACKED_day10_sig_pro_GOid)
#remove any rows where GO column has NA value.
STACKED_day10_sig_pro_GOid <- STACKED_day10_sig_pro_GOid[which(!is.na(STACKED_day10_sig_pro_GOid$GO)),]
#this resulting data frame has two columns "gene" and "GO"
```