analyses/20191102_anno/20191102_anno.Rmd

---
title: "Nov2GeoduckDMR functional analysis"
author: "Shelly Trigg"
date: "11/02/2019"
output: html_document
---


load libraries
```{r}
library(gplots)
library(ggplot2)
library(dplyr)
library(plyr)
library(reshape2)
```

read in data
```{r}
##added quote = "", fill =FALSE to read table argument because it was not reading in all lines of input files and gave the error "EOF within quoted string". I found this site which gave that tip:
#https://kbroman.org/blog/2017/08/08/eof-within-quoted-string/

amb_GO <- read.table("amb_AllTimes.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill =FALSE)

day10_GO <- read.table("day10_AllpH.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill =FALSE)

day135_GO <- read.table("day135_AllpH.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE,quote = "", fill =FALSE)

day145_GO <- read.table("day145_AllpH.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE,quote = "", fill =FALSE)

#read in non-significant + significant DMRs
day10_GO_nosig <- read.table("../20191106_anno/day10_AllpH_filtrd_0928GFF3.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = FALSE)

day135_GO_nosig <- read.table("../20191106_anno/day135_AllpH_filtrd_0928GFF3.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = FALSE)

day145_GO_nosig <- read.table("../20191106_anno/day145_AllpH_filtrd_0928GFF3.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = FALSE)

amb_GO_nosig <- read.table("../20191106_anno/amb_AllTimes_filtrd_0928GFF3.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = FALSE)

#remove extra columns
day10_GO_nosig <- day10_GO_nosig[,c(1:3,7)]
day135_GO_nosig <- day135_GO_nosig[,c(1:3,7)]
day145_GO_nosig <- day145_GO_nosig[,c(1:3,7)]
amb_GO_nosig <- amb_GO_nosig[,c(1:3,7)]

#############
#read in bkgd DMRs
#############

#with Gannet mounted

day10_GO_bkgd <- read.table("/Volumes/web/metacarcinus/Pgenerosa/analyses/20191109/all_day10_bkgd.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = TRUE)

day135_GO_bkgd <- read.table("/Volumes/web/metacarcinus/Pgenerosa/analyses/20191109/all_day135_bkgd.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = TRUE)

day145_GO_bkgd <- read.table("/Volumes/web/metacarcinus/Pgenerosa/analyses/20191109/all_day145_bkgd.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = TRUE)

amb_GO_bkgd <- read.table("/Volumes/web/metacarcinus/Pgenerosa/analyses/20191109/all_amb_bkgd.GO.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "", fill = TRUE)

```

add header to data frames
```{r}
colnames(amb_GO) <- c("chr", "start", "end", "num.DMS","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")

colnames(day10_GO) <- c("chr", "start", "end", "num.DMS","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")

colnames(day135_GO) <- c("chr", "start", "end", "num.DMS","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")

colnames(day145_GO) <- c("chr", "start", "end", "num.DMS","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")


colnames(day10_GO_nosig) <- c("chr", "start", "end","feature")

colnames(day135_GO_nosig) <- c("chr", "start", "end","feature")

colnames(day145_GO_nosig) <- c("chr", "start", "end","feature")

colnames(amb_GO_nosig) <- c("chr", "start", "end","feature")


colnames(day10_GO_bkgd) <- c("chr", "start", "end","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")

colnames(day135_GO_bkgd) <- c("chr", "start", "end","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")

colnames(day145_GO_bkgd) <- c("chr", "start", "end","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")

colnames(amb_GO_bkgd) <- c("chr", "start", "end","feature","gene","entry","entry.name","protein.name", "GO.IDs", "GO.terms")

#remove extra columns
day10_feat_bkgd <- day10_GO_bkgd[,c(1:4)]
day135_feat_bkgd <- day135_GO_bkgd[,c(1:4)]
day145_feat_bkgd <- day145_GO_bkgd[,c(1:4)]
amb_feat_bkgd <- amb_GO_bkgd[,c(1:4)]

```

add column to each GO data frame so they can be combined
```{r}
amb_GO$group <- "amb"
day10_GO$group <- "day10"
day135_GO$group <- "day135"
day145_GO$group <- "day145"


amb_GO_nosig$group <- "amb"
day10_GO_nosig$group <- "day10"
day135_GO_nosig$group <- "day135"
day145_GO_nosig$group <- "day145"


amb_feat_bkgd$group <- "amb"
day10_feat_bkgd$group <- "day10"
day135_feat_bkgd$group <- "day135"
day145_feat_bkgd$group <- "day145"

```

combine GO data frames
```{r}
all_comparisons <- rbind(amb_GO,day10_GO, day135_GO, day145_GO)

#20191109 I'm commenting this out because I've updated the background
#all_comparisons_bkgrd <- rbind(amb_GO_nosig, day10_GO_nosig, day135_GO_nosig, day145_GO_nosig)

all_comparisons_bkgrd <- rbind(amb_feat_bkgd, day10_feat_bkgd, day135_feat_bkgd, day145_feat_bkgd)

all_comparisons_bkgrd$sig <- "BkgdCGs5xCov3idv"
all_comparisons$sig <- "Significant DMRs"

all_comparisons_bkgrd <- rbind(all_comparisons_bkgrd,all_comparisons[,c(1:3,5,12:13)])

all_comparisons_bkgrd$group.sig <- paste(all_comparisons_bkgrd$sig,all_comparisons_bkgrd$group,sep = ".")
```

plot DMR feature frequencies
```{r}
jpeg("features_histograms.jpg", width = 800, height = 800)
#facet by feature
ggplot(all_comparisons) + geom_histogram(aes(group, color = group, fill = group), stat = "count") + facet_wrap(~feature) + xlab("comparison") + ylab("number of features") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))
dev.off()

ggplot(all_comparisons) + geom_bar(aes(group, color = group, fill = group), stat = "count") + facet_wrap(~feature) + xlab("comparison") + ylab("number of features") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))


#facet by exp. group
jpeg("features_histograms_group_facet.jpg", width = 800, height = 800)
ggplot(all_comparisons) + geom_histogram(aes(feature, color = feature, fill = feature), stat = "count") + facet_wrap(~group)
dev.off()

#stacked bar plot of features for each group
jpeg("features_stacked_bar_plot.jpg", width = 800, height = 800)
ggplot(all_comparisons, aes(x = factor(group), group = feature, fill = feature)) + geom_bar(position = "fill") + ylab("fraction of all features") + xlab("comparison")
dev.off()

#plot proportion percentages for features represented in all DMRs vs. significant DMRs for each comparison
#found these sites helpful:
#https://sebastiansauer.github.io/percentage_plot_ggplot2_V2/
#https://www.biostars.org/p/302349/
jpeg("AllvSigDMRsxfeatures_PropBarplot_GroupFacet.jpg", width = 8, height = 8,units = "in", res = 300)
ggplot(all_comparisons_bkgrd, aes(x= feature,  group=group.sig)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.5, size = 2.5) + labs(y = "Percent", fill="feature") + facet_grid(group~sig) + scale_y_continuous(labels = scales::percent,limits = c(0,0.35)) + scale_fill_manual(name="Features", values=c("#F8766D","#BB9D00","#00B81F","#00C0B8","#00A5FF","#E76BF3","#FF6C90"), labels=c("CDS", "exon", "gene","mRNA","repeat_region","rRNA","tRNA")) + theme_bw() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
dev.off()


#plot proportion percentages for features represented in 20191109 background sites vs. significant DMRs for each comparison
jpeg("BkgdvSigDMRsxfeatures_PropBarplot_GroupFacet.jpg", width = 8, height = 8,units = "in", res = 300)
ggplot(all_comparisons_bkgrd, aes(x= feature,  group=group.sig)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.5, size = 2.5) + labs(y = "Percent", fill="feature") + facet_grid(group~sig) + scale_y_continuous(labels = scales::percent,limits = c(0,0.35)) + scale_fill_manual(name="Features", values=c("#F8766D","#BB9D00","#00B81F","#00C0B8","#00A5FF","#E76BF3","#FF6C90"), labels=c("CDS", "exon", "gene","mRNA","repeat_region","rRNA","tRNA")) + theme_bw() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
dev.off()


```

GO annotations
```{r}
#Since GO IDs are listed all in one column, spread them across multiple columns (1 column /term)
day10_sig_pro_GOid <- data.frame()
for (i in 1:nrow(day10_GO)){
  day10_sig_pro_GOid_row <- data.frame(t(data.frame(strsplit(as.character(day10_GO$GO.IDs[i]),'; ', fixed = TRUE))))
  day10_sig_pro_GOid <- rbind.fill(day10_sig_pro_GOid,day10_sig_pro_GOid_row)
}

#add chr and position info back to GO IDs
day10_sig_pro_GOid <- cbind(day10_GO[,c("chr","start","end")], day10_sig_pro_GOid[,-1])
#this results in a table with "chr","start","end" listed and each next column contains a GO ID that was listed with the protein in Uniprot DB.
#convert all factors to  characters
#silo39_sig_pro_GOid_term <- data.frame(lapply(silo39_sig_pro_GOid_term, as.character), stringsAsFactors = FALSE)
#proteins that don't have GO IDs?

#reshape data so that all GO ID columns are gathered in one column called "GO" 
STACKED_day10_sig_pro_GOid <- tidyr::gather(day10_sig_pro_GOid,"col","GO", 4:ncol(day10_sig_pro_GOid))
#exlude middle column which just contains the string "gene" in each row
STACKED_day10_sig_pro_GOid <- STACKED_day10_sig_pro_GOid[,-4]
#remove duplicate rows
STACKED_day10_sig_pro_GOid <- unique(STACKED_day10_sig_pro_GOid)

#remove any rows where GO column has NA value. 
STACKED_day10_sig_pro_GOid <- STACKED_day10_sig_pro_GOid[which(!is.na(STACKED_day10_sig_pro_GOid$GO)),]
#this resulting data frame has two columns "gene" and "GO"

#creating background gene list:

day10_bkgd_pro_GOid <- data.frame()
for (i in 1:nrow(day10_GO_bkgd[which(substr(day10_GO_bkgd$GO.IDs,1,2)=="GO"),])){
  day10_bkgd_pro_GOid_row <- data.frame(t(data.frame(strsplit(as.character(day10_GO_bkgd[which(substr(day10_GO_bkgd$GO.IDs,1,2)=="GO"),"GO.IDs"][i]),'; ', fixed = TRUE))))
  day10_bkgd_pro_GOid <- rbind.fill(day10_bkgd_pro_GOid,day10_bkgd_pro_GOid_row)
}

#add chr and position info back to GO IDs
day10_bkgd_pro_GOid <- cbind(day10_GO_bkgd[which(substr(day10_GO_bkgd$GO.IDs,1,2)=="GO"),c("chr","start","end")], day10_bkgd_pro_GOid[,-1])
#this results in a table with "chr","start","end" listed and each next column contains a GO ID that was listed with the protein in Uniprot DB.
#convert all factors to  characters
#silo39_sig_pro_GOid_term <- data.frame(lapply(silo39_sig_pro_GOid_term, as.character), stringsAsFactors = FALSE)
#proteins that don't have GO IDs?

#convert factor columns to character
class(day10_bkgd_pro_GOid[,4:ncol(day10_bkgd_pro_GOid)]) <- "character"

#reshape data so that all GO ID columns are gathered in one column called "GO" 
STACKED_day10_bkgd_pro_GOid <- tidyr::gather(day10_bkgd_pro_GOid,"col","GO", 4:ncol(day10_bkgd_pro_GOid))
#exlude middle column which just contains the string "gene" in each row
STACKED_day10_sig_pro_GOid <- STACKED_day10_sig_pro_GOid[,-4]
#remove duplicate rows
STACKED_day10_sig_pro_GOid <- unique(STACKED_day10_sig_pro_GOid)

#remove any rows where GO column has NA value. 
STACKED_day10_sig_pro_GOid <- STACKED_day10_sig_pro_GOid[which(!is.na(STACKED_day10_sig_pro_GOid$GO)),]
#this resulting data frame has two columns "gene" and "GO"

```