analyses/DMRs_heatmap/Oct29_MCmax30DMR_group_stats.Rmd

---
title: "Oct29_MCmax30DMR_group_stats"
author: "Shelly Trigg"
date: "10/29/2019"
output: rmarkdown::github_document
---

This script was run with Gannet mounted; the input files below are read from paths under `/Volumes/web/`.


load libraries
```{r}
library(gplots)
library(ggplot2)
library(dplyr)
library(broom)

```

read in data
```{r}
# Read the four filtered DMR result tables (tab-separated, with a header row).
# One table for the all-ambient comparison and one per sampling day
# (day 10, day 135, day 145).
oct24_MCmax30_DMRs_allAmb <- read.table(
  file = "/Volumes/web/metacarcinus/Pgenerosa/analyses/20191024/amb_AllTimes_DMR250bp_MCmax30_cov5x_rms_results_filtered.tsv",
  header = TRUE,
  sep = "\t"
)
oct24_MCmax30_DMRs_day10 <- read.table(
  file = "/Volumes/web/metacarcinus/Pgenerosa/analyses/20191024/day10_AllpH_DMR250bp_MCmax30_cov5x_rms_results_filtered.tsv",
  header = TRUE,
  sep = "\t"
)
oct24_MCmax30_DMRs_day135 <- read.table(
  file = "/Volumes/web/metacarcinus/Pgenerosa/analyses/20191024/day135_AllpH_DMR250bp_MCmax30_cov5x_rms_results_filtered.tsv",
  header = TRUE,
  sep = "\t"
)
oct24_MCmax30_DMRs_day145 <- read.table(
  file = "/Volumes/web/metacarcinus/Pgenerosa/analyses/20191024/day145_AllpH_DMR250bp_MCmax30_cov5x_rms_results_filtered.tsv",
  header = TRUE,
  sep = "\t"
)

```


Make a unique ID column in each data frame
```{r}
# Build a "<chr>:<start>-<end>" identifier for every DMR in each table, then
# collapse any "__...__...:" stretch of the identifier down to a bare ":".

# all-ambient sample comparison
oct24_MCmax30_DMRs_allAmb$ID <- gsub(
  "__.*__.*:", ":",
  paste0(
    oct24_MCmax30_DMRs_allAmb$chr, ":",
    oct24_MCmax30_DMRs_allAmb$start, "-",
    oct24_MCmax30_DMRs_allAmb$end
  )
)

# day 10 sample comparison
oct24_MCmax30_DMRs_day10$ID <- gsub(
  "__.*__.*:", ":",
  paste0(
    oct24_MCmax30_DMRs_day10$chr, ":",
    oct24_MCmax30_DMRs_day10$start, "-",
    oct24_MCmax30_DMRs_day10$end
  )
)

# day 135 sample comparison
oct24_MCmax30_DMRs_day135$ID <- gsub(
  "__.*__.*:", ":",
  paste0(
    oct24_MCmax30_DMRs_day135$chr, ":",
    oct24_MCmax30_DMRs_day135$start, "-",
    oct24_MCmax30_DMRs_day135$end
  )
)

# day 145 sample comparison
oct24_MCmax30_DMRs_day145$ID <- gsub(
  "__.*__.*:", ":",
  paste0(
    oct24_MCmax30_DMRs_day145$chr, ":",
    oct24_MCmax30_DMRs_day145$start, "-",
    oct24_MCmax30_DMRs_day145$end
  )
)


```

calculate group effect
```{r}
# Per-DMR test of a treatment-group effect on % methylation for the day 10
# samples. Steps: reshape to long format, label each sample with its group,
# plot the distributions, fit a one-way ANOVA and a Gamma GLM per DMR, and
# write both result tables to CSV in the working directory.

# ---- Reshape to long format ----
# Of the 13 selected columns (7:19), the first 12 are stacked into
# sample / perc.meth pairs; the remaining column is carried along and is the
# ID column used for faceting and grouping below.
day10_STACKED <- tidyr::gather(
  oct24_MCmax30_DMRs_day10[, 7:19], "sample", "perc.meth", 1:12
)

# Turn the stacked column names into numeric sample numbers
day10_STACKED$sample <- as.numeric(
  gsub("methylation_level_EPI\\.", "", day10_STACKED$sample)
)

# ---- Label each sample with its experimental treatment group ----
# Samples that are in none of the three sets keep an NA group.
low_ph_samples <- c(103, 104, 127, 128)
superlow_ph_samples <- c(111, 113, 143, 145)
ambient_samples <- c(119, 120, 135, 136)

day10_STACKED$group <- case_when(
  day10_STACKED$sample %in% low_ph_samples ~ "low.pH",
  day10_STACKED$sample %in% superlow_ph_samples ~ "superlow.pH",
  day10_STACKED$sample %in% ambient_samples ~ "amb",
  TRUE ~ NA_character_
)

# ---- Exploratory plots ----
# Distribution of % methylation by group, one facet per DMR
ggplot(data = day10_STACKED) +
  geom_violin(aes(y = perc.meth, x = group, fill = group)) +
  facet_wrap(~ID, scales = "free") +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
    axis.title = element_text(size = 12, face = "bold")
  )

# Distribution of % methylation over all DMRs and all samples
ggplot(day10_STACKED) +
  geom_histogram(
    aes(perc.meth, group = group, color = group, fill = group),
    bins = 10, position = "identity", alpha = 0.5
  ) +
  theme_bw()

# ---- One-way ANOVA per DMR ----
day10_aov <- day10_STACKED %>%
  group_by(ID) %>%
  do(meth_aov_models = aov(perc.meth ~ group, data = .))
# Summarize the ANOVA results (one row per DMR)
day10_aov_modelsumm <- glance(day10_aov, meth_aov_models)
# Write out the summarized ANOVA results
write.csv(
  day10_aov_modelsumm, "day10_MCmax30_aov_modelsumm.csv",
  row.names = FALSE, quote = FALSE
)

# ---- Gamma GLM per DMR ----
# The distribution is not normal, so a gaussian family is not used. A Gamma
# family cannot take zero values, so zeros are replaced by a very small number.
# See https://bookdown.org/ndphillips/YaRrr/regression-on-non-normal-data-with-glm.html
# for advice on which family to choose.

# Smallest non-zero % methylation in the data
min(day10_STACKED[which(day10_STACKED$perc.meth != 0), "perc.meth"], na.rm = TRUE)
#[1]  0.003496503

# Replace zeros with 0.0015, a value a little under half of the smallest
# non-zero value reported above. Only perc.meth is touched, and NA values are
# left as NA.
zero_replacement <- 0.0015
day10_STACKED_nozero <- day10_STACKED
is_zero_meth <- !is.na(day10_STACKED_nozero$perc.meth) &
  day10_STACKED_nozero$perc.meth == 0
day10_STACKED_nozero$perc.meth[is_zero_meth] <- zero_replacement

# Distribution of % methylation over all DMRs and samples after zero
# replacement. This plot has to come after day10_STACKED_nozero is created.
ggplot(day10_STACKED_nozero) +
  geom_histogram(
    aes(perc.meth, group = group, color = group, fill = group),
    bins = 20, position = "identity", alpha = 0.5
  ) +
  theme_bw()

# Full model (perc.meth ~ group) for each DMR, inverse link
day10_glm <- day10_STACKED_nozero %>%
  group_by(ID) %>%
  do(meth_glm_models = glm(perc.meth ~ group, family = Gamma(link = "inverse"), data = .))

# Intercept-only (null) model for each DMR, used to get a whole-model p-value
# https://stats.stackexchange.com/questions/129958/glm-in-r-which-pvalue-represents-the-goodness-of-fit-of-entire-model
day10_glm_0 <- day10_STACKED_nozero %>%
  group_by(ID) %>%
  do(meth_glm_0_models = glm(perc.meth ~ 1, family = Gamma(link = "inverse"), data = .))

day10_glm_modelsumm <- glance(day10_glm, meth_glm_models)
day10_glm_modelsumm_coeff <- tidy(day10_glm, meth_glm_models)

day10_glm_0_modelsumm <- glance(day10_glm_0, meth_glm_0_models)
day10_glm_0_modelsumm_coeff <- tidy(day10_glm_0, meth_glm_0_models)

# ---- Chi-square p-value for each GLM (null vs. full deviance) ----
# The two model tables are compared row by row, so they must list the same
# DMRs in the same order.
stopifnot(
  identical(as.character(day10_glm_0$ID), as.character(day10_glm$ID))
)

# NOTE(review): the deviance difference is used unscaled, as in the original
# analysis. For a Gamma family the dispersion is estimated, so a scaled test
# (e.g. anova(null, full, test = "F")) may be more appropriate -- confirm.
glm_p_values <- vapply(
  seq_along(day10_glm$meth_glm_models),
  function(i) {
    null_model <- day10_glm_0$meth_glm_0_models[[i]]
    full_model <- day10_glm$meth_glm_models[[i]]
    pchisq(
      deviance(null_model) - deviance(full_model),
      df.residual(null_model) - df.residual(full_model),
      lower.tail = FALSE
    )
  },
  numeric(1)
)

chi_total <- data.frame(
  DMR = day10_glm_0$ID,
  p.value = glm_p_values,
  stringsAsFactors = FALSE
)

# FDR correct. The method is given explicitly because p.adjust() defaults to
# "holm", which is a family-wise error correction rather than an FDR.
chi_total$FDR <- p.adjust(chi_total$p.value, method = "BH")

write.csv(
  chi_total, "day10_MCmax30_glm_Gammainv_modelsumm.csv",
  row.names = FALSE, quote = FALSE
)
```

create matrix for day10 samples
```{r}
# Pull the 12 sample columns out of the day 10 table, in the column order
# wanted for the heatmaps, and convert them to a matrix with one row per DMR.
# NOTE(review): the heatmap chunks colour the columns in three consecutive
# blocks of four -- confirm this column order matches that grouping.
day10_m <- oct24_MCmax30_DMRs_day10[, c(11:14, 7:8, 15:16, 9:10, 17:18)] %>%
  as.matrix()
# Label the rows with the DMR identifiers
rownames(day10_m) <- oct24_MCmax30_DMRs_day10$ID
```


What do the DMRs that are significant by ANOVA at p < 0.1 look like?
```{r}
# IDs of the DMRs whose ANOVA p-value is below 0.1
aov_sig_ids <- day10_aov_modelsumm$ID[which(day10_aov_modelsumm$p.value < 0.1)]

# Rows of the day 10 methylation matrix that belong to those DMRs
aov_0.1_d10_STACKED <- day10_m[rownames(day10_m) %in% aov_sig_ids, ]

# Column colour bar: three consecutive blocks of four columns
ColSideColors <- cbind(pH = rep(c("cyan", "plum2", "magenta"), each = 4))

# Row-scaled heatmap with columns kept in matrix order (no column clustering)
jpeg("day10_MCmax30DMR_aov0.1_heatmap.jpg", width = 800, height = 1000)
heatmap.2(
  aov_0.1_d10_STACKED,
  margins = c(5, 20),
  cexRow = 1.2,
  cexCol = 1,
  ColSideColors = ColSideColors,
  Colv = NA,
  col = bluered,
  na.color = "black",
  density.info = "none",
  trace = "none",
  scale = "row"
)
dev.off()

# Per-DMR violin plots of % methylation by group for the same DMRs
jpeg("day10_MCmax30DMR_aov0.1_boxplots.jpg", width = 1000, height = 700)
ggplot(data = day10_STACKED[day10_STACKED$ID %in% aov_sig_ids, ]) +
  geom_violin(aes(y = perc.meth, x = group, fill = group)) +
  facet_wrap(~ID, scales = "free") +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
    axis.title = element_text(size = 12, face = "bold")
  )
dev.off()

```

What do the DMRs that are significant by GLM at p < 0.1 (uncorrected) look like?
```{r}
# IDs of the DMRs whose uncorrected GLM chi-square p-value is below 0.1
glm_sig_ids <- chi_total$DMR[which(chi_total$p.value < 0.1)]

# Rows of the day 10 methylation matrix that belong to those DMRs
glm_0.1_d10_STACKED <- day10_m[rownames(day10_m) %in% glm_sig_ids, ]

# Column colour bar: three consecutive blocks of four columns
ColSideColors <- cbind(pH = rep(c("cyan", "plum2", "magenta"), each = 4))

# Row-scaled heatmap with columns kept in matrix order (no column clustering)
jpeg("day10_MCmax30DMR_glm0.1_heatmap.jpg", width = 800, height = 1000)
heatmap.2(
  glm_0.1_d10_STACKED,
  margins = c(5, 20),
  cexRow = 1.2,
  cexCol = 1,
  ColSideColors = ColSideColors,
  Colv = NA,
  col = bluered,
  na.color = "black",
  density.info = "none",
  trace = "none",
  scale = "row"
)
dev.off()

# Per-DMR violin plots of % methylation by group for the same DMRs
jpeg("day10_MCmax30DMR_glm0.1_boxplots.jpg", width = 1000, height = 700)
ggplot(data = day10_STACKED[day10_STACKED$ID %in% glm_sig_ids, ]) +
  geom_violin(aes(y = perc.meth, x = group, fill = group)) +
  facet_wrap(~ID, scales = "free") +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
    axis.title = element_text(size = 12, face = "bold")
  )
dev.off()
dev.off()

```