analysis/nmds_R/nmds_analysis_like_Emmas/ClusteringTechnicalReplicates.Rmd

---
title: "Clustering technical replicates"
author: "Shelly Trigg"
date: "1/11/2019"
output: rmarkdown::github_document
---

Load packages
```{r}
library(vegan)
library(ggplot2)
library(dplyr)
library(gtools)
```

Load Abacus data, parse out ADJNSAF values, and simplify column names to just sample number
```{r}
#upload data file
ABACUSdata <- read.csv("~/Documents/GitHub/OysterSeedProject/raw_data/ABACUS_output021417.tsv", sep = "\t", header=TRUE, stringsAsFactors = FALSE)
#select only columns containing ADJNSAF and Protein ID
ABACUSdata <- ABACUSdata[,c(1,grep("ADJNSAF", colnames(ABACUSdata)))]

## change column names in ABACUSdata to just sampleID
colnames(ABACUSdata) <- gsub(pattern = "X20161205_SAMPLE_", "", colnames(ABACUSdata))
colnames(ABACUSdata) <- gsub(pattern = "_ADJNSAF", "", colnames(ABACUSdata))
```

Load meta data file with temperature and day information
```{r}
#upload meta data; this was a csv file I create from Rhonda's notebook entry: https://github.com/Ellior2/Ellior2.github.io/blob/master/_posts/2017-3-11-NMDS-analysis.md
meta_data <- read.csv("~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/Rhonda_new_sample_names.csv", header = TRUE, stringsAsFactors = FALSE)
meta_data$silo <- substr(meta_data$Contents,5,5)
meta_data$day <- substr(meta_data$SampleName,5,6)
meta_data$SampleName <- gsub(pattern = "H","",meta_data$SampleName)
meta_data$SampleName <- gsub(pattern = "C","",meta_data$SampleName)
#create a temperature column
meta_data$temp <- "temp"
for(i in 1:nrow(meta_data)){
  if(meta_data$silo[i] == "2"){
    meta_data$temp[i] <- "23"
  }
  if(meta_data$silo[i] == "3"){
    meta_data$temp[i] <- "23"
  }
  if(meta_data$silo[i] == "9"){
    meta_data$temp[i] <- "29"
  }
  if(meta_data$silo[i] == "e"){
    meta_data$temp[i] <- "16"
  }
}
```

Reformat Abacus data for NMDS
```{r}
#Transpose- switch rows and columns
tABACUSdata <- t.data.frame(ABACUSdata[,-1])
colnames(tABACUSdata) <- ABACUSdata[,1]
tABACUSdata <- cbind(data.frame(rownames(tABACUSdata)),tABACUSdata)
colnames(tABACUSdata)[1] <- "SampleID"

#add meta data to abacus data
tABACUSdata <- merge(meta_data[,c(1,2,7,8)],tABACUSdata, by = "SampleID")

#Remove Silo 2 and day 15
silo3and9 <- tABACUSdata[which(substr(tABACUSdata$SampleName,1,2) != "S2" & tABACUSdata$day != "15"),]
#make rownames from Sample ID column so that the NMDS knows what's what
rownames(silo3and9) <- silo3and9$SampleID
#order the data frame by day and temperature so coloring the points on the plot is easier
silo3and9 <- silo3and9[order(as.numeric(silo3and9$day),silo3and9$temp),]
```

Determine if any proteins have zero ADJNSAF vals for all samples; this would be because they were in Silo 2, but not in Silo 3 or 9
```{r}
no_val_proteins <- silo3and9[,which(apply(silo3and9, 2, var) == 0)]
ncol(no_val_proteins)
```

Remove proteins if they have a zero value in all samples
```{r}
silo3and9_nozerovar <- silo3and9[,-c(1:4,which(colnames(silo3and9) %in% colnames(no_val_proteins)))]
#check to make sure it worked
ncol(silo3and9)-ncol(silo3and9_nozerovar)
```

For proteins with a zero value in any sample, replace with very small value
```{r}
silo3and9_nozerovar[silo3and9_nozerovar == 0.0000] <- 0.1000
```

```{r, echo = FALSE, eval=FALSE}
write.csv(cbind(silo3and9$SampleID, silo3and9$day, silo3and9$temp,silo3and9_nozerovar), "~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/silo3and9_nozerovals.csv", row.names = FALSE, quote = FALSE)


```

try PCA

```{r}
#first convert day and temp data to factors for plotting
silo3and9$day <- factor(silo3and9$day, levels = unique(silo3and9$day))
silo3and9$temp <- factor(silo3and9$temp, levels = unique(silo3and9$temp))

#run PCA
pca <- prcomp(silo3and9_nozerovar, center = T, scale = T)
pca_meta <- cbind(silo3and9$day, silo3and9$temp, data.frame(paste(silo3and9$day,silo3and9$temp, sep = "_")),pca$x)
colnames(pca_meta)[1:3] <- c("day","temp","SampleName")
ggplot(pca_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("PCA of ADJNSAF values where zeros were replaced with 0.1")
```

try PCA on log transformed values
```{r}
silo3and9_log <- log(silo3and9_nozerovar,2)
pca_log <- prcomp(silo3and9_log, center = F, scale = F)
pca_log_meta <- cbind(silo3and9$day, silo3and9$temp, data.frame(paste(silo3and9$day,silo3and9$temp, sep = "_")),pca_log$x)
colnames(pca_log_meta)[1:3] <- c("day","temp","SampleName")
ggplot(pca_log_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("PCA of log ADJNSAF values with zeros replaced with 0.1")
```

Make MDS dissimilarity matrix
```{r}
nmds.silo3and9 <- metaMDS(silo3and9_nozerovar, distance = 'euclidean', k = 2, trymax = 3000, autotransform = FALSE)

#make data frame of NMDS scores
nmds.silo3and9.scores <- cbind(silo3and9$day, silo3and9$temp,data.frame(scores(nmds.silo3and9)))
colnames(nmds.silo3and9.scores)[1:2] <- c("day","temp")
ggplot(nmds.silo3and9.scores, aes(NMDS1, NMDS2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("NMDS of ADJNSAF values with zeros replaced with 0.1")
```

Make MDS dissimilarity matrix with log tranformed ADJNSAF values
```{r}
nmds.silo3and9_log <- metaMDS(silo3and9_log, distance = 'euclidean', k = 2, trymax = 3000, autotransform = FALSE)

#make data frame of NMDS scores
nmds.silo3and9_log.scores <- cbind(silo3and9$day, silo3and9$temp,data.frame(scores(nmds.silo3and9_log)))
colnames(nmds.silo3and9_log.scores)[1:2] <- c("day","temp")
ggplot(nmds.silo3and9_log.scores, aes(NMDS1, NMDS2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("NMDS of log ADJNSAF values with zeros replaced with 0.1")
```

Make MDS dissimilarity matrix with log transformed (ADJNSAF values + 1) and bray curtis distance
```{r}
nmds.silo3and9_log_bray <- metaMDS(log(silo3and9[,-c(1:4,which(colnames(silo3and9) %in% colnames(no_val_proteins)))]+1,2), distance = 'bray', k = 2, trymax = 3000, autotransform = FALSE)

#make data frame of NMDS scores
nmds.silo3and9_log_bray.scores <- cbind(silo3and9$day, silo3and9$temp,data.frame(scores(nmds.silo3and9_log_bray)))
colnames(nmds.silo3and9_log_bray.scores)[1:2] <- c("day","temp")
ggplot(nmds.silo3and9_log_bray.scores, aes(NMDS1, NMDS2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("bray curtis NMDS of log (ADJNSAF values + 1)")
```

## Seems like technical replicates are pretty close according to either PCA, although a couple are questionable (e.g. 23C day 9, 23C day 11). NMDS plots don't seem as helpful as the PCA plots.

The ADJNSAF values for trypsin across samples seem fairly consistent as expected since this was manually added in during protein prep.  This suggests that sample scaling should not be necessary.

```{r}
mean(silo3and9_nozerovar[,grep("TRYP",colnames(silo3and9_nozerovar))])
sd(silo3and9_nozerovar[,grep("TRYP",colnames(silo3and9_nozerovar))])
```

Looking at percent error in 23C day 9 replicates
```{r}
S3D9 <- ABACUSdata[,c("PROTID","28","28A")]
S3D9$percerr <- abs(S3D9$`28` - S3D9$`28A`)/S3D9$`28` * 100
S3D9$SD <- apply(S3D9[,c("28","28A")],1,sd)
S3D9$Diff <- abs(S3D9$`28` - S3D9$`28A`)
```

There are a number of proteins that show up in one technical replicate, sometimes with ADJNSAF values > 100, but then don't show up in the second technical replicate. This seems suspicious and perhaps these are unreliable proteins 
```{r}
head(S3D9[order(S3D9$SD, decreasing = TRUE),], n = 20)
```

If we look at one of these proteins across all technical replicates, are there other samples that show unreliable detection?

```{r}
silo3and9[,c(1:4,grep("CHOYP_RLA1.5.12|m.15482", colnames(silo3and9)))]
```

In addition to S3D9, this protein is also not reiably detected in technical replicates from S9D5, S3D7, and S9D9. 

## Next steps:
#- Investigate how wide spread a problem this is an determine if certain proteins need to be removed from the dataset due to unreliable detection across technical replicates

```{r, echo=FALSE, eval = FALSE}
###Check if any pair of replicates show 100% error or Inf% error with an SD of > 5; those are the potentially unrealiable proteins

ABACUSdata_SD <- ABACUSdata[,c("PROTID",mixedsort(colnames(ABACUSdata[,-1])))]

ABACUSdataSD <- data.frame(sapply(seq(2,ncol(ABACUSdata_SD),2), function(i) {
    apply(ABACUSdata_SD[,c(i, i+1)],1,sd)
  }))

colnames(ABACUSdataSD) <- mixedsort(colnames(ABACUSdata_SD[,c(-1,-grep("A",colnames(ABACUSdata_SD)))]))
new_colnames <- list()
for (i in 1:length(colnames(ABACUSdataSD))){
  sample <- colnames(ABACUSdataSD)[i]
  new_colnames[[i]] <- paste(meta_data[meta_data$SampleID == sample,"silo"],meta_data[meta_data$SampleID == sample, "day"], sep = "_")
  }

colnames(ABACUSdataSD) <- new_colnames

#exclude silo 2
ABACUSdataSD <- ABACUSdataSD[,-grep("2",colnames(ABACUSdataSD))]

#Add protein IDs back in 
ABACUSdata_SD <- cbind(data.frame(ABACUSdata_SD[,"PROTID"],stringsAsFactors = FALSE), ABACUSdataSD)

#determine which proteins have > 100 SD
largeSD <- ABACUSdata_SD[which(ABACUSdata_SD > 100),]
```