forked from kaitlynrm/RobertsLabNotebook
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ClusteringTechnicalReplicates.Rmd
209 lines (169 loc) · 8.78 KB
/
ClusteringTechnicalReplicates.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
---
title: "Clustering technical replicates"
author: "Shelly Trigg"
date: "1/11/2019"
output: rmarkdown::github_document
---
Load packages
```{r}
library(vegan)
library(ggplot2)
library(dplyr)
library(gtools)
```
Load Abacus data, parse out ADJNSAF values, and simplify column names to just sample number
```{r}
#upload data file
ABACUSdata <- read.csv("~/Documents/GitHub/OysterSeedProject/raw_data/ABACUS_output021417.tsv", sep = "\t", header=TRUE, stringsAsFactors = FALSE)
#select only columns containing ADJNSAF and Protein ID
ABACUSdata <- ABACUSdata[,c(1,grep("ADJNSAF", colnames(ABACUSdata)))]
## change column names in ABACUSdata to just sampleID
colnames(ABACUSdata) <- gsub(pattern = "X20161205_SAMPLE_", "", colnames(ABACUSdata))
colnames(ABACUSdata) <- gsub(pattern = "_ADJNSAF", "", colnames(ABACUSdata))
```
Load meta data file with temperature and day information
```{r}
#upload meta data; this was a csv file I create from Rhonda's notebook entry: https://github.com/Ellior2/Ellior2.github.io/blob/master/_posts/2017-3-11-NMDS-analysis.md
meta_data <- read.csv("~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/Rhonda_new_sample_names.csv", header = TRUE, stringsAsFactors = FALSE)
meta_data$silo <- substr(meta_data$Contents,5,5)
meta_data$day <- substr(meta_data$SampleName,5,6)
meta_data$SampleName <- gsub(pattern = "H","",meta_data$SampleName)
meta_data$SampleName <- gsub(pattern = "C","",meta_data$SampleName)
#create a temperature column
meta_data$temp <- "temp"
for(i in 1:nrow(meta_data)){
if(meta_data$silo[i] == "2"){
meta_data$temp[i] <- "23"
}
if(meta_data$silo[i] == "3"){
meta_data$temp[i] <- "23"
}
if(meta_data$silo[i] == "9"){
meta_data$temp[i] <- "29"
}
if(meta_data$silo[i] == "e"){
meta_data$temp[i] <- "16"
}
}
```
Reformat Abacus data for NMDS
```{r}
#Transpose- switch rows and columns
tABACUSdata <- t.data.frame(ABACUSdata[,-1])
colnames(tABACUSdata) <- ABACUSdata[,1]
tABACUSdata <- cbind(data.frame(rownames(tABACUSdata)),tABACUSdata)
colnames(tABACUSdata)[1] <- "SampleID"
#add meta data to abacus data
tABACUSdata <- merge(meta_data[,c(1,2,7,8)],tABACUSdata, by = "SampleID")
#Remove Silo 2 and day 15
silo3and9 <- tABACUSdata[which(substr(tABACUSdata$SampleName,1,2) != "S2" & tABACUSdata$day != "15"),]
#make rownames from Sample ID column so that the NMDS knows what's what
rownames(silo3and9) <- silo3and9$SampleID
#order the data frame by day and temperature so coloring the points on the plot is easier
silo3and9 <- silo3and9[order(as.numeric(silo3and9$day),silo3and9$temp),]
```
Determine if any proteins have zero ADJNSAF vals for all samples; this would be because they were in Silo 2, but not in Silo 3 or 9
```{r}
no_val_proteins <- silo3and9[,which(apply(silo3and9, 2, var) == 0)]
ncol(no_val_proteins)
```
Remove proteins if they have a zero value in all samples
```{r}
silo3and9_nozerovar <- silo3and9[,-c(1:4,which(colnames(silo3and9) %in% colnames(no_val_proteins)))]
#check to make sure it worked
ncol(silo3and9)-ncol(silo3and9_nozerovar)
```
For proteins with a zero value in any sample, replace with very small value
```{r}
silo3and9_nozerovar[silo3and9_nozerovar == 0.0000] <- 0.1000
```
```{r, echo = FALSE, eval=FALSE}
write.csv(cbind(silo3and9$SampleID, silo3and9$day, silo3and9$temp,silo3and9_nozerovar), "~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/silo3and9_nozerovals.csv", row.names = FALSE, quote = FALSE)
```
try PCA
```{r}
#first convert day and temp data to factors for plotting
silo3and9$day <- factor(silo3and9$day, levels = unique(silo3and9$day))
silo3and9$temp <- factor(silo3and9$temp, levels = unique(silo3and9$temp))
#run PCA
pca <- prcomp(silo3and9_nozerovar, center = T, scale = T)
pca_meta <- cbind(silo3and9$day, silo3and9$temp, data.frame(paste(silo3and9$day,silo3and9$temp, sep = "_")),pca$x)
colnames(pca_meta)[1:3] <- c("day","temp","SampleName")
ggplot(pca_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("PCA of ADJNSAF values where zeros were replaced with 0.1")
```
try PCA on log transformed values
```{r}
silo3and9_log <- log(silo3and9_nozerovar,2)
pca_log <- prcomp(silo3and9_log, center = F, scale = F)
pca_log_meta <- cbind(silo3and9$day, silo3and9$temp, data.frame(paste(silo3and9$day,silo3and9$temp, sep = "_")),pca_log$x)
colnames(pca_log_meta)[1:3] <- c("day","temp","SampleName")
ggplot(pca_log_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("PCA of log ADJNSAF values with zeros replaced with 0.1")
```
Make MDS dissimilarity matrix
```{r}
nmds.silo3and9 <- metaMDS(silo3and9_nozerovar, distance = 'euclidean', k = 2, trymax = 3000, autotransform = FALSE)
#make data frame of NMDS scores
nmds.silo3and9.scores <- cbind(silo3and9$day, silo3and9$temp,data.frame(scores(nmds.silo3and9)))
colnames(nmds.silo3and9.scores)[1:2] <- c("day","temp")
ggplot(nmds.silo3and9.scores, aes(NMDS1, NMDS2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("NMDS of ADJNSAF values with zeros replaced with 0.1")
```
Make MDS dissimilarity matrix with log tranformed ADJNSAF values
```{r}
nmds.silo3and9_log <- metaMDS(silo3and9_log, distance = 'euclidean', k = 2, trymax = 3000, autotransform = FALSE)
#make data frame of NMDS scores
nmds.silo3and9_log.scores <- cbind(silo3and9$day, silo3and9$temp,data.frame(scores(nmds.silo3and9_log)))
colnames(nmds.silo3and9_log.scores)[1:2] <- c("day","temp")
ggplot(nmds.silo3and9_log.scores, aes(NMDS1, NMDS2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("NMDS of log ADJNSAF values with zeros replaced with 0.1")
```
Make MDS dissimilarity matrix with log transformed (ADJNSAF values + 1) and bray curtis distance
```{r}
nmds.silo3and9_log_bray <- metaMDS(log(silo3and9[,-c(1:4,which(colnames(silo3and9) %in% colnames(no_val_proteins)))]+1,2), distance = 'bray', k = 2, trymax = 3000, autotransform = FALSE)
#make data frame of NMDS scores
nmds.silo3and9_log_bray.scores <- cbind(silo3and9$day, silo3and9$temp,data.frame(scores(nmds.silo3and9_log_bray)))
colnames(nmds.silo3and9_log_bray.scores)[1:2] <- c("day","temp")
ggplot(nmds.silo3and9_log_bray.scores, aes(NMDS1, NMDS2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("bray curtis NMDS of log (ADJNSAF values + 1)")
```
## Seems like technical replicates are pretty close according to either PCA, although a couple are questionable (e.g. 23C day 9, 23C day 11). NMDS plots don't seem as helpful as the PCA plots.
The ADJNSAF values for trypsin across samples seem fairly consistent as expected since this was manually added in during protein prep. This suggests that sample scaling should not be necessary.
```{r}
mean(silo3and9_nozerovar[,grep("TRYP",colnames(silo3and9_nozerovar))])
sd(silo3and9_nozerovar[,grep("TRYP",colnames(silo3and9_nozerovar))])
```
Looking at percent error in 23C day 9 replicates
```{r}
S3D9 <- ABACUSdata[,c("PROTID","28","28A")]
S3D9$percerr <- abs(S3D9$`28` - S3D9$`28A`)/S3D9$`28` * 100
S3D9$SD <- apply(S3D9[,c("28","28A")],1,sd)
S3D9$Diff <- abs(S3D9$`28` - S3D9$`28A`)
```
There are a number of proteins that show up in one technical replicate, sometimes with ADJNSAF values > 100, but then don't show up in the second technical replicate. This seems suspicious and perhaps these are unreliable proteins
```{r}
head(S3D9[order(S3D9$SD, decreasing = TRUE),], n = 20)
```
If we look at one of these proteins across all technical replicates, are there other samples that show unreliable detection?
```{r}
silo3and9[,c(1:4,grep("CHOYP_RLA1.5.12|m.15482", colnames(silo3and9)))]
```
In addition to S3D9, this protein is also not reiably detected in technical replicates from S9D5, S3D7, and S9D9.
## Next steps:
#- Investigate how wide spread a problem this is an determine if certain proteins need to be removed from the dataset due to unreliable detection across technical replicates
```{r, echo=FALSE, eval = FALSE}
###Check if any pair of replicates show 100% error or Inf% error with an SD of > 5; those are the potentially unrealiable proteins
ABACUSdata_SD <- ABACUSdata[,c("PROTID",mixedsort(colnames(ABACUSdata[,-1])))]
ABACUSdataSD <- data.frame(sapply(seq(2,ncol(ABACUSdata_SD),2), function(i) {
apply(ABACUSdata_SD[,c(i, i+1)],1,sd)
}))
colnames(ABACUSdataSD) <- mixedsort(colnames(ABACUSdata_SD[,c(-1,-grep("A",colnames(ABACUSdata_SD)))]))
new_colnames <- list()
for (i in 1:length(colnames(ABACUSdataSD))){
sample <- colnames(ABACUSdataSD)[i]
new_colnames[[i]] <- paste(meta_data[meta_data$SampleID == sample,"silo"],meta_data[meta_data$SampleID == sample, "day"], sep = "_")
}
colnames(ABACUSdataSD) <- new_colnames
#exclude silo 2
ABACUSdataSD <- ABACUSdataSD[,-grep("2",colnames(ABACUSdataSD))]
#Add protein IDs back in
ABACUSdata_SD <- cbind(data.frame(ABACUSdata_SD[,"PROTID"],stringsAsFactors = FALSE), ABACUSdataSD)
#determine which proteins have > 100 SD
largeSD <- ABACUSdata_SD[which(ABACUSdata_SD > 100),]
```