Shelly Trigg 1/11/2019
Load packages
library(vegan)
## Warning: package 'vegan' was built under R version 3.4.4
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gtools)
## Warning: package 'gtools' was built under R version 3.4.4
##
## Attaching package: 'gtools'
## The following object is masked from 'package:permute':
##
## permute
Load Abacus data, parse out ADJNSAF values, and simplify column names to just sample number
#upload data file
ABACUSdata <- read.csv("~/Documents/GitHub/OysterSeedProject/raw_data/ABACUS_output021417.tsv", sep = "\t", header=TRUE, stringsAsFactors = FALSE)
#select only columns containing ADJNSAF and Protein ID
ABACUSdata <- ABACUSdata[,c(1,grep("ADJNSAF", colnames(ABACUSdata)))]
## change column names in ABACUSdata to just sampleID
colnames(ABACUSdata) <- gsub(pattern = "X20161205_SAMPLE_", "", colnames(ABACUSdata))
colnames(ABACUSdata) <- gsub(pattern = "_ADJNSAF", "", colnames(ABACUSdata))
Load meta data file with temperature and day information
#upload meta data; this was a csv file I create from Rhonda's notebook entry: https://github.com/Ellior2/Ellior2.github.io/blob/master/_posts/2017-3-11-NMDS-analysis.md
meta_data <- read.csv("~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/Rhonda_new_sample_names.csv", header = TRUE, stringsAsFactors = FALSE)
meta_data$silo <- substr(meta_data$Contents,5,5)
meta_data$day <- substr(meta_data$SampleName,5,6)
meta_data$SampleName <- gsub(pattern = "H","",meta_data$SampleName)
meta_data$SampleName <- gsub(pattern = "C","",meta_data$SampleName)
#create a temperature column
meta_data$temp <- "temp"
for(i in 1:nrow(meta_data)){
if(meta_data$silo[i] == "2"){
meta_data$temp[i] <- "23"
}
if(meta_data$silo[i] == "3"){
meta_data$temp[i] <- "23"
}
if(meta_data$silo[i] == "9"){
meta_data$temp[i] <- "29"
}
if(meta_data$silo[i] == "e"){
meta_data$temp[i] <- "16"
}
}
Reformat Abacus data for NMDS
#Transpose- switch rows and columns
tABACUSdata <- t.data.frame(ABACUSdata[,-1])
colnames(tABACUSdata) <- ABACUSdata[,1]
tABACUSdata <- cbind(data.frame(rownames(tABACUSdata)),tABACUSdata)
colnames(tABACUSdata)[1] <- "SampleID"
#add meta data to abacus data
tABACUSdata <- merge(meta_data[,c(1,2,7,8)],tABACUSdata, by = "SampleID")
#Remove Silo 2 and day 15
silo3and9 <- tABACUSdata[which(substr(tABACUSdata$SampleName,1,2) != "S2" & tABACUSdata$day != "15"),]
#make rownames from Sample ID column so that the NMDS knows what's what
rownames(silo3and9) <- silo3and9$SampleID
#order the data frame by day and temperature so coloring the points on the plot is easier
silo3and9 <- silo3and9[order(as.numeric(silo3and9$day),silo3and9$temp),]
Determine if any proteins have zero ADJNSAF vals for all samples; this would be because they were in Silo 2, but not in Silo 3 or 9
no_val_proteins <- silo3and9[,which(apply(silo3and9, 2, var) == 0)]
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
ncol(no_val_proteins)
## [1] 451
Remove proteins if they have a zero value in all samples
silo3and9_nozerovar <- silo3and9[,-c(1:4,which(colnames(silo3and9) %in% colnames(no_val_proteins)))]
#check to make sure it worked
ncol(silo3and9)-ncol(silo3and9_nozerovar)
## [1] 455
For proteins with a zero value in any sample, replace with very small value
silo3and9_nozerovar[silo3and9_nozerovar == 0.0000] <- 0.1000
try PCA
#first convert day and temp data to factors for plotting
silo3and9$day <- factor(silo3and9$day, levels = unique(silo3and9$day))
silo3and9$temp <- factor(silo3and9$temp, levels = unique(silo3and9$temp))
#run PCA
pca <- prcomp(silo3and9_nozerovar, center = T, scale = T)
pca_meta <- cbind(silo3and9$day, silo3and9$temp, data.frame(paste(silo3and9$day,silo3and9$temp, sep = "_")),pca$x)
colnames(pca_meta)[1:3] <- c("day","temp","SampleName")
ggplot(pca_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("PCA of ADJNSAF values where zeros were replaced with 0.1")
try PCA on log transformed values
silo3and9_log <- log(silo3and9_nozerovar,2)
pca_log <- prcomp(silo3and9_log, center = F, scale = F)
pca_log_meta <- cbind(silo3and9$day, silo3and9$temp, data.frame(paste(silo3and9$day,silo3and9$temp, sep = "_")),pca_log$x)
colnames(pca_log_meta)[1:3] <- c("day","temp","SampleName")
ggplot(pca_log_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("PCA of log ADJNSAF values with zeros replaced with 0.1")
Make MDS dissimilarity matrix
nmds.silo3and9 <- metaMDS(silo3and9_nozerovar, distance = 'euclidean', k = 2, trymax = 3000, autotransform = FALSE)
## Run 0 stress 0.1649526
## Run 1 stress 0.1741347
## Run 2 stress 0.186594
## Run 3 stress 0.1649516
## ... New best solution
## ... Procrustes: rmse 0.0003515769 max resid 0.001450181
## ... Similar to previous best
## Run 4 stress 0.2693699
## Run 5 stress 0.1631588
## ... New best solution
## ... Procrustes: rmse 0.03627344 max resid 0.1309964
## Run 6 stress 0.2387604
## Run 7 stress 0.1832391
## Run 8 stress 0.2105309
## Run 9 stress 0.2593247
## Run 10 stress 0.1627151
## ... New best solution
## ... Procrustes: rmse 0.01441198 max resid 0.05956846
## Run 11 stress 0.1763837
## Run 12 stress 0.1874621
## Run 13 stress 0.1631589
## ... Procrustes: rmse 0.01448213 max resid 0.06021317
## Run 14 stress 0.1807038
## Run 15 stress 0.1741249
## Run 16 stress 0.1649514
## Run 17 stress 0.2013429
## Run 18 stress 0.1627157
## ... Procrustes: rmse 0.0003196832 max resid 0.00128576
## ... Similar to previous best
## Run 19 stress 0.1649515
## Run 20 stress 0.1649518
## *** Solution reached
#make data frame of NMDS scores
nmds.silo3and9.scores <- cbind(silo3and9$day, silo3and9$temp,data.frame(scores(nmds.silo3and9)))
colnames(nmds.silo3and9.scores)[1:2] <- c("day","temp")
ggplot(nmds.silo3and9.scores, aes(NMDS1, NMDS2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("NMDS of ADJNSAF values with zeros replaced with 0.1")
Make MDS dissimilarity matrix with log tranformed ADJNSAF values
nmds.silo3and9_log <- metaMDS(silo3and9_log, distance = 'euclidean', k = 2, trymax = 3000, autotransform = FALSE)
## 'comm' has negative data: 'autotransform', 'noshare' and 'wascores' set to FALSE
## Run 0 stress 0.1123882
## Run 1 stress 0.1282567
## Run 2 stress 0.1229455
## Run 3 stress 0.1124371
## ... Procrustes: rmse 0.004849062 max resid 0.01680513
## Run 4 stress 0.1230615
## Run 5 stress 0.1230617
## Run 6 stress 0.1155855
## Run 7 stress 0.1124371
## ... Procrustes: rmse 0.004840493 max resid 0.01677708
## Run 8 stress 0.1285297
## Run 9 stress 0.1335596
## Run 10 stress 0.1124366
## ... Procrustes: rmse 0.00480659 max resid 0.01678008
## Run 11 stress 0.1279615
## Run 12 stress 0.1230613
## Run 13 stress 0.1123888
## ... Procrustes: rmse 0.0004994801 max resid 0.001604861
## ... Similar to previous best
## Run 14 stress 0.1263476
## Run 15 stress 0.1124365
## ... Procrustes: rmse 0.004745711 max resid 0.01668024
## Run 16 stress 0.1285279
## Run 17 stress 0.127836
## Run 18 stress 0.1124378
## ... Procrustes: rmse 0.004873351 max resid 0.0168166
## Run 19 stress 0.1460328
## Run 20 stress 0.131862
## *** Solution reached
#make data frame of NMDS scores
nmds.silo3and9_log.scores <- cbind(silo3and9$day, silo3and9$temp,data.frame(scores(nmds.silo3and9_log)))
colnames(nmds.silo3and9_log.scores)[1:2] <- c("day","temp")
ggplot(nmds.silo3and9_log.scores, aes(NMDS1, NMDS2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("NMDS of log ADJNSAF values with zeros replaced with 0.1")
Make MDS dissimilarity matrix with log transformed (ADJNSAF values + 1) and bray curtis distance
nmds.silo3and9_log_bray <- metaMDS(log(silo3and9[,-c(1:4,which(colnames(silo3and9) %in% colnames(no_val_proteins)))]+1,2), distance = 'bray', k = 2, trymax = 3000, autotransform = FALSE)
## Run 0 stress 0.1245128
## Run 1 stress 0.1357539
## Run 2 stress 0.1397267
## Run 3 stress 0.1304776
## Run 4 stress 0.150567
## Run 5 stress 0.1357538
## Run 6 stress 0.1261319
## Run 7 stress 0.1253263
## Run 8 stress 0.1540754
## Run 9 stress 0.1514475
## Run 10 stress 0.1296366
## Run 11 stress 0.1261315
## Run 12 stress 0.1261314
## Run 13 stress 0.1525995
## Run 14 stress 0.1245128
## ... Procrustes: rmse 4.332235e-05 max resid 0.0001472904
## ... Similar to previous best
## Run 15 stress 0.1261314
## Run 16 stress 0.135754
## Run 17 stress 0.1394978
## Run 18 stress 0.1261312
## Run 19 stress 0.1261329
## Run 20 stress 0.1310119
## *** Solution reached
#make data frame of NMDS scores
nmds.silo3and9_log_bray.scores <- cbind(silo3and9$day, silo3and9$temp,data.frame(scores(nmds.silo3and9_log_bray)))
colnames(nmds.silo3and9_log_bray.scores)[1:2] <- c("day","temp")
ggplot(nmds.silo3and9_log_bray.scores, aes(NMDS1, NMDS2)) + geom_point(aes(col = day, shape = temp)) + theme_bw() + ggtitle("bray curtis NMDS of log (ADJNSAF values + 1)")
Average NSAF values
df_avg <- data.frame()
#loop through the data and calculate the mean between replicates for each protein
for (i in seq(1,nrow(silo3and9_nozerovar),2)){
#this calculates the SD for each odd number row and the row following it
df_avg_row <- apply(silo3and9_nozerovar[c(i,i+1),],2,mean)
#this sequencially combines rows of data together after the SD is generated
df_avg <- rbind(df_avg, df_avg_row)
}
#add column names to SD data
colnames(df_avg) <- colnames(silo3and9_nozerovar)
df_avg$Sample.ID <- rownames(silo3and9_nozerovar[-grep("A",rownames(silo3and9_nozerovar)),])
export filtered protein data set
Seems like technical replicates are pretty close according to either PCA, although a couple are questionable (e.g. 23C day 9, 23C day 11). NMDS plots don't seem as helpful as the PCA plots.
The ADJNSAF values for trypsin across samples seem fairly consistent as expected since this was manually added in during protein prep. This suggests that sample scaling should not be necessary.
mean(silo3and9_nozerovar[,grep("TRYP",colnames(silo3and9_nozerovar))])
## [1] 24.39327
sd(silo3and9_nozerovar[,grep("TRYP",colnames(silo3and9_nozerovar))])
## [1] 3.803557
Looking at percent error in 23C day 9 replicates
S3D9 <- ABACUSdata[,c("PROTID","28","28A")]
S3D9$percerr <- abs(S3D9$`28` - S3D9$`28A`)/S3D9$`28` * 100
S3D9$SD <- apply(S3D9[,c("28","28A")],1,sd)
S3D9$Diff <- abs(S3D9$`28` - S3D9$`28A`)
There are a number of proteins that show up in one technical replicate, sometimes with ADJNSAF values > 100, but then don't show up in the second technical replicate. This seems suspicious and perhaps these are unreliable proteins
head(S3D9[order(S3D9$SD, decreasing = TRUE),], n = 20)
## PROTID 28 28A percerr
## 4759 CHOYP_LOC100890099.1.1|m.11505 798.6587 0.0000 100.00000
## 3277 CHOYP_CALM.22.50|m.31010 485.1982 0.0000 100.00000
## 5076 CHOYP_TBB4B.1.7|m.12262 297.2170 598.8666 101.49137
## 3786 CHOYP_LDLR.6.7|m.57984 469.9926 220.1602 53.15667
## 4905 CHOYP_RLA1.5.12|m.15482 0.0000 245.9438 Inf
## 4593 CHOYP_MYSP.4.9|m.2302 734.9748 967.9571 31.69936
## 4305 CHOYP_LOC101072646.1.1|m.8800 390.5889 168.3118 56.90820
## 2978 CHOYP_BRAFLDRAFT_121562.1.1|m.37268 295.8526 80.0234 72.95160
## 4761 CHOYP_ACT.23.27|m.58454 0.0000 198.6892 Inf
## 2923 CHOYP_RS3A.4.15|m.15367 174.5522 0.0000 100.00000
## 81 CHOYP_H3.1.2|m.3829 186.6147 20.3155 89.11367
## 5114 CHOYP_TPM.17.19|m.63919 0.0000 154.3981 Inf
## 1728 CHOYP_NPC2.6.7|m.31030 432.4313 295.4860 31.66868
## 4395 CHOYP_LOC587133.1.1|m.61281 401.9394 267.2274 33.51550
## 4893 CHOYP_RL8.10.10|m.65636 0.0000 134.1296 Inf
## 4585 CHOYP_MLE.1.9|m.15462 361.4183 234.5287 35.10879
## 3223 CHOYP_BRAFLDRAFT_208436.4.32|m.22689 125.2283 0.0000 100.00000
## 732 CHOYP_LOC100375551.1.2|m.27489 174.4442 50.3472 71.13851
## 2770 CHOYP_14332.2.2|m.61737 164.3987 41.3566 74.84372
## 5063 CHOYP_TBA1.2.9|m.9962 0.0000 118.2623 Inf
## SD Diff
## 4759 564.73698 798.6587
## 3277 343.08694 485.1982
## 5076 213.29848 301.6496
## 3786 176.65818 249.8324
## 4905 173.90853 245.9438
## 4593 164.74336 232.9823
## 4305 157.17364 222.2771
## 2978 152.61429 215.8292
## 4761 140.49448 198.6892
## 2923 123.42704 174.5522
## 81 117.59129 166.2992
## 5114 109.17594 154.3981
## 1728 96.83495 136.9453
## 4395 95.25577 134.7120
## 4893 94.84395 134.1296
## 4585 89.72450 126.8896
## 3223 88.54978 125.2283
## 732 87.74983 124.0970
## 2770 87.00390 123.0421
## 5063 83.62407 118.2623
If we look at one of these proteins across all technical replicates, are there other samples that show unreliable detection?
silo3and9[,c(1:4,grep("CHOYP_RLA1.5.12|m.15482", colnames(silo3and9)))]
## SampleID SampleName day temp CHOYP_RLA1.5.12|m.15482
## 1 1 pool0 0 16 0.0000
## 1A 1A pool0 0 16 0.0000
## 4 4 S3D3 3 23 0.0000
## 4A 4A S3D3 3 23 0.0000
## 8 8 S9D3 3 29 0.0000
## 8A 8A S9D3 3 29 0.0000
## 12 12 S3D5 5 23 197.1867
## 12A 12A S3D5 5 23 212.0639
## 16 16 S9D5 5 29 180.6154
## 16A 16A S9D5 5 29 0.0000
## 20 20 S3D7 7 23 225.7157
## 20A 20A S3D7 7 23 0.0000
## 24 24 S9D7 7 29 238.9554
## 24A 24A S9D7 7 29 272.1093
## 28 28 S3D9 9 23 0.0000
## 28A 28A S3D9 9 23 245.9438
## 32 32 S9D9 9 29 218.2319
## 32A 32A S9D9 9 29 0.0000
## 36 36 S3D11 11 23 0.0000
## 36A 36A S3D11 11 23 0.0000
## 40 40 S9D11 11 29 0.0000
## 40A 40A S9D11 11 29 0.0000
## 44 44 S3D13 13 23 0.0000
## 44A 44A S3D13 13 23 0.0000
## 48 48 S9D13 13 29 0.0000
## 48A 48A S9D13 13 29 0.0000
In addition to S3D9, this protein is also not reiably detected in technical replicates from S9D5, S3D7, and S9D9.
- Investigate how wide spread a problem this is an determine if certain proteins need to be removed from the dataset due to unreliable detection across technical replicates ============================================================================================================================================================================