forked from jimmyodonnell/banzai
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dup_to_OTU_table.R
145 lines (109 loc) · 4.74 KB
/
dup_to_OTU_table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# translate duplicates to OTUs: was named collapse_dup_by_OTU
# arguments:
# 1: path to duplicate table (rows = sequence names, cols = sample names, cells = counts (integers))
# 2: dup to otu table (two columns; headers "Query" and "Match"; cells = sequence names of query and corresponding match from OTU clustering process)
# 3: otu table path (to be written)
# 20150813 removed fourth argument: 4: concatenated directory (obsolete?))
# was only used in one place: setwd(arguments[4])
# automated script:
arguments <- commandArgs(TRUE)
# load gtools to use function mixedsort
library(gtools)
# Read in duplicates files
# read.csv("dups.csv", row.names = 1)
dups <- read.csv(arguments[1], row.names = 1)
# as of NextSeq run in July 2015, dereplication had to be majorly overhauled, resulting in transposed duplicate table.
dups <- t(dups)
# Read in dups to OTUs files
dups_to_OTUs <- read.csv(arguments[2], header=TRUE, stringsAsFactor=FALSE)
OTUs <- dups_to_OTUs$Match[
match(
rownames(dups),
dups_to_OTUs$Query
)
]
# for each column (sample), sum the rows (duplicates) that belong to the same OTU
OTU_table <- aggregate(dups, list(OTUs), FUN = sum)
# Make the rownames the values stored in the new first colum
rownames(OTU_table) <- OTU_table[,1]
# remove that column
OTU_table <- OTU_table[,-1]
# sort by rowname
OTU_table <- OTU_table[mixedsort(rownames(OTU_table)),]
# write output to CSV file
write.csv(x = OTU_table, file = arguments[3], quote = FALSE)
# Most of this was an artifact of the weirdness of the old usearch output (uc format)
# unique(c(dup_to_OTU[,1], dup_to_OTU[,2]))
# dups_to_collapse <- split(dup_to_OTU[,1], dup_to_OTU[,2])
# dups_to_collapse <- lapply(dups_to_collapse, as.character)
# dups_to_collapse <- mapply(c, as.list(names(dups_to_collapse)), dups_to_collapse)
# dups_to_collapse <- sapply(dups_to_collapse, unique)
# dups_to_collapse <- sapply(dups_to_collapse, sort)
#
#
# no_clusters <- dups[!rownames(dups) %in% unique(unlist(dups_to_collapse)),]
#
# consolidated_dups <- list()
# for(i in 1:length(dups_to_collapse)){
# consolidated_dups[[i]] <- colSums(dups[dups_to_collapse[[i]],])
# }
# consolidated_dups <- do.call(rbind, consolidated_dups)
# rownames(consolidated_dups) <- sapply(dups_to_collapse, function(x) {x[[1]]})
#
# sort(as.numeric(gsub("DUP_", "", rownames(ALL_CLUSTERS))))
# ALL_CLUSTERS <- rbind(as.data.frame(consolidated_dups), no_clusters)
# order(rownames(ALL_CLUSTERS))
# write.csv(ALL_CLUSTERS, "all_clusters.csv")
# ALL_CLUSTERS <- read.csv("")
# confirm there are no duplicate otus:
# which(duplicated(rownames(ALL_CLUSTERS)))
# this one is weird, among many others
# dups_to_collapse[2416]
# write.table(dups_to_collapse, "dups_to_collapse.txt")
# dups_to_collapse_vec <- unlist(dups_to_collapse)
#
# dupped <- dups_to_collapse_vec[duplicated(dups_to_collapse_vec)]
# dups_removed <- dup_to_OTU[-which(dup_to_OTU[,2] %in% dupped),]
# dups_removed_tmp <- lapply(split(dups_removed[,1], dups_removed[,2]), as.character)
# dups_removed <- mapply(c, as.list(names(dups_removed_tmp)), dups_removed_tmp)
# which(duplicated(unlist(dups_removed)))
# X <- stack(setNames(dups_to_collapse, seq_along(dups_to_collapse)))
# TAB <- table(X)
# TAB.mat <- as.matrix(TAB)
# dup_rows <- TAB.mat[which(rowSums(TAB.mat) > 1),]
# dup_cols <- dup_rows[,which(colSums(dup_rows) > 0)]
# # dup_cols[,which(colSums(dup_cols) > 1)]
# identical(sort(rownames(dup_cols)),sort(dupped))
# # edit(dup_cols)
# dups_to_collapse[as.numeric(colnames(dup_cols))]
#
# TRASH:
# Read in files of chimaeras vs "not chimaeras"
# chimaeras <- read.table("chimaeras.txt")
# not_chimaeras <- read.table("not_chimaeras.txt")
# tail(chimaeras)
# dups_chimaeras <- dups[as.character(chimaeras[,1]),]
# dups_no_chimaeras <- dups[as.character(not_chimaeras[,1]),]
# WHAT THE FUCK IS GOING ON HERE???
# sapply(consolidated_otus, duplicated)
# consolidated_otus <- list()
# for(i in 1:nrow(dup_cols)){
# consolidated_otus[[i]] <- Reduce(union, dups_to_collapse[as.numeric(names(which(dup_cols[i,] > 0)))])
# }
# consol <- stack(setNames(consolidated_otus, seq_along(consolidated_otus)))
# consol.mat <- as.matrix(table(consol))
# colSums(consol.mat)
# duplicated(unlist(consolidated_otus))
# consolidated_2 <- list()
# for(i in 1:nrow(consol.mat)){
# consolidated_2[[i]] <- Reduce(union, consolidated_otus[as.numeric(names(which(consol.mat[i,] > 0)))])
# }
# consol2 <- stack(setNames(consolidated_2, seq_along(consolidated_2)))
# consol2.mat <- as.matrix(table(consol2))
# colSums(consol2.mat)
# consolidated_3 <- list()
# for(i in 1:nrow(consol.mat)){
# consolidated_3[[i]] <- Reduce(union, consolidated_2[as.numeric(names(which(consol2.mat[i,] > 0)))])
# }
# Reduce(union, dups_to_collapse[as.numeric(names(which(dup_cols[35,] > 0)))])
# Reduce(intersect, dups_to_collapse)