-
Notifications
You must be signed in to change notification settings - Fork 2
/
RADome.R
executable file
·279 lines (212 loc) · 11.1 KB
/
RADome.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#! /usr/bin/Rscript
args <- commandArgs(TRUE)
#Exception if no argument is provided
if(length(args) < 1) {
args <- "--help"
}
###########################################################################################
## Help section
if("--help" %in% args) {
cat("
RADOME PARSER
Builds a synthetic genome composed of scaffolds beginning with a single-end RAD-tag
and ending with the corresponding paired-end contig, padded out with Ns up to the
average fragment length.
Input files are:
- single-end: output from Stacks export_sql.pl script, tsv format, with -F pe=1
- paired-end: output from Stacks exec_velvet.pl script
(Requires package seqinr)
Arguments:
--single= path to the single-end export file
--paired= path to the paired-end contigs file
--out= path to output file
--rev_comp if specified, the paired-end contigs will be reverse-complemented
--minDP= minimum coverage to retain the paired-end contig [default=1]
--max_length= maximum allowed paired-end contig length [default: no limit]
--length= scaffold final length [default=5 N insert]
--force_match if specified, name of paired-ends will not be checked (this may lead to false assembly)
--help print this text
Example:
./RADgenome_parse.R --single=~/path/to/export.tsv --paired=~/path/to/collated.fa --out=~/path/to/genome.fa--minDP=1 --length=750\n\n")
q(save="no")
}
###########################################################################################
#Standard argument parser
parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
argsL <- as.list(as.character(argsDF$V2))
names(argsL) <- argsDF$V1
## --single error
if(is.null(argsL$single)) {
stop("No single-end input file has been provided")}
## --paired error
if(is.null(argsL$paired)) {
stop("No paired-end input file has been provided")}
## --out error
if(is.null(argsL$out)) {
stop("No output file has been provided")}
##--rev_comp default
if(is.null(argsL$rev_comp)) {
RC_flag <- 0
} else { RC_flag <- 1 }
## --minDP default
if(is.null(argsL$minDP)) {
minDP <- 1
} else { as.numeric(argsL$minDP) -> minDP }
## --length default
if(is.null(argsL$length)) {
fragment_length <- 1500
length_switch <- 0
} else { as.numeric(argsL$length) -> fragment_length
length_switch <- 1 }
## --max_length default
if(is.null(argsL$max_length)) {
max_length <- 0
} else { as.numeric(argsL$max_length) -> max_length }
##--force_match default
if(is.null(argsL$force_match)) {
force_flag <- 0
} else { force_flag <- 1 }
argsL$single -> path_single.raw
argsL$paired -> path_paired.raw
argsL$out -> path_out
###########################################################################################
#Check whether the data files exist
if(file.exists(eval(path_single.raw))==FALSE){
stop("Single-end input file doesn't exist")}
if(file.exists(eval(path_paired.raw))==FALSE){
stop("Paired-end input file doesn't exist")}
###########################################################################################
#Display recap, conditionnal on whether arguments have been provided or take their default values.
cat(paste("\n\tSingle-end data file:\t\t", eval(path_single.raw),"\n\tPaired-end data file:\t\t", eval(path_paired.raw),"\n\tOutput file:\t\t\t", eval(path_out), "\n", sep="", collapse=""))
if(is.null(argsL$rev_comp)){
cat("\tPaired-end contigs:\t\talready in-line\n")
} else {
cat("\tPaired-end contigs:\t\treverse-complement\n")}
if(is.null(argsL$minDP)) {
cat("\tMinimum paired-end coverage:\t1x (default)\n")
} else {
cat(paste("\tMin. paired-end coverage:\t", eval(minDP), "x\n", sep="", collapse=""))}
if(is.null(argsL$max_length)) {
cat("\tMaximum contig length:\t\tno limit\n")
} else {
cat(paste("\tMax. paired-end contig length:\t", eval(max_length), "bp\n", sep="", collapse=""))}
if(is.null(argsL$length)) {
cat("\tOutput scaffold length:\t\tvariable\n")
} else {
cat(paste("\tOutput scaffold length:\t\t", eval(fragment_length), "bp\n", sep="", collapse=""))}
if(is.null(argsL$force_match)){
cat("\tCheck name matches:\t\tyes\n")
} else {
cat("\tCheck name matches:\t\twill not check names\n")}
###########################################################################################
#Function "rev.comp" applied on a DNA sequence as a string. Requires the "comp" function from seqinr package.
#Used only if --rev_comp flag is used
library(seqinr)
rev.comp <- function(x){paste(rev(comp(unlist(strsplit(x,"")), forceToLower=F)), collapse="")}
###########################################################################################
#Setting up the work environment
dirname(path_out)->workdir
system(paste("mkdir -p ", workdir, "/tmp", sep="", collapse="")) #creates a "tmp" folder in the output directory
#Formatting the collated.fa file into a standard no-newline fasta format
system(paste("cat ", eval(path_paired.raw), " | tr -d '\\n' | sed -E 's/>/\\n>/g' | sed -E 's/([0-9]+)([ACTG]{20})/\\1\\t\\2/g' | tr -d '>' | tr '|' '\\t' | tr '_' '\\t' | cut -f 1,3,5,7,8 >", eval(workdir), "/tmp/collated.df", sep="", collapse=""))
#Formatting the export_sql.pl file. Don't forget the -F pe=1 flag to export only reads with a pe contig, to speed things up (shouldn't crash otherwise, however)
system(paste("cat ", eval(path_single.raw), " | cut -f 1,5 > ", eval(workdir), "/tmp/single.df", sep="", collapse=""))
paste(eval(workdir), "/tmp/single.df", sep="", collapse="")->path_single
paste(eval(workdir), "/tmp/collated.df", sep="", collapse="")->path_paired
###########################################################################################
#PARAMETERS FOR HARDCODING / EXECUTING FROM R CONSOLE
#Get the path of work directory and both input files (if harcoded and run from R)
#path_single <- "~/paired/king/tmp/single.df"
#path_paired <- "~/paired/king/tmp/collated.df"
#Set parameters (if harcoded and run from R)
#Min sequencing coverage to retain a paired-end contig
#minDP <- 5
#Output fragment length
#fragment_length <- 1024
#Extracts the folder above the working folder (assumed to be a temp folder)
#output_dir <- paste(strsplit(dirname(path_single),"/")[[1]][-(length(strsplit(dirname(path_single),"/")[[1]]))], collapse="/")
#path_output <- paste(output_dir, "/rad_genome.fa", sep="", collpase="")
###########################################################################################
cat("\n\tFormatting input files...")
#Import the single-end reads in R from export_sql.pl.
#Prior scan to speed up the import
tab2rows <- read.table(eval(path_single), header = FALSE, sep='\t', nrows = 2)
classes <- c('numeric','character')
nrow_single<-(as.numeric(strsplit(system(paste("wc -l", eval(path_single), collapse=""), intern=T),' ')[[1]][1]))-1
#Import and format
read.table(eval(path_single), header=FALSE, sep='\t', colClasses = classes, nrows=nrow_single)->single
colnames(single)<-c('id','sequence')
toupper(single$sequence)->single$sequence #corrects the occasional case mistakes
#Import the paired-end contigs in R
#Prior scan to speed up the import
tab2rows <- read.table(eval(path_paired), header = FALSE, sep='\t', nrows = 2)
classes <- sapply(tab2rows, class)
classes[5]<-'character'
nrow_paired<-(as.numeric(strsplit(system(paste("wc -l", eval(path_paired), collapse=""), intern=T),' ')[[1]][1]))-1
#Import and format
read.table(eval(path_paired), header=FALSE, sep='\t',colClasses = classes, nrows=nrow_paired)->paired
colnames(paired)<-c('id','node','contig_length','coverage','sequence')
nchar(paired$sequence)->paired$contig_length #corrects the discrepancy between Stacks output contig length and actual contig length
###########################################################################################
if(RC_flag==1){
#Reverse-complement the sequences
cat("done.\n\n\tReverse-complementing the paired-end contigs...")
unname(sapply(paired[,5], rev.comp))->RC_paired
RC_paired->paired$sequence
}
toupper(paired$sequence)->paired$sequence #corrects the occasional case mistakes
cat("done.\n")
###########################################################################################
#Create a vector with the index of duplicated id's in the 'paired' table.
which(duplicated(paired$id)==TRUE)->dup_in_paired #contains the IDs for the duplicated rows in the unsorted 'paired' table
unique(paired[dup_in_paired,1])->dup_id #contains the matching locus IDs
sort(paired$id[!(paired$id %in% dup_id)])->pass_id #extracts IDs for the loci which have only 1 paired-end contig
cat("done.\n\n\tRemoving loci with paired-end coverage under threshold...")
###########################################################################################
#Add the optional filter by paired-contig minimum coverage
paired$id[which(paired$coverage < minDP)]->low_cov_id
pass_id[!(pass_id %in% low_cov_id)]->pass_id
cat("done.\n\n")
###########################################################################################
#Add the optional filter by paired-end-contig maximum length
if(is.null(argsL$max_length)) {
max(nchar(paired$sequence))->max_length }
paired$id[which(paired$contig_length > max_length)]->long_contigs_id
pass_id[!(pass_id %in% long_contigs_id)]->pass_id
###########################################################################################
cat("\tRetained ", length(pass_id), " loci after filtering.\n\n")
###########################################################################################
#To avoid a crash during the parsing, check whether fragment_length will always be long enough
max_scaffold_length <- max(nchar(single$sequence))+max(nchar(paired$sequence))
if(max_scaffold_length >= fragment_length){
stop("Maximum scaffold length longer than fragment length. Set a higher output fragment length or filter out long contigs.")}
###########################################################################################
cat("\tBuilding the scaffolds...\n\n")
#Keep only the paired-end contigs that passed all test:
paired[paired$id %in% pass_id,] -> paired_keep
#Merge the two dataframes:
merge(single, paired_keep, by='id')->merged
#Core loop to actually pull out the sequences matching these IDs and parse them together with Ns to make pseudo-contigs.
system(paste("rm -f", eval(path_out), collapse="")) #remove any eventual previous output file
nrow(merged)->n.loc
pb <- txtProgressBar(min = 0, max = n.loc, style = 3) #set the progress bar
for(locus in 1:n.loc){
read1 <- merged$sequence.x[locus]
read2 <- merged$sequence.y[locus]
contig_length <- merged$contig_length[locus]
coverage <- merged$coverage[locus]
seqid <- paste('>Scaffold_', as.character(merged$id[locus]), '\n', sep='')
if(length_switch==1){
N <- (fragment_length - nchar(read1) - nchar(read2))
} else { N <- 5 }
Ns <- paste(sample('N', N, replace=T), collapse='')
fragment <- paste(seqid,read1, Ns, read2, sep='', collapse='')
write.table(fragment, eval(path_out), append=T, quote=F, row.names=F, col.names=F)
setTxtProgressBar(pb,locus)
}
#cat(paste("\n\n\tOutput in ", eval(path_out), "\n\n", sep="", collapse=""))
###########################################################################################
system(paste("rm -r ", eval(workdir), "/tmp", sep="", collapse=""))
cat('\n\n\tΤΑ ΔΗ ΝΥΝ ΠΑΝΤΑ ΤΕΛΕΙΤΑΙ\n\n')
q(save="no")