# quality-check-standard.wdl
version 1.1
import "../../data_structures/flag_filter.wdl"
import "../../tools/fastqc.wdl" as fastqc_tasks
import "../../tools/fq.wdl"
import "../../tools/kraken2.wdl"
import "../../tools/librarian.wdl" as libraran_tasks
import "../../tools/md5sum.wdl"
import "../../tools/mosdepth.wdl"
import "../../tools/multiqc.wdl" as multiqc_tasks
import "../../tools/ngsderive.wdl"
import "../../tools/picard.wdl"
import "../../tools/qualimap.wdl"
import "../../tools/samtools.wdl"
import "../../tools/util.wdl"
import "./markdups-post.wdl" as markdups_post_wf
# Comprehensive QC workflow for a single position-sorted BAM.
#
# Flow, as implemented below:
#   1. `parse_input` validates the inputs and resolves the coverage labels.
#   2. The input BAM is checksummed and integrity-checked, then optionally subsampled.
#   3. A battery of BAM-level metrics is collected on the (possibly subsampled) BAM.
#   4. The BAM is converted to FASTQ for Kraken2 (and optionally `librarian`).
#   5. RNA-specific and duplicate-marking analyses run conditionally.
#   6. Every report produced is aggregated by MultiQC.
workflow quality_check {
    meta {
        description: "Performs comprehensive quality checks, aggregating all analyses and metrics into a final MultiQC report."
        help: "Assumes that input BAM is position-sorted."
        external_help: "https://multiqc.info/"
        outputs: {
            bam_checksum: "STDOUT of the `md5sum` command run on the input BAM that has been redirected to a file",
            validate_sam_file: "Validation report produced by `picard ValidateSamFile`. Validation warnings and errors are logged.",
            flagstat_report: "`samtools flagstat` STDOUT redirected to a file. If `mark_duplicates` is `true`, then this result will be generated from the duplicate marked BAM.",
            fastqc_results: "A gzipped tar archive of all FastQC output files",
            instrument_file: "TSV file containing the `ngsderive instrument` report for the input BAM file",
            read_length_file: "TSV file containing the `ngsderive readlen` report for the input BAM file",
            inferred_encoding: "TSV file containing the `ngsderive encoding` report for the input BAM file",
            inferred_endedness: "TSV file containing the `ngsderive endedness` report for the input BAM file",
            alignment_metrics: {
                description: "The text file output of `picard CollectAlignmentSummaryMetrics`",
                external_help: "http://broadinstitute.github.io/picard/picard-metric-definitions.html#AlignmentSummaryMetrics"
            },
            alignment_metrics_pdf: "The PDF file output of `picard CollectAlignmentSummaryMetrics`",
            insert_size_metrics: {
                description: "The text file output of `picard CollectInsertSizeMetrics`. If `mark_duplicates` is `true`, then this result will be generated from the duplicate marked BAM.",
                external_help: "http://broadinstitute.github.io/picard/picard-metric-definitions.html#InsertSizeMetrics"
            },
            insert_size_metrics_pdf: "The PDF file output of `picard CollectInsertSizeMetrics`. If `mark_duplicates` is `true`, then this result will be generated from the duplicate marked BAM.",
            quality_score_distribution_txt: "The text file output of `picard QualityScoreDistribution`",
            quality_score_distribution_pdf: "The PDF file output of `picard QualityScoreDistribution`",
            phred_scores: "Headered TSV file containing PHRED score statistics",
            kraken_report: {
                description: "A Kraken2 summary report",
                external_help: "https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#sample-report-output-format"
            },
            mosdepth_global_dist: "The `$prefix.mosdepth.global.dist.txt` file contains a cumulative distribution indicating the proportion of total bases that were covered for at least a given coverage value. It does this for each chromosome, and for the whole genome.",
            mosdepth_global_summary: "A summary of mean depths per chromosome",
            mosdepth_region_dist: "The `$prefix.mosdepth.region.dist.txt` file contains a cumulative distribution indicating the proportion of total bases in the region(s) defined by the `coverage_bed` that were covered for at least a given coverage value. There will be one file in this array for each `coverage_beds` input file.",
            mosdepth_region_summary: "A summary of mean depths per chromosome and within specified regions per chromosome. There will be one file in this array for each `coverage_beds` input file.",
            multiqc_report: "A gzipped tar archive of all MultiQC output files",
            orig_read_count: "A TSV report containing the original read count before subsampling. Only present if `subsample_n_reads > 0`.",
            kraken_sequences: {
                description: "Detailed Kraken2 output that has been gzipped. Only present if `store_kraken_sequences == true`.",
                external_help: "https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#standard-kraken-output-format"
            },
            comparative_kraken_report: "Kraken2 summary report for only the alternatively filtered reads. Only present if `run_comparative_kraken == true`.",
            comparative_kraken_sequences: "Detailed Kraken2 output for only the alternatively filtered reads. Only present if `run_comparative_kraken == true && store_kraken_sequences == true`.",
            mark_duplicates_metrics: {
                description: "The METRICS_FILE result of `picard MarkDuplicates`. Only present if `mark_duplicates == true && optical_distance > 0`.",
                external_help: "http://broadinstitute.github.io/picard/picard-metric-definitions.html#DuplicationMetrics"
            },
            mosdepth_dups_marked_global_dist: "The `$prefix.mosdepth.global.dist.txt` file contains a cumulative distribution indicating the proportion of total bases that were covered for at least a given coverage value. It does this for each chromosome, and for the whole genome. This file is produced from analyzing the duplicate marked BAM. Only present if `mark_duplicates == true`.",
            mosdepth_dups_marked_global_summary: "A summary of mean depths per chromosome. This file is produced from analyzing the duplicate marked BAM. Only present if `mark_duplicates == true`.",
            mosdepth_dups_marked_region_dist: "The `$prefix.mosdepth.region.dist.txt` file contains a cumulative distribution indicating the proportion of total bases in the region(s) defined by the `coverage_bed` that were covered for at least a given coverage value. There will be one file in this array for each `coverage_beds` input file. This file is produced from analyzing the duplicate marked BAM. Only present if `mark_duplicates == true`.",
            mosdepth_dups_marked_region_summary: "A summary of mean depths per chromosome and within specified regions per chromosome. There will be one file in this array for each `coverage_beds` input file. This file is produced from analyzing the duplicate marked BAM. Only present if `mark_duplicates == true`.",
            inferred_strandedness: "TSV file containing the `ngsderive strandedness` report. Only present if `rna == true`.",
            qualimap_rnaseq_results: "Gzipped tar archive of all QualiMap output files. Only present if `rna == true`.",
            junction_summary: "TSV file containing the `ngsderive junction-annotation` summary. Only present if `rna == true`.",
            junctions: "TSV file containing a detailed list of annotated junctions. Only present if `rna == true`.",
            librarian_report: "A tar archive containing the `librarian` report and raw data. Only present if `run_librarian == true`.",
            intermediate_files: "Any and all files produced as intermediate during pipeline processing. Only output if `output_intermediate_files == true`."
        }
        allowNestedInputs: true
    }

    parameter_meta {
        bam: "Input BAM format file to quality check"
        bam_index: "BAM index file corresponding to the input BAM"
        kraken_db: "Kraken2 database. Can be generated with `../reference/make-qc-reference.wdl`. Must be a tarball without a root directory."
        standard_filter: "Filter to apply to the input BAM while converting to FASTQ, before running Kraken2 and `librarian` (if `run_librarian == true`). This is a `FlagFilter` object (see ../../data_structures/flag_filter.wdl for more information). By default, it will **remove secondary and supplementary reads** from the created FASTQs. **WARNING:** These filters can be tricky to configure; please read documentation thoroughly before changing the defaults. **WARNING:** If you have set `run_librarian` to `true`, we **strongly** recommend leaving this filter at the default value. `librarian` is trained on a specific set of reads, and changing this filter may produce nonsensical results."
        comparative_filter: "Filter to apply to the input BAM while performing a second FASTQ conversion, before running Kraken2 another time. This is a `FlagFilter` object (see ../../data_structures/flag_filter.wdl for more information). By default, it will **remove unmapped, secondary, and supplementary reads** from the created FASTQs. **WARNING** These filters can be tricky to configure; please read documentation thoroughly before changing the defaults."
        gtf: "GTF features file. Gzipped or uncompressed. **Required** for RNA-Seq data."
        multiqc_config: "YAML file for configuring MultiQC"
        extra_multiqc_inputs: "An array of additional files to pass directly into MultiQC"
        coverage_beds: "An array of 3 column BEDs which are passed to the `-b` flag of mosdepth, in order to restrict coverage analysis to select regions. Any regional analysis enabled by this option is _in addition_ to whole genome coverage, which is calculated regardless of this setting. An exon BED and a Coding Sequence BED are examples of regions you may wish to restrict coverage analysis to. Those two BEDs can be created with the workflow in `../reference/make-qc-reference.wdl`."
        coverage_labels: "An array of equal length to `coverage_beds` which determines the prefix label applied to the output files. If omitted, defaults of `regions1`, `regions2`, etc. will be used. If using the BEDs created by `../reference/make-qc-reference.wdl`, the labels [\"exon\", \"CDS\"] are appropriate. Make sure to provide the coverage BEDs **in the same order** as the labels."
        prefix: "Prefix for all results files"
        rna: "Is the sequenced molecule RNA? Enabling this option adds RNA-Seq specific analyses to the workflow. If `true`, a GTF file must be provided. If `false`, the GTF file is ignored."
        mark_duplicates: "Mark duplicates before select analyses? Default behavior is to set this to the value of the `rna` parameter. This is because DNA files are often duplicate marked already, and RNA-Seq files are usually _not_ duplicate marked. If set to `true`, a BAM will be generated and passed to selected downstream analyses. For more details about what analyses are run, review `./markdups-post.wdl`. **WARNING, this duplicate marked BAM is _not_ output by default.** If you would like to output this file, set `output_intermediate_files = true`."
        run_librarian: {
            description: "Run the `librarian` tool to generate a report of the likely Illumina library prep kit used to generate the data. **WARNING** this tool is not guaranteed to work on all data, and may produce nonsensical results. `librarian` was trained on a limited set of GEO read data (Gene Expression Omnibus). This means the input data should be Paired-End, of mouse or human origin, read length should be >50bp, and derived from a library prep kit that is in the `librarian` database. By default, this tool is run when `rna == true`.",
            external_help: "https://f1000research.com/articles/11-1122/v2",
        }
        run_comparative_kraken: "Run Kraken2 a second time with different FASTQ filtering? If `true`, `comparative_filter` is used in a second run of BAM->FASTQ conversion, resulting in differently filtered FASTQs analyzed by Kraken2. If `false`, `comparative_filter` is ignored."
        store_kraken_sequences: "Store the Kraken2 sequences output? This will apply to all runs of Kraken2 (see `parameter_meta.run_comparative_kraken`). **WARNING** these files can be very large."
        output_intermediate_files: "Output intermediate files? FASTQs; if `rna == true` a collated BAM; if `mark_duplicates == true` a duplicate marked BAM, various accessory files like indexes and md5sums; if subsampling was requested _and_ performed then a sampled BAM and associated index. **WARNING, these files can be large.**"
        use_all_cores: "Use all cores? Recommended for cloud environments."
        optical_distance: "Maximum distance between read coordinates to consider them optical duplicates instead of library duplicates (e.g. PCR duplicates). If `mark_duplicates == false`, this parameter is ignored. If `0`, then _optical_ duplicate marking is disabled and only traditional duplicate marking will be performed. Suggested settings of 100 for unpatterned versions of the Illumina platform (e.g. HiSeq) or 2500 for patterned flowcell models (e.g. NovaSeq). Calculation of distance depends on coordinate data embedded in the read names, typically produced by Illumina sequencing machines. Optical duplicate detection will not work on non-standard names without a custom regex for tile-data extraction. Review the `mark_duplicates` task in `../../tools/picard.wdl` for more information."
        subsample_n_reads: "Only process a random sampling of approximately `n` reads. Any `n <= 0` for processing entire input. Subsampling is done probabilistically so the exact number of reads in the output will have some variation."
    }

    input {
        File bam
        File bam_index
        File kraken_db
        File? gtf
        File multiqc_config
            = "https://raw.githubusercontent.com/stjudecloud/workflows/main/workflows/qc/inputs/multiqc_config_hg38.yaml"
        Array[File] extra_multiqc_inputs = []
        Array[File] coverage_beds = []
        Array[String] coverage_labels = []
        FlagFilter standard_filter = {
            "include_if_all": "0x0",
            "exclude_if_any": "0x900",  # 0x100 (secondary) || 0x800 (supplementary)
            "include_if_any": "0x0",
            "exclude_if_all": "0x0",
        }
        # TODO: consider making this an array of FlagFilters similar to coverage_beds
        FlagFilter comparative_filter = {
            "include_if_all": "0x0",
            # 0x4 (unmapped) || 0x100 (secondary) || 0x800 (supplementary)
            "exclude_if_any": "0x904",
            "include_if_any": "0x0",
            "exclude_if_all": "0x0",
        }
        String prefix = basename(bam, ".bam")
        Boolean rna = false
        # Both of the following default to the value of `rna`.
        Boolean mark_duplicates = rna
        Boolean run_librarian = rna
        Boolean run_comparative_kraken = false
        Boolean store_kraken_sequences = false
        Boolean output_intermediate_files = false
        Boolean use_all_cores = false
        Int optical_distance = 0
        Int subsample_n_reads = -1
    }

    # Validate the inputs (GTF required for RNA, label/BED count agreement)
    # and resolve the labels used for regional coverage outputs.
    call parse_input { input:
        gtf_provided=defined(gtf),
        rna,
        coverage_beds_len=length(coverage_beds),
        coverage_labels=coverage_labels,
    }

    # Validate the FlagFilters before they are used for BAM->FASTQ conversion.
    call flag_filter.validate_FlagFilter as kraken_filter_validator { input:
        flags = standard_filter
    }
    if (run_comparative_kraken) {
        call flag_filter.validate_FlagFilter as comparative_kraken_filter_validator { input:
            flags = comparative_filter
        }
    }

    # Checksum and integrity checks on the original input BAM.
    # These are held back until `parse_input` has succeeded.
    call md5sum.compute_checksum after parse_input { input: file=bam }

    call samtools.quickcheck after parse_input { input: bam=bam }
    call util.compression_integrity after parse_input { input: bgzipped_file=bam }

    # Subsampling is only attempted when a positive read count was requested.
    if (subsample_n_reads > 0) {
        call samtools.subsample after quickcheck { input:
            bam=bam,
            prefix=prefix,
            desired_reads=subsample_n_reads,
            use_all_cores=use_all_cores,
        }
        # `sampled_bam` is optional; only index it when it was actually produced.
        if (defined(subsample.sampled_bam)) {
            call samtools.index as subsample_index { input:
                bam=select_first([subsample.sampled_bam, "undefined"]),
                use_all_cores=use_all_cores,
            }
        }
    }

    # If subsampling is disabled **or** input BAM has fewer reads than
    # `subsample_n_reads` this will be `bam`
    File post_subsample_bam = select_first([
        subsample.sampled_bam,
        bam
    ])
    File post_subsample_bam_index = select_first([
        subsample_index.bam_index,
        bam_index
    ])
    # Results derived from a subsampled BAM carry a `.subsampled` infix.
    String post_subsample_prefix = if (defined(subsample.sampled_bam))
        then prefix + ".subsampled"
        else prefix

    # BAM-level metrics. Every call below waits for `quickcheck` to pass.
    call picard.validate_bam after quickcheck { input:
        bam=post_subsample_bam,
        outfile_name=post_subsample_prefix + ".ValidateSamFile.txt",
        succeed_on_errors=true,
        ignore_list=[],
        summary_mode=true,
    }

    call picard.collect_alignment_summary_metrics after quickcheck { input:
        bam=post_subsample_bam,
        prefix=post_subsample_prefix + ".CollectAlignmentSummaryMetrics",
    }
    call picard.quality_score_distribution after quickcheck { input:
        bam=post_subsample_bam,
        prefix=post_subsample_prefix + ".QualityScoreDistribution",
    }
    call fastqc_tasks.fastqc after quickcheck { input:
        bam=post_subsample_bam,
        prefix=post_subsample_prefix + ".fastqc_results",
        use_all_cores=use_all_cores,
    }
    call ngsderive.instrument after quickcheck { input:
        bam=post_subsample_bam,
        outfile_name=post_subsample_prefix + ".instrument.tsv",
    }
    call ngsderive.read_length after quickcheck { input:
        bam=post_subsample_bam,
        bam_index=post_subsample_bam_index,
        outfile_name=post_subsample_prefix + ".readlength.tsv",
    }
    call ngsderive.encoding after quickcheck { input:
        ngs_files=[post_subsample_bam],
        outfile_name=post_subsample_prefix + ".encoding.tsv",
        num_reads=-1,
    }
    call ngsderive.endedness after quickcheck { input:
        bam=post_subsample_bam,
        outfile_name=post_subsample_prefix + ".endedness.tsv",
        lenient=true,
    }
    call util.global_phred_scores after quickcheck { input:
        bam=post_subsample_bam,
        prefix=post_subsample_prefix,
    }

    # BAM->FASTQ conversion using `standard_filter`; feeds Kraken2, `librarian`
    # and (via the collated BAM) Qualimap.
    call samtools.bam_to_fastq after quickcheck
        after kraken_filter_validator
    { input:
        bam = post_subsample_bam,
        bitwise_filter = standard_filter,
        prefix = post_subsample_prefix,
        # RNA needs a collated BAM for Qualimap
        # DNA can skip the associated storage costs
        retain_collated_bam = rna,
        # disabling fast_mode enables writing of secondary and supplementary alignments
        # to the collated BAM when processing RNA.
        # Those alignments are used downstream by Qualimap.
        fast_mode = (!rna),
        paired_end = true,  # matches default but prevents user from overriding
        interleaved = false,  # matches default but prevents user from overriding
        use_all_cores = use_all_cores,
    }

    # Lint the FASTQs; Kraken2 and `librarian` only start after the lint finishes.
    call fq.fqlint { input:
        read_one_fastq = select_first([bam_to_fastq.read_one_fastq_gz, "undefined"]),
        read_two_fastq = select_first([bam_to_fastq.read_two_fastq_gz, "undefined"]),
    }
    call kraken2.kraken after fqlint { input:
        read_one_fastq_gz
            = select_first([bam_to_fastq.read_one_fastq_gz, "undefined"]),
        read_two_fastq_gz
            = select_first([bam_to_fastq.read_two_fastq_gz, "undefined"]),
        db = kraken_db,
        store_sequences = store_kraken_sequences,
        prefix = post_subsample_prefix,
        use_all_cores = use_all_cores,
    }
    if (run_librarian) {
        call libraran_tasks.librarian after fqlint { input:
            read_one_fastq = select_first([bam_to_fastq.read_one_fastq_gz, "undefined"]),
        }
    }

    # Optional second FASTQ conversion + Kraken2 run using `comparative_filter`.
    if (run_comparative_kraken) {
        call samtools.bam_to_fastq as alt_filtered_fastq after quickcheck
            after comparative_kraken_filter_validator
        { input:
            bam = post_subsample_bam,
            bitwise_filter = comparative_filter,
            prefix = post_subsample_prefix + ".alt_filtered",
            # matches default but prevents user from overriding
            # If the user wants a collated BAM, they should save the one
            # from the first bam_to_fastq call.
            retain_collated_bam = false,
            # matches default but prevents user from overriding
            # Since the only output here is FASTQs, we can enable fast mode.
            # This discards secondary and supplementary alignments, which should not
            # be converted to FASTQs. (Is that true?)
            fast_mode = true,
            paired_end = true,  # matches default but prevents user from overriding
            interleaved = false,  # matches default but prevents user from overriding
            use_all_cores = use_all_cores,
        }
        call fq.fqlint as alt_filtered_fqlint { input:
            read_one_fastq
                = select_first([alt_filtered_fastq.read_one_fastq_gz, "undefined"]),
            read_two_fastq
                = select_first([alt_filtered_fastq.read_two_fastq_gz, "undefined"]),
        }
        call kraken2.kraken as comparative_kraken after alt_filtered_fqlint { input:
            read_one_fastq_gz
                = select_first([alt_filtered_fastq.read_one_fastq_gz, "undefined"]),
            read_two_fastq_gz
                = select_first([alt_filtered_fastq.read_two_fastq_gz, "undefined"]),
            db = kraken_db,
            store_sequences = store_kraken_sequences,
            prefix = post_subsample_prefix + ".alt_filtered",
            use_all_cores = use_all_cores,
        }
    }

    # Whole-genome coverage is always computed; regional coverage is computed
    # once per (BED, label) pair.
    call mosdepth.coverage as wg_coverage after quickcheck { input:
        bam=post_subsample_bam,
        bam_index=post_subsample_bam_index,
        prefix=post_subsample_prefix + ".whole_genome",
    }
    scatter(coverage_pair in zip(coverage_beds, parse_input.labels)) {
        call mosdepth.coverage as regions_coverage after quickcheck { input:
            bam=post_subsample_bam,
            bam_index=post_subsample_bam_index,
            coverage_bed=coverage_pair.left,
            prefix=post_subsample_prefix + "." + coverage_pair.right,
        }
    }

    # RNA-Seq specific analyses. `parse_input` fails when `rna` is `true`
    # without a GTF, so the "undefined" fallbacks below are never selected.
    if (rna) {
        call ngsderive.junction_annotation after quickcheck { input:
            bam=post_subsample_bam,
            bam_index=post_subsample_bam_index,
            gene_model=select_first([gtf, "undefined"]),
            prefix=post_subsample_prefix,
        }
        call ngsderive.strandedness after quickcheck { input:
            bam=post_subsample_bam,
            bam_index=post_subsample_bam_index,
            gene_model=select_first([gtf, "undefined"]),
            outfile_name=post_subsample_prefix + ".strandedness.tsv",
        }
        # Qualimap consumes the collated BAM retained by `bam_to_fastq`.
        call qualimap.rnaseq as qualimap_rnaseq { input:
            bam=select_first([bam_to_fastq.collated_bam, "undefined"]),
            prefix=post_subsample_prefix + ".qualimap_rnaseq_results",
            gtf=select_first([gtf, "undefined"]),
            name_sorted=true,
            paired_end=true,  # matches default but prevents user from overriding
        }
    }

    if (mark_duplicates) {
        call picard.mark_duplicates as markdups after quickcheck { input:
            bam = post_subsample_bam,
            create_bam = true,
            prefix = post_subsample_prefix + ".MarkDuplicates",
            optical_distance = optical_distance,
        }
        # The metrics file is only exposed when optical duplicate marking is enabled.
        if (optical_distance > 0) {
            File markdups_metrics = markdups.mark_duplicates_metrics
        }

        call markdups_post_wf.markdups_post { input:
            markdups_bam = select_first([
                markdups.duplicate_marked_bam,
                "undefined",
            ]),
            markdups_bam_index = select_first([
                markdups.duplicate_marked_bam_index,
                "undefined",
            ]),
            coverage_beds = coverage_beds,
            coverage_labels = parse_input.labels,
            prefix = post_subsample_prefix + ".MarkDuplicates",
        }
    }
    if (!mark_duplicates) {
        # These analyses are called in the markdups_post workflow.
        # They should still be run if duplicates were not marked.
        call picard.collect_insert_size_metrics after quickcheck { input:
            bam = post_subsample_bam,
            prefix = post_subsample_prefix + ".CollectInsertSizeMetrics",
        }
        call samtools.flagstat after quickcheck { input:
            bam = post_subsample_bam,
            outfile_name = post_subsample_prefix + ".flagstat.txt",
        }
    }

    # Aggregate every report that was produced. Outputs of calls that did not
    # run are undefined and are dropped by `select_all`.
    call multiqc_tasks.multiqc { input:
        input_files=select_all(flatten([
            [
                validate_bam.validate_report,
                flagstat.flagstat_report,
                markdups_post.flagstat_report,
                instrument.instrument_file,
                read_length.read_length_file,
                encoding.encoding_file,
                endedness.endedness_file,
                fastqc.raw_data,
                collect_alignment_summary_metrics.alignment_metrics,
                collect_insert_size_metrics.insert_size_metrics,
                markdups_post.insert_size_metrics,
                quality_score_distribution.quality_score_distribution_txt,
                kraken.report,
                librarian.raw_data,
                comparative_kraken.report,
                wg_coverage.summary,
                wg_coverage.global_dist,
                global_phred_scores.phred_scores,
                subsample.orig_read_count,
                markdups_post.mosdepth_global_summary,
                markdups_post.mosdepth_global_dist,
                strandedness.strandedness_file,
                junction_annotation.junction_summary,
                qualimap_rnaseq.raw_summary,
                qualimap_rnaseq.raw_coverage,
            ],
            regions_coverage.summary,
            regions_coverage.region_dist,
            select_first([markdups_post.mosdepth_region_summary, []]),
            select_first([markdups_post.mosdepth_region_dist, []]),
            (
                if (mark_duplicates && optical_distance > 0)
                then [markdups.mark_duplicates_metrics]
                else []
            ),
            extra_multiqc_inputs,
        ])),
        config=multiqc_config,
        prefix=post_subsample_prefix + ".multiqc",
    }

    # Bundle the intermediate files only when the user asked for them.
    if (output_intermediate_files) {
        IntermediateFiles optional_files = {
            "sampled_bam": subsample.sampled_bam,
            "sampled_bam_index": subsample_index.bam_index,
            "collated_bam": bam_to_fastq.collated_bam,
            "read_one_fastq_gz": bam_to_fastq.read_one_fastq_gz,
            "read_two_fastq_gz": bam_to_fastq.read_two_fastq_gz,
            "singleton_reads_fastq_gz": bam_to_fastq.singleton_reads_fastq_gz,
            "alt_filtered_read_one_fastq_gz": alt_filtered_fastq.read_one_fastq_gz,
            "alt_filtered_read_two_fastq_gz": alt_filtered_fastq.read_two_fastq_gz,
            "alt_filtered_singleton_reads_fastq_gz": alt_filtered_fastq.singleton_reads_fastq_gz,
            "duplicate_marked_bam": markdups.duplicate_marked_bam,
            "duplicate_marked_bam_index": markdups.duplicate_marked_bam_index,
            "duplicate_marked_bam_md5": markdups.duplicate_marked_bam_md5,
        }
    }

    output {
        File bam_checksum = compute_checksum.md5sum
        File validate_sam_file = validate_bam.validate_report
        # Exactly one of the two flagstat sources runs, depending on `mark_duplicates`;
        # the same holds for the insert size metrics below.
        File flagstat_report = select_first([
            markdups_post.flagstat_report,
            flagstat.flagstat_report
        ])
        File fastqc_results = fastqc.results
        File instrument_file = instrument.instrument_file
        File read_length_file = read_length.read_length_file
        File inferred_encoding = encoding.encoding_file
        File inferred_endedness = endedness.endedness_file
        File alignment_metrics = collect_alignment_summary_metrics.alignment_metrics
        File alignment_metrics_pdf
            = collect_alignment_summary_metrics.alignment_metrics_pdf
        File insert_size_metrics = select_first([
            markdups_post.insert_size_metrics,
            collect_insert_size_metrics.insert_size_metrics
        ])
        File insert_size_metrics_pdf = select_first([
            markdups_post.insert_size_metrics_pdf,
            collect_insert_size_metrics.insert_size_metrics_pdf
        ])
        File quality_score_distribution_txt
            = quality_score_distribution.quality_score_distribution_txt
        File quality_score_distribution_pdf
            = quality_score_distribution.quality_score_distribution_pdf
        File phred_scores = global_phred_scores.phred_scores
        File kraken_report = kraken.report
        File mosdepth_global_dist = wg_coverage.global_dist
        File mosdepth_global_summary = wg_coverage.summary
        Array[File] mosdepth_region_dist = select_all(regions_coverage.region_dist)
        Array[File] mosdepth_region_summary = regions_coverage.summary
        File multiqc_report = multiqc.multiqc_report
        File? orig_read_count = subsample.orig_read_count
        File? kraken_sequences = kraken.sequences
        File? comparative_kraken_report = comparative_kraken.report
        File? comparative_kraken_sequences = comparative_kraken.sequences
        File? mosdepth_dups_marked_global_dist = markdups_post.mosdepth_global_dist
        File? mosdepth_dups_marked_global_summary = markdups_post.mosdepth_global_summary
        Array[File]? mosdepth_dups_marked_region_summary
            = markdups_post.mosdepth_region_summary
        Array[File?]? mosdepth_dups_marked_region_dist
            = markdups_post.mosdepth_region_dist
        File? mark_duplicates_metrics = markdups_metrics
        File? inferred_strandedness = strandedness.strandedness_file
        File? qualimap_rnaseq_results = qualimap_rnaseq.results
        File? junction_summary = junction_annotation.junction_summary
        File? junctions = junction_annotation.junctions
        File? librarian_report = librarian.report
        IntermediateFiles? intermediate_files = optional_files
    }
}
# Validates the `quality_check` inputs and resolves the coverage labels.
#
# Fails (exit code 1) when `rna` is true without a GTF, or when a non-empty
# `coverage_labels` array differs in length from the coverage BEDs.
# When no labels are supplied, `regions1`, `regions2`, ... are generated,
# one per coverage BED.
task parse_input {
    meta {
        description: "Parses and validates the `quality_check` workflow's provided inputs"
        outputs: {
            labels: "An array of labels to use on the result coverage files associated with each coverage BED"
        }
    }

    parameter_meta {
        coverage_labels: "An array of equal length to `coverage_beds_len` which determines the prefix label applied to coverage output files. If an empty array is supplied, defaults of `regions1`, `regions2`, etc. will be used."
        rna: "Is the sequenced molecule RNA?"
        gtf_provided: "Was a GTF supplied by the user? Must be `true` if `rna == true`."
        coverage_beds_len: "Length of the provided `coverage_beds` array"
    }

    input {
        Array[String] coverage_labels
        Boolean rna
        Boolean gtf_provided
        Int coverage_beds_len
    }

    Int coverage_labels_len = length(coverage_labels)

    command <<<
        EXITCODE=0

        # Always create the labels file. With zero coverage BEDs and zero
        # labels the loop below never runs, and the task output would
        # otherwise point at a file that does not exist.
        : > labels.txt

        if ~{rna} && ! ~{gtf_provided}; then
            >&2 echo "Must supply a GTF if 'rna == true'"
            EXITCODE=1
        fi

        if [ "~{coverage_labels_len}" = 0 ]; then
            # No labels supplied: generate one generic label per coverage BED.
            for (( i=1; i<=~{coverage_beds_len}; i++ )); do
                echo regions$i >> labels.txt
            done
        elif [ "~{coverage_labels_len}" != "~{coverage_beds_len}" ]; then
            >&2 echo "Unequal amount of coverage BEDs and coverage labels."
            >&2 echo "If no labels are provided, generic labels will be created."
            >&2 echo "Otherwise the exact same amount must be supplied."
            EXITCODE=1
        else
            # Labels were supplied and match the BED count: pass them through.
            echo "~{sep('\n', coverage_labels)}" >> labels.txt
        fi

        exit $EXITCODE
    >>>

    output {
        Array[String] labels = read_lines("labels.txt")
    }

    runtime {
        memory: "4 GB"
        disk: "10 GB"
        container: 'ghcr.io/stjudecloud/util:1.3.0'
        maxRetries: 1
    }
}
# Bundle of optional intermediate files emitted by the `quality_check` workflow
# when `output_intermediate_files == true`. Every member is optional because the
# call that produces it may not have run (or may not have produced the file).
struct IntermediateFiles {
    # From the `subsample` call and the index built on its sampled BAM
    File? sampled_bam
    File? sampled_bam_index

    # From the `bam_to_fastq` call that uses `standard_filter`
    File? collated_bam
    File? read_one_fastq_gz
    File? read_two_fastq_gz
    File? singleton_reads_fastq_gz

    # From the `alt_filtered_fastq` call that uses `comparative_filter`
    File? alt_filtered_read_one_fastq_gz
    File? alt_filtered_read_two_fastq_gz
    File? alt_filtered_singleton_reads_fastq_gz

    # From the `markdups` call
    File? duplicate_marked_bam
    File? duplicate_marked_bam_index
    File? duplicate_marked_bam_md5
}