Skip to content

Commit

Permalink
refactor: Update ReadGroup to force uppercase PL. (#149)
Browse files Browse the repository at this point in the history
* refactor: Update ReadGroup to force uppercase PL. Add sample_override to dnaseq-standard

* fix: explicitly sort by RG ID. This matches the sorting from bam-to-fastq.
  • Loading branch information
adthrasher committed May 23, 2024
1 parent fde88f1 commit 2970ed4
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 11 deletions.
6 changes: 5 additions & 1 deletion data_structures/read_group.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,11 @@ import json # lint-check: ignore
sam = pysam.AlignmentFile(os.environ["BAM"], "rb")

out_file = open(os.environ["OUTFILE"], "w")
json.dump(sam.header.to_dict()["RG"], out_file)
header = sam.header.to_dict()["RG"]
modified_header = []
for read_group in sorted(header, key=lambda d: d['ID']):
modified_header.append({k:v.upper() if k=='PL' else v for k,v in read_group.items()})
json.dump(modified_header, out_file)
out_file.close()
END
>>>
Expand Down
40 changes: 31 additions & 9 deletions workflows/dnaseq/dnaseq-core.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ workflow dnaseq_core_experimental {
choices: ["mem", "aln"]
}
use_all_cores: "Use all cores? Recommended for cloud environments."
sample_override: "Value to override the SM field of *every* read group."
}
input {
Array[File] read_one_fastqs_gz
Expand All @@ -41,18 +42,39 @@ workflow dnaseq_core_experimental {
String prefix
String aligner = "mem"
Boolean use_all_cores = false
String? sample_override
}

scatter (rg in read_groups) {
call read_group.ReadGroup_to_string { input: read_group = rg }
}

Array[String] read_groups_bwa = prefix("@RG ", ReadGroup_to_string.stringified_read_group)

scatter (tuple in zip(
zip(read_one_fastqs_gz, read_two_fastqs_gz),
read_groups_bwa
read_groups
)) {
if (defined(sample_override)) {
# override the SM field of every read group
ReadGroup rg = ReadGroup{
ID: tuple.right.ID,
BC: tuple.right.BC,
CN: tuple.right.CN,
DS: tuple.right.DS,
DT: tuple.right.DT,
FO: tuple.right.FO,
KS: tuple.right.KS,
LB: tuple.right.LB,
PG: tuple.right.PG,
PI: tuple.right.PI,
PL: tuple.right.PL,
PM: tuple.right.PM,
PU: tuple.right.PU,
SM: sample_override
}
}

call read_group.ReadGroup_to_string { input:
read_group = select_first([rg, tuple.right])
}

String rg_string = "@RG " + ReadGroup_to_string.stringified_read_group

call util.split_fastq as read_ones { input:
fastq = tuple.left.left,
reads_per_file = reads_per_file
Expand All @@ -76,7 +98,7 @@ workflow dnaseq_core_experimental {
), "\\.([rR][12])\\.", "."),
# find spaces, replace with '\\t' (which must be written as '\\\\t')
# '\\t' is subbed into command blocks as '\t'
read_group = sub(tuple.right, " ", "\\\\t"),
read_group = sub(rg_string, " ", "\\\\t"),
use_all_cores,
}
}
Expand All @@ -92,7 +114,7 @@ workflow dnaseq_core_experimental {
), "\\.([rR][12])\\.", "."),
# find spaces, replace with '\\t' (which must be written as '\\\\t')
# '\\t' is subbed into command blocks as '\t'
read_group = sub(tuple.right, " ", "\\\\t"),
read_group = sub(rg_string, " ", "\\\\t"),
use_all_cores,
}
}
Expand Down
2 changes: 1 addition & 1 deletion workflows/dnaseq/dnaseq-standard-fastq.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ workflow dnaseq_standard_fastq_experimental {
description: "BWA aligner to use",
choices: ["mem", "aln"]
}
validate_input: "Ensure input BAM is well-formed before beginning harmonization?"
validate_input: "Ensure input FASTQs are well-formed before beginning harmonization?"
use_all_cores: "Use all cores? Recommended for cloud environments."
subsample_n_reads: "Only process a random sampling of `n` reads. Any `n`<=`0` for processing entire input."
}
Expand Down
4 changes: 4 additions & 0 deletions workflows/dnaseq/dnaseq-standard.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ workflow dnaseq_standard_experimental {
validate_input: "Ensure input BAM is well-formed before beginning harmonization?"
use_all_cores: "Use all cores? Recommended for cloud environments."
subsample_n_reads: "Only process a random sampling of `n` reads. Any `n`<=`0` for processing entire input."
sample_override: "Value to override the SM field of *every* read group."
}
input {
File bam
Expand All @@ -39,6 +40,7 @@ workflow dnaseq_standard_experimental {
Boolean validate_input = true
Boolean use_all_cores = false
Int subsample_n_reads = -1
String? sample_override
}

call parse_input { input:
Expand Down Expand Up @@ -78,6 +80,8 @@ workflow dnaseq_standard_experimental {
read_groups = get_ReadGroups.read_groups,
prefix,
aligner,
use_all_cores,
sample_override,
}

output {
Expand Down

0 comments on commit 2970ed4

Please sign in to comment.