Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Update ReadGroup to force uppercase PL. #149

Merged
merged 2 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion data_structures/read_group.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,11 @@ import json # lint-check: ignore
sam = pysam.AlignmentFile(os.environ["BAM"], "rb")

out_file = open(os.environ["OUTFILE"], "w")
json.dump(sam.header.to_dict()["RG"], out_file)
header = sam.header.to_dict()["RG"]
modified_header = []
for read_group in sorted(header, key=lambda d: d['ID']):
modified_header.append({k:v.upper() if k=='PL' else v for k,v in read_group.items()})
adthrasher marked this conversation as resolved.
Show resolved Hide resolved
json.dump(modified_header, out_file)
out_file.close()
END
>>>
Expand Down
40 changes: 31 additions & 9 deletions workflows/dnaseq/dnaseq-core.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ workflow dnaseq_core_experimental {
choices: ["mem", "aln"]
}
use_all_cores: "Use all cores? Recommended for cloud environments."
sample_override: "Value to override the SM field of *every* read group."
}
input {
Array[File] read_one_fastqs_gz
Expand All @@ -41,18 +42,39 @@ workflow dnaseq_core_experimental {
String prefix
String aligner = "mem"
Boolean use_all_cores = false
String? sample_override
}

scatter (rg in read_groups) {
call read_group.ReadGroup_to_string { input: read_group = rg }
}

Array[String] read_groups_bwa = prefix("@RG ", ReadGroup_to_string.stringified_read_group)

scatter (tuple in zip(
zip(read_one_fastqs_gz, read_two_fastqs_gz),
read_groups_bwa
read_groups
)) {
if (defined(sample_override)) {
# override the SM field of every read group
ReadGroup rg = ReadGroup{
ID: tuple.right.ID,
BC: tuple.right.BC,
CN: tuple.right.CN,
DS: tuple.right.DS,
DT: tuple.right.DT,
FO: tuple.right.FO,
KS: tuple.right.KS,
LB: tuple.right.LB,
PG: tuple.right.PG,
PI: tuple.right.PI,
PL: tuple.right.PL,
PM: tuple.right.PM,
PU: tuple.right.PU,
SM: sample_override
}
}

call read_group.ReadGroup_to_string { input:
read_group = select_first([rg, tuple.right])
}

String rg_string = "@RG " + ReadGroup_to_string.stringified_read_group

call util.split_fastq as read_ones { input:
fastq = tuple.left.left,
reads_per_file = reads_per_file
Expand All @@ -76,7 +98,7 @@ workflow dnaseq_core_experimental {
), "\\.([rR][12])\\.", "."),
# find spaces, replace with '\\t' (which must be written as '\\\\t')
# '\\t' is subbed into command blocks as '\t'
read_group = sub(tuple.right, " ", "\\\\t"),
read_group = sub(rg_string, " ", "\\\\t"),
use_all_cores,
}
}
Expand All @@ -92,7 +114,7 @@ workflow dnaseq_core_experimental {
), "\\.([rR][12])\\.", "."),
# find tab literals, replace with '\\t' (which must be written as '\\\\t')
# '\\t' is subbed into command blocks as '\t'
read_group = sub(tuple.right, " ", "\\\\t"),
read_group = sub(rg_string, " ", "\\\\t"),
use_all_cores,
}
}
Expand Down
2 changes: 1 addition & 1 deletion workflows/dnaseq/dnaseq-standard-fastq.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ workflow dnaseq_standard_fastq_experimental {
description: "BWA aligner to use",
choices: ["mem", "aln"]
}
validate_input: "Ensure input BAM is well-formed before beginning harmonization?"
validate_input: "Ensure input FASTQs are well-formed before beginning harmonization?"
use_all_cores: "Use all cores? Recommended for cloud environments."
subsample_n_reads: "Only process a random sampling of `n` reads. Any `n`<=`0` for processing entire input."
}
Expand Down
4 changes: 4 additions & 0 deletions workflows/dnaseq/dnaseq-standard.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ workflow dnaseq_standard_experimental {
validate_input: "Ensure input BAM is well-formed before beginning harmonization?"
use_all_cores: "Use all cores? Recommended for cloud environments."
subsample_n_reads: "Only process a random sampling of `n` reads. Any `n`<=`0` for processing entire input."
sample_override: "Value to override the SM field of *every* read group."
}
input {
File bam
Expand All @@ -39,6 +40,7 @@ workflow dnaseq_standard_experimental {
Boolean validate_input = true
Boolean use_all_cores = false
Int subsample_n_reads = -1
String? sample_override
}

call parse_input { input:
Expand Down Expand Up @@ -78,6 +80,8 @@ workflow dnaseq_standard_experimental {
read_groups = get_ReadGroups.read_groups,
prefix,
aligner,
use_all_cores,
sample_override,
}

output {
Expand Down
Loading