refactor: Update ReadGroup to force uppercase PL. (#149)

* refactor: Update ReadGroup to force uppercase PL. Add sample_override to dnaseq-standard.
* fix: Explicitly sort read groups by RG ID. This matches the sorting from bam-to-fastq.
stjudecloud · May 23, 2024 · 2970ed4 · 2970ed4
1 parent fde88f1
commit 2970ed4
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 11 deletions.
diff --git a/data_structures/read_group.wdl b/data_structures/read_group.wdl
@@ -123,7 +123,11 @@ import json  # lint-check: ignore
 sam = pysam.AlignmentFile(os.environ["BAM"], "rb")
 
 out_file = open(os.environ["OUTFILE"], "w")
-json.dump(sam.header.to_dict()["RG"], out_file)
+header = sam.header.to_dict()["RG"]
+modified_header = []
+for read_group in sorted(header, key=lambda d: d['ID']):
+  modified_header.append({k:v.upper() if k=='PL' else v for k,v in read_group.items()})
+json.dump(modified_header, out_file)
 out_file.close()
 END
     >>>

diff --git a/workflows/dnaseq/dnaseq-core.wdl b/workflows/dnaseq/dnaseq-core.wdl
@@ -31,6 +31,7 @@ workflow dnaseq_core_experimental {
             choices: ["mem", "aln"]
         }
         use_all_cores: "Use all cores? Recommended for cloud environments."
+        sample_override: "Value to override the SM field of *every* read group."
     }
     input {
         Array[File] read_one_fastqs_gz
@@ -41,18 +42,39 @@ workflow dnaseq_core_experimental {
         String prefix
         String aligner = "mem"
         Boolean use_all_cores = false
+        String? sample_override
     }
 
-    scatter (rg in read_groups) {
-        call read_group.ReadGroup_to_string { input: read_group = rg }
-    }
-
-    Array[String] read_groups_bwa = prefix("@RG ", ReadGroup_to_string.stringified_read_group)
-
     scatter (tuple in zip(
         zip(read_one_fastqs_gz, read_two_fastqs_gz),
-        read_groups_bwa
+        read_groups
     )) {
+        if (defined(sample_override)) {
+            # override the SM field of every read group
+            ReadGroup rg = ReadGroup{
+                ID: tuple.right.ID,
+                BC: tuple.right.BC,
+                CN: tuple.right.CN,
+                DS: tuple.right.DS,
+                DT: tuple.right.DT,
+                FO: tuple.right.FO,
+                KS: tuple.right.KS,
+                LB: tuple.right.LB,
+                PG: tuple.right.PG,
+                PI: tuple.right.PI,
+                PL: tuple.right.PL,
+                PM: tuple.right.PM,
+                PU: tuple.right.PU,
+                SM: sample_override
+            }
+        }
+
+        call read_group.ReadGroup_to_string { input:
+            read_group = select_first([rg, tuple.right])
+        }
+
+        String rg_string = "@RG " + ReadGroup_to_string.stringified_read_group
+
         call util.split_fastq as read_ones { input:
             fastq = tuple.left.left,
             reads_per_file = reads_per_file
@@ -76,7 +98,7 @@ workflow dnaseq_core_experimental {
                     ), "\\.([rR][12])\\.", "."),
                     # find spaces, replace with '\\t' (which must be written as '\\\\t')
                     # '\\t' is subbed into command blocks as '\t'
-                    read_group = sub(tuple.right, " ", "\\\\t"),
+                    read_group = sub(rg_string, " ", "\\\\t"),
                     use_all_cores,
                 }
             }
@@ -92,7 +114,7 @@ workflow dnaseq_core_experimental {
                     ), "\\.([rR][12])\\.", "."),
                     # find tab literals, replace with '\\t' (which must be written as '\\\\t')
                     # '\\t' is subbed into command blocks as '\t'
-                    read_group = sub(tuple.right, " ", "\\\\t"),
+                    read_group = sub(rg_string, " ", "\\\\t"),
                     use_all_cores,
                 }
             }

diff --git a/workflows/dnaseq/dnaseq-standard-fastq.wdl b/workflows/dnaseq/dnaseq-standard-fastq.wdl
@@ -29,7 +29,7 @@ workflow dnaseq_standard_fastq_experimental {
             description: "BWA aligner to use",
             choices: ["mem", "aln"]
         }
-        validate_input: "Ensure input BAM is well-formed before beginning harmonization?"
+        validate_input: "Ensure input FASTQs are well-formed before beginning harmonization?"
         use_all_cores: "Use all cores? Recommended for cloud environments."
         subsample_n_reads: "Only process a random sampling of `n` reads. Any `n`<=`0` for processing entire input."
     }

diff --git a/workflows/dnaseq/dnaseq-standard.wdl b/workflows/dnaseq/dnaseq-standard.wdl
@@ -29,6 +29,7 @@ workflow dnaseq_standard_experimental {
         validate_input: "Ensure input BAM is well-formed before beginning harmonization?"
         use_all_cores: "Use all cores? Recommended for cloud environments."
         subsample_n_reads: "Only process a random sampling of `n` reads. Any `n`<=`0` for processing entire input."
+        sample_override: "Value to override the SM field of *every* read group."
     }
     input {
         File bam
@@ -39,6 +40,7 @@ workflow dnaseq_standard_experimental {
         Boolean validate_input = true
         Boolean use_all_cores = false
         Int subsample_n_reads = -1
+        String? sample_override
     }
 
     call parse_input { input:
@@ -78,6 +80,8 @@ workflow dnaseq_standard_experimental {
         read_groups = get_ReadGroups.read_groups,
         prefix,
         aligner,
+        use_all_cores,
+        sample_override,
     }
 
     output {