From 707ef772e08eb951ef631f54748bcf372994fb21 Mon Sep 17 00:00:00 2001 From: Felix Wiegand Date: Wed, 7 Jul 2021 21:49:48 +0200 Subject: [PATCH] feat: New subcommand for plotting bam files (#156) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add new subcommand plot-bam * Minor fixes * Visual updates * Remove debug * Format * Support for multiple bam files * Format * Plot BAM files below each other * Update README.md * Minor clippy fixes * Write output to stdout * Format * Add navigation for samples * Bump xlsxwriter version * Update Cargo.toml * Update Cargo.toml * Update Cargo.toml * Fix cli help text and wrong default value * pack chrom, start, stop into Region struct instead; replace Path arg types with P: AsRef * Use context for anyhow error message * Update README.md Co-authored-by: Till Hartmann Co-authored-by: Johannes Köster --- README.md | 8 +- src/bam/mod.rs | 1 + src/bam/plot/bam_plot.html.tera | 86 +++++++++++++++++++ src/bam/plot/mod.rs | 1 + src/bam/plot/plot_bam.rs | 50 +++++++++++ .../report/table_report/alignment_reader.rs | 16 ++-- .../table_report/create_report_table.rs | 67 ++++++++------- src/bcf/report/table_report/fasta_reader.rs | 13 ++- src/bcf/report/table_report/static_reader.rs | 39 ++++++--- src/cli.rs | 25 +++++- src/common.rs | 25 ++++++ src/main.rs | 6 ++ 12 files changed, 274 insertions(+), 63 deletions(-) create mode 100644 src/bam/plot/bam_plot.html.tera create mode 100644 src/bam/plot/mod.rs create mode 100644 src/bam/plot/plot_bam.rs diff --git a/README.md b/README.md index 4a4b0b18..a00f0e6a 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Rust-Bio-Tools provides a command `rbt`, which currently supports the following * a tool to generate interactive HTML based reports that offer multiple plots visualizing the provided genomics data in VCF and BAM format (`rbt vcf-report`) * a tool to generate an interactive HTML based report from a csv file including visualizations (`rbt csv-report`) * a tool for splitting VCF/BCF files into N equal chunks, including BND support (`rbt vcf-split`) +* a tool to generate visualizations for a specific region of one or multiple BAM files with a given reference contained in a single HTML file (`rbt plot-bam`) Further functionality is added as it is needed by the authors. Check out the [Contributing](#Contributing) section if you want contribute anything yourself. For a list of changes, take a look at the [CHANGELOG](CHANGELOG.md). @@ -63,6 +64,7 @@ This should format, check and lint your code when committing. ## Authors -* Johannes Köster (https://koesterlab.github.io) -* Felix Mölder -* Henning Timm +* [Johannes Köster](https://github.com/johanneskoester) (https://koesterlab.github.io) +* [Felix Mölder](https://github.com/FelixMoelder) +* [Henning Timm](https://github.com/HenningTimm) +* [Felix Wiegand](https://github.com/fxwiegand) diff --git a/src/bam/mod.rs b/src/bam/mod.rs index e5d400f2..a2238a16 100644 --- a/src/bam/mod.rs +++ b/src/bam/mod.rs @@ -1,3 +1,4 @@ //! Tools that work on BAM files pub mod collapse_reads_to_fragments; pub mod depth; +pub mod plot; diff --git a/src/bam/plot/bam_plot.html.tera b/src/bam/plot/bam_plot.html.tera new file mode 100644 index 00000000..a1623cd5 --- /dev/null +++ b/src/bam/plot/bam_plot.html.tera @@ -0,0 +1,86 @@ + + + + rbt bam report + + + + + + + + + + + + + + + + + +{% for plot in plots %}
+
+
{% endfor %} + + + diff --git a/src/bam/plot/mod.rs b/src/bam/plot/mod.rs new file mode 100644 index 00000000..53eebc8d --- /dev/null +++ b/src/bam/plot/mod.rs @@ -0,0 +1 @@ +pub mod plot_bam; diff --git a/src/bam/plot/plot_bam.rs b/src/bam/plot/plot_bam.rs new file mode 100644 index 00000000..5b902658 --- /dev/null +++ b/src/bam/plot/plot_bam.rs @@ -0,0 +1,50 @@ +use crate::bcf::report::table_report::create_report_table::create_report_data; +use crate::bcf::report::table_report::create_report_table::manipulate_json; +use crate::common::Region; +use anyhow::Result; +use chrono::{DateTime, Local}; +use itertools::Itertools; +use std::io; +use std::io::Write; +use std::path::Path; +use tera::{Context, Tera}; + +pub(crate) fn plot_bam>( + bam_paths: &[P], + fasta_path: P, + region: &Region, + max_read_depth: u32, +) -> Result<()> { + let mut plots = Vec::new(); + + let Region { target, start, end } = region.clone(); + for bam_path in bam_paths { + let (content, max_rows) = + create_report_data(&fasta_path, None, bam_path, region, max_read_depth)?; + let visualization = manipulate_json(content, start, end, max_rows)?; + + plots.push(visualization); + } + + let bams = bam_paths + .iter() + .map(|b| b.as_ref().iter().last().unwrap().to_str().unwrap()) + .collect_vec(); + + let mut templates = Tera::default(); + templates.add_raw_template("bam_plot.html.tera", include_str!("bam_plot.html.tera"))?; + let mut context = Context::new(); + let local: DateTime = Local::now(); + context.insert("time", &local.format("%a %b %e %T %Y").to_string()); + context.insert("version", &env!("CARGO_PKG_VERSION")); + context.insert("plots", &plots); + context.insert("bams", &bams); + context.insert("chrom", &target); + context.insert("start", &start); + context.insert("end", &end); + + let html = templates.render("bam_plot.html.tera", &context)?; + io::stdout().write_all(html.as_bytes())?; + + Ok(()) +} diff --git a/src/bcf/report/table_report/alignment_reader.rs b/src/bcf/report/table_report/alignment_reader.rs index 0d3ddc12..f98d732a 100644 --- a/src/bcf/report/table_report/alignment_reader.rs +++ b/src/bcf/report/table_report/alignment_reader.rs @@ -2,6 +2,7 @@ extern crate rust_htslib; use self::rust_htslib::bam::FetchDefinition; use crate::bcf::report::table_report::fasta_reader::read_fasta; +use crate::common::Region; use anyhow::Result; use rust_htslib::bam::record::CigarStringView; use rust_htslib::{bam, bam::Read}; @@ -73,8 +74,10 @@ pub fn decode_flags(code: u16) -> Vec { read_map } -pub fn read_indexed_bam(path: &Path, chrom: String, from: u64, to: u64) -> Result> { +pub fn read_indexed_bam>(path: P, region: &Region) -> Result> { let mut bam = bam::IndexedReader::from_path(&path)?; + let chrom = ®ion.target; + let (from, to) = (region.start, region.end); let tid = bam.header().tid(chrom.as_bytes()).unwrap() as i32; let mut alignments: Vec = Vec::new(); @@ -133,17 +136,16 @@ fn make_alignment(record: &bam::Record) -> Alignment { } } -pub fn make_nucleobases( - fasta_path: &Path, - chrom: String, +pub fn make_nucleobases>( + fasta_path: P, + region: &Region, snippets: Vec, - from: u64, - to: u64, ) -> Result<(Vec, Vec)> { let mut bases: Vec = Vec::new(); let mut matches: Vec = Vec::new(); - let ref_bases = read_fasta(fasta_path, chrom, from, to, false)?; + let ref_bases = read_fasta(fasta_path, region, false)?; + let (from, to) = (region.start, region.end); for snippet in snippets { let mut cigar_offset: i64 = 0; diff --git a/src/bcf/report/table_report/create_report_table.rs b/src/bcf/report/table_report/create_report_table.rs index 143408ae..a0750534 100644 --- a/src/bcf/report/table_report/create_report_table.rs +++ b/src/bcf/report/table_report/create_report_table.rs @@ -1,5 +1,6 @@ use crate::bcf::report::table_report::fasta_reader::{get_fasta_lengths, read_fasta}; use crate::bcf::report::table_report::static_reader::{get_static_reads, Variant}; +use crate::common::Region; use anyhow::Context as AnyhowContext; use anyhow::Result; use chrono::{DateTime, Local}; @@ -206,9 +207,9 @@ pub(crate) fn make_table_report( .get(&field.to_owned()) .context({ format!( - "No field named {} found. Please only use VEP-annotated VCF-files.", - field - ) + "No field named {} found. Please only use VEP-annotated VCF-files.", + field + ) }) .unwrap()], ) @@ -222,7 +223,7 @@ pub(crate) fn make_table_report( warn!("Warning! Found allele without SYMBOL or Gene field in record at {}:{}. Using HGVSg instead.", &chrom, variant.pos()); get_field("HGVSg")? } else { - warn!("Warning! Found allele without SYMBOL, Gene or HGVSg field in record at {}:{}. This record will be skipped!", &chrom, variant.pos()); + warn!("Warning! Found allele without SYMBOL, Gene or HGVSg field in record at {}:{}. This record will be skipped!", &chrom, variant.pos()); continue; }; genes.push(gene.to_owned()); @@ -325,11 +326,13 @@ pub(crate) fn make_table_report( if pos < 75 { let (content, max_rows) = create_report_data( fasta_path, - var.clone(), + Some(var.clone()), bam_path, - chrom.clone(), - 0, - end_position as u64 + 75, + &Region { + target: chrom.clone(), + start: 0, + end: end_position as u64 + 75, + }, max_read_depth, )?; visualization = @@ -337,11 +340,13 @@ pub(crate) fn make_table_report( } else if pos + 75 >= fasta_length as i64 { let (content, max_rows) = create_report_data( fasta_path, - var.clone(), + Some(var.clone()), bam_path, - chrom.clone(), - pos as u64 - 75, - fasta_length - 1, + &Region { + target: chrom.clone(), + start: pos as u64 - 75, + end: fasta_length - 1, + }, max_read_depth, )?; visualization = @@ -349,11 +354,13 @@ pub(crate) fn make_table_report( } else { let (content, max_rows) = create_report_data( fasta_path, - var.clone(), + Some(var.clone()), bam_path, - chrom.clone(), - pos as u64 - 75, - end_position as u64 + 75, + &Region { + target: chrom.clone(), + start: pos as u64 - 75, + end: end_position as u64 + 75, + }, max_read_depth, )?; visualization = manipulate_json( @@ -490,18 +497,16 @@ pub(crate) fn read_tag_entries( Ok(()) } -fn create_report_data( - fasta_path: &Path, - variant: Variant, - bam_path: &Path, - chrom: String, - from: u64, - to: u64, +pub(crate) fn create_report_data>( + fasta_path: P, + variant: Option, + bam_path: P, + region: &Region, max_read_depth: u32, ) -> Result<(Json, usize)> { let mut data = Vec::new(); - for f in read_fasta(fasta_path, chrom.clone(), from, to, true)? { + for f in read_fasta(&fasta_path, region, true)? { let nucleobase = json!(f); data.push(nucleobase); } @@ -509,11 +514,9 @@ fn create_report_data( let (bases, matches, max_rows) = get_static_reads( bam_path, fasta_path, - chrom, - from, - to, + region, max_read_depth, - &variant, + variant.as_ref(), )?; for b in bases { @@ -526,14 +529,16 @@ fn create_report_data( data.push(mat); } - data.push(json!(variant)); + if variant.is_some() { + data.push(json!(variant)); + } Ok((Json::from_str(&json!(data).to_string()).unwrap(), max_rows)) } /// Inserts the json containing the genome data into the vega specs. /// It also changes keys and values of the json data for the vega plot to look better and compresses the json with jsonm. -fn manipulate_json(data: Json, from: u64, to: u64, max_rows: usize) -> Result { +pub(crate) fn manipulate_json(data: Json, from: u64, to: u64, max_rows: usize) -> Result { let json_string = include_str!("vegaSpecs.json"); let mut vega_specs: Value = serde_json::from_str(json_string)?; @@ -545,7 +550,7 @@ fn manipulate_json(data: Json, from: u64, to: u64, max_rows: usize) -> Result>( + path: P, + region: &Region, compensate_0_basing: bool, ) -> Result> { let mut reader = fasta::IndexedReader::from_file(&path).unwrap(); @@ -18,11 +17,11 @@ pub fn read_fasta( let mut seq: Vec = Vec::new(); - reader.fetch(&chrom, start, stop)?; + reader.fetch(®ion.target, region.start, region.end)?; reader.read(&mut seq)?; let mut fasta = Vec::new(); - let mut ind = start; + let mut ind = region.start; if compensate_0_basing { ind += 1; } diff --git a/src/bcf/report/table_report/static_reader.rs b/src/bcf/report/table_report/static_reader.rs index 2698e725..9597dcf2 100644 --- a/src/bcf/report/table_report/static_reader.rs +++ b/src/bcf/report/table_report/static_reader.rs @@ -2,6 +2,7 @@ use crate::bcf::report::table_report::alignment_reader::{ make_nucleobases, read_indexed_bam, AlignmentMatch, AlignmentNucleobase, }; use crate::bcf::report::table_report::create_report_table::VariantType; +use crate::common::Region; use anyhow::Result; use rand::rngs::StdRng; use rand::seq::IteratorRandom; @@ -39,7 +40,7 @@ fn calc_rows( reads: Vec, matches: Vec, max_read_depth: u32, - variant: &Variant, + variant: Option<&Variant>, ) -> ( Vec, Vec, @@ -55,8 +56,14 @@ fn calc_rows( let mut max_row = 0; for r in matches { - if r.read_start < variant.start_position as u32 && r.read_end > variant.end_position as u32 - { + let overlaps = if variant.is_some() { + r.read_start < variant.unwrap().start_position as u32 + && r.read_end > variant.unwrap().end_position as u32 + } else { + true + }; + + if overlaps { let mut row: u16 = 0; if read_names.contains_key(&r.name) { @@ -85,8 +92,14 @@ fn calc_rows( } for r in reads { - if r.read_start < variant.start_position as u32 && r.read_end > variant.end_position as u32 - { + let overlaps = if variant.is_some() { + r.read_start < variant.unwrap().start_position as u32 + && r.read_end > variant.unwrap().end_position as u32 + } else { + true + }; + + if overlaps { let mut row: u16 = 0; if read_names.contains_key(&r.name) { @@ -134,20 +147,18 @@ fn calc_rows( (reads_wr, matches_wr, max_row) } -pub fn get_static_reads( - path: &Path, - fasta_path: &Path, - chrom: String, - from: u64, - to: u64, +pub fn get_static_reads>( + path: P, + fasta_path: P, + region: &Region, max_read_depth: u32, - variant: &Variant, + variant: Option<&Variant>, ) -> Result<( Vec, Vec, usize, )> { - let alignments = read_indexed_bam(path, chrom.clone(), from, to)?; - let (msm, m) = make_nucleobases(fasta_path, chrom, alignments, from, to)?; + let alignments = read_indexed_bam(path, region)?; + let (msm, m) = make_nucleobases(fasta_path, region, alignments)?; Ok(calc_rows(msm, m, max_read_depth, variant)) } diff --git a/src/cli.rs b/src/cli.rs index 7fc143bb..e96c035e 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,3 +1,4 @@ +use crate::common::Region; use std::path::PathBuf; use structopt::StructOpt; @@ -215,6 +216,28 @@ pub(crate) enum Command { output_path: String, }, + /// Creates a html file with a vega visualization of the given bam region + /// Example: + /// rbt plot-bam -b input.bam -g 2:132424-132924 -r input.fa > plot.html + #[structopt(author = "Felix Wiegand ")] + PlotBam { + /// BAM file to be visualized. + #[structopt(long, short = "b", parse(from_os_str))] + bam_path: Vec, + + /// Path to the reference fasta file. + #[structopt(long, short = "r", parse(from_os_str))] + reference: PathBuf, + + /// Chromosome and region for the visualization. Example: 2:132424-132924 + #[structopt(long, short = "g")] + region: Region, + + /// Set the maximum rows that will be shown in the alignment plots. + #[structopt(long, short = "d", default_value = "500")] + max_read_depth: u32, + }, + /// Creates report from a given VCF file including a visual plot /// for every variant with the given BAM and FASTA file. /// The VCF file has to be annotated with VEP, using the options --hgvs and --hgvsg. @@ -246,7 +269,7 @@ pub(crate) enum Command { #[structopt(long, short = "c", default_value = "1000")] cells: u32, - /// Set the maximum number of cells in the oncoprint per page. Lowering max-cells should improve the performance of the plots in the browser. Default value is 1000. + /// Set the maximum lines of reads that will be shown in the alignment plots. Default value is 500. #[structopt(long, short = "d", default_value = "500")] max_read_depth: u32, diff --git a/src/common.rs b/src/common.rs index 9a452274..f5bc4ab9 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,8 +1,10 @@ +use anyhow::Context; use approx::relative_eq; use bio::stats::probs::{LogProb, PHREDProb}; use bio_types::sequence::SequenceRead; use ordered_float::NotNaN; use std::cmp; +use std::str::FromStr; const PROB_CONFUSION: LogProb = LogProb(-1.0986122886681098); // (1 / 3).ln() const ALLELES: &[u8] = b"ACGT"; @@ -77,3 +79,26 @@ pub trait CalcConsensus<'a, R: SequenceRead> { fn seqids(&self) -> &'a [usize]; fn uuid(&self) -> &'a str; } + +#[derive(Debug, Clone)] +pub struct Region { + pub(crate) target: String, + pub(crate) start: u64, + pub(crate) end: u64, +} + +impl FromStr for Region { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let (target, range) = s.split_once(':').context("No ':' in region string")?; + let (start, end) = range.split_once('-').context("No '-' in region string")?; + let start = start.parse::()?; + let end = end.parse::()?; + Ok(Region { + target: target.into(), + start, + end, + }) + } +} diff --git a/src/main.rs b/src/main.rs index 345f5f6b..641e2583 100644 --- a/src/main.rs +++ b/src/main.rs @@ -113,6 +113,12 @@ fn main() -> Result<()> { pin_until.as_deref(), )? } + PlotBam { + bam_path, + reference, + region, + max_read_depth, + } => bam::plot::plot_bam::plot_bam(&bam_path, reference, ®ion, max_read_depth)?, VcfReport { fasta, vcfs,