From 9a1c2b76c41a76cbc7fe64b688109bb456a40f58 Mon Sep 17 00:00:00 2001
From: Ragnar Groot Koerkamp
Date: Fri, 26 Nov 2021 19:16:40 +0100
Subject: [PATCH] feat: Add conversion to/from Newick format to phylogeny

---
 Cargo.lock       |  24 +++++++
 Cargo.toml       |   3 +-
 src/phylogeny.rs | 200 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 225 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 54f19f1..c172e63 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -23,6 +23,7 @@ version = "0.12.1"
 dependencies = [
  "derive-new",
  "lazy_static",
+ "nom",
  "petgraph",
  "regex",
  "serde",
@@ -84,6 +85,23 @@ version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
 
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
+[[package]]
+name = "nom"
+version = "7.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+ "version_check",
+]
+
 [[package]]
 name = "petgraph"
 version = "0.6.0"
@@ -210,3 +228,9 @@ name = "unicode-xid"
 version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
+
+[[package]]
+name = "version_check"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
diff --git a/Cargo.toml b/Cargo.toml
index 24c873d..4b01a34 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ edition = "2018"
 exclude = [".gitignore", ".github"]
 
 [features]
-phylogeny = ["petgraph"]
+phylogeny = ["petgraph", "nom"]
 
 
 [dependencies]
@@ -24,3 +24,4 @@ lazy_static = "1.1"
 derive-new = "0.5"
 petgraph = { version = ">=0.5,<0.7", optional = true }
 strum_macros = ">=0.20, <0.24"
+nom = { version = "7.1.0", features=["alloc"], optional = true }
diff --git a/src/phylogeny.rs b/src/phylogeny.rs
index f593335..1d6152e 100644
--- a/src/phylogeny.rs
+++ b/src/phylogeny.rs
@@ -7,12 +7,210 @@
 //! Each node is a taxon, identified as a string.
 //! The edges are weighted by the phylogenetic distance if it was defined, or f32::NAN otherwise.
 
-use petgraph::{graph::Graph, Directed};
+use std::{ffi::OsStr, fmt, fs, io, path::Path, str::FromStr};
+
+use nom::bytes::complete::take_till;
+use petgraph::{
+    graph::{Graph, NodeIndex},
+    visit::EdgeRef,
+    Directed,
+    EdgeDirection::Outgoing,
+};
 
 pub type Taxon = String;
 pub type Proximity = f32;
 
 pub type TreeGraph = Graph<Taxon, Proximity, Directed>;
+
+/// Representation of a phylogenetic tree.
+///
+/// The root is at NodeIndex 0.
+///
+/// String conversions and file IO are to/from the [Newick format](https://en.wikipedia.org/wiki/Newick_format).
+/// Extra whitespace, quoting strings, and comments are currently not supported.
 pub struct Tree {
     pub g: TreeGraph,
 }
+
+impl Tree {
+    /// Create a new empty Tree.
+    pub fn new() -> Self {
+        Tree {
+            g: TreeGraph::new(),
+        }
+    }
+}
+
+impl fmt::Display for Tree {
+    /// Write the Tree in the Newick format.
+    ///
+    /// Edges whose weight is NaN are written without a `:length` suffix.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Writes the subtree rooted at `i` as `(child,child,...)name`.
+        fn write_subtree(i: NodeIndex, g: &TreeGraph, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            let mut edges = g.edges_directed(i, Outgoing).peekable();
+            if edges.peek().is_some() {
+                f.write_str("(")?;
+                for (k, edge) in edges.enumerate() {
+                    if k > 0 {
+                        f.write_str(",")?;
+                    }
+                    write_subtree(edge.target(), g, f)?;
+                    if !edge.weight().is_nan() {
+                        write!(f, ":{}", edge.weight())?;
+                    }
+                }
+                f.write_str(")")?;
+            }
+            if let Some(name) = g.node_weight(i) {
+                f.write_str(name)?;
+            }
+            Ok(())
+        }
+        write_subtree(0.into(), &self.g, f)?;
+        f.write_str(";")
+    }
+}
+
+impl FromStr for Tree {
+    type Err = String;
+
+    /// Parse a string in Newick format.
+    ///
+    /// # Errors
+    ///
+    /// Returns the parser's error message if `s` cannot be parsed.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        use nom::{
+            branch::alt,
+            bytes::complete::tag,
+            combinator::{map, opt},
+            multi::separated_list1,
+            number::complete::float,
+            sequence::{delimited, pair, preceded, terminated, tuple},
+            IResult,
+        };
+
+        type Result<'a, O> = IResult<&'a str, O>;
+        enum ParseTree<'a> {
+            Leaf(&'a str),
+            Internal((Vec<(ParseTree<'a>, f32)>, &'a str)),
+        }
+
+        impl ParseTree<'_> {
+            fn to_tree(&self, t: &mut Tree) -> NodeIndex {
+                match self {
+                    ParseTree::Leaf(name) => t.g.add_node(name.to_string()),
+                    ParseTree::Internal((children, name)) => {
+                        let node = t.g.add_node(name.to_string());
+                        // Add the children in reverse order, so that PetGraph iterates them in the normal order.
+                        // Useful for tests.
+                        children.iter().rev().for_each(|(pt, d)| {
+                            let child_node = pt.to_tree(t);
+                            t.g.add_edge(node, child_node, *d);
+                        });
+                        node
+                    }
+                }
+            }
+        }
+
+        // Grammar taken from https://en.wikipedia.org/wiki/Newick_format#Grammar.
+        fn length(s: &str) -> Result<f32> {
+            map(opt(preceded(tag(":"), float)), |o| o.unwrap_or(f32::NAN))(s)
+        }
+        fn name(s: &str) -> Result<&str> {
+            take_till(|c| ";(),:".find(c).is_some())(s)
+        }
+        fn leaf(s: &str) -> Result<ParseTree> {
+            map(name, ParseTree::Leaf)(s)
+        }
+        fn branch(s: &str) -> Result<(ParseTree, f32)> {
+            tuple((subtree, length))(s)
+        }
+        fn branchset(s: &str) -> Result<Vec<(ParseTree, f32)>> {
+            separated_list1(tag(","), branch)(s)
+        }
+        fn internal(s: &str) -> Result<ParseTree> {
+            map(
+                pair(delimited(tag("("), branchset, tag(")")), name),
+                ParseTree::Internal,
+            )(s)
+        }
+        fn subtree(s: &str) -> Result<ParseTree> {
+            alt((internal, leaf))(s)
+        }
+        fn tree(s: &str) -> Result<ParseTree> {
+            terminated(subtree, tag(";"))(s)
+        }
+        let mut t = Tree::new();
+        map(tree, |pt| pt.to_tree(&mut t))(s).map_err(|x| x.to_string())?;
+        Ok(t)
+    }
+}
+
+impl Tree {
+    /// Read from a `.tree` file in Newick format.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error message if `p` does not have the `tree` extension, if the file
+    /// cannot be read, or if its contents cannot be parsed.
+    pub fn from_file(p: &Path) -> Result<Self, String> {
+        if p.extension() != Some(OsStr::new("tree")) {
+            return Err(format!("expected a `.tree` file, got {}", p.display()));
+        }
+        fs::read_to_string(p).map_err(|e| e.to_string())?.parse()
+    }
+
+    /// Write to a `.tree` file in Newick format.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `InvalidInput` error if `p` does not have the `tree` extension, or the
+    /// underlying IO error if the file cannot be written.
+    pub fn to_file(&self, p: &Path) -> io::Result<()> {
+        if p.extension() != Some(OsStr::new("tree")) {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                format!("expected a `.tree` file, got {}", p.display()),
+            ));
+        }
+        fs::write(p, self.to_string())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn phylogeny_from_to_string() {
+        // From Wikipedia: https://en.wikipedia.org/wiki/Newick_format
+        let strings = vec![
+            // This is not supported currently
+            //"(:0.1,:0.2,(:0.3,:0.4):0.5):0.0;", //all have a distance to parent, including the root.
+            ";",
+            "A;",
+            "(A,B);",
+            "(,,(,));", //no nodes are named
+            "(A,B,(C,D));", //leaf nodes are named
+            "(A,B,(C,D)E)F;", //all nodes are named
+            "(:0.1,:0.2,(:0.3,:0.4):0.5);", //all but root node have a distance to parent
+            "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);", //distances and leaf names (popular)
+            "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;", //distances and all names
+            "((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;", //a tree rooted on a leaf node (rare)
+        ];
+        for s in strings {
+            let t = s.parse::<Tree>().unwrap();
+            assert_eq!(t.to_string(), s);
+        }
+    }
+
+    #[test]
+    fn phylogeny_file_io_rejects_other_extensions() {
+        let p = Path::new("phylogeny.txt");
+        assert!(Tree::from_file(p).is_err());
+        assert!(Tree::new().to_file(p).is_err());
+    }
+}