Skip to content

Commit

Permalink
feat: Add DistanceMatrix type and conversion from/to Phylip matrix
Browse files Browse the repository at this point in the history
  • Loading branch information
RagnarGrootKoerkamp committed Nov 26, 2021
1 parent 9a1c2b7 commit 8a6f307
Show file tree
Hide file tree
Showing 4 changed files with 288 additions and 1 deletion.
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ edition = "2018"
exclude = [".gitignore", ".github"]

[features]
phylogeny = ["petgraph", "nom"]
phylogeny = ["petgraph", "nom", "itertools"]


[dependencies]
Expand All @@ -25,3 +25,4 @@ derive-new = "0.5"
petgraph = { version = ">=0.5,<0.7", optional = true }
strum_macros = ">=0.20, <0.24"
nom = { version = "7.1.0", features=["alloc"], optional = true }
itertools = { version = "0.10.1", optional = true }
268 changes: 268 additions & 0 deletions src/distancematrix.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
//! A distance matrix in relaxed Phylip format, stored as a list of taxons and a matrix distances.

use itertools::zip;
use std::{
cmp::{max, min},
ffi::OsStr,
fs,
num::ParseIntError,
ops::{Index, IndexMut},
path::Path,
str::FromStr,
};

/// The type of the matrix. Either square, lower triangular (excluding the
/// diagonal), or upper triangular (excluding the diagonal).
/// Square matrices are assumed to be symmetric, but this is not enforced nor checked.
#[derive(Debug, PartialEq, Eq)]
pub enum MatrixType {
Square,
Lower,
Upper,
}

pub type Taxon = String;

/// A distance matrix representing a Phylip format `.dist` file.
#[derive(Debug)]
pub struct DistanceMatrix {
names: Vec<String>,
distances: Vec<Vec<f32>>,
matrix_type: MatrixType,
}

impl DistanceMatrix {
/// The number of sequences (rows) in the matrix.
pub fn len(&self) -> usize {
self.names.len()
}

/// Create a new DistanceMatrix.
///
/// `distances` must be a matrix with the same number of rows as `names`, and follow one of the accepted types.
pub fn new(names: Vec<String>, distances: Vec<Vec<f32>>) -> Result<Self, String> {
if names.len() != distances.len() {
Err("Distances and Names have different length.".to_string())
} else {
let n = names.len();
let square = distances.iter().all(|x| x.len() == n);
let lower = distances.iter().enumerate().all(|(i, x)| x.len() == i);
let upper = distances
.iter()
.enumerate()
.all(|(i, x)| x.len() == n - 1 - i);
let matrix_type = match (square, lower, upper) {
(true, _, _) => MatrixType::Square,
(_, true, _) => MatrixType::Lower,
(_, _, true) => MatrixType::Upper,
_ => Err("Could not determine shape of matrix.")?,
};
Ok(Self {
names,
distances,
matrix_type,
})
}
}

// Converts an index (i,j) to the right index to use given the shape of the matrix.
fn get_index(&self, (i, j): (usize, usize)) -> (usize, usize) {
match self.matrix_type {
MatrixType::Square => (i, j),
MatrixType::Lower => {
assert!(i != j);
(max(i, j), min(i, j))
}
MatrixType::Upper => {
assert!(i != j);
(min(i, j), max(i, j) - min(i, j) - 1)
}
}
}

/// Read a relaxed Phylip format `.dist`` file into a DistanceMatrix.
pub fn from_file(p: &Path) -> Result<Self, String> {
assert!(p.extension() == Some(OsStr::new("dist")));
fs::read_to_string(p).map_err(|e| e.to_string())?.parse()
}

/// Write the DistanceMatrix to a relaxed Phylip format `.dist`` file.
pub fn to_file(self: &Self, p: &Path) -> Result<(), String> {
assert!(p.extension() == Some(OsStr::new("dist")));
Ok(fs::write(p, self.to_string()).map_err(|e| e.to_string())?)
}
}

impl FromStr for DistanceMatrix {
type Err = String;

/// Read a relaxed Phylib distance matrix.
fn from_str(s: &str) -> Result<Self, String> {
let mut lines = s.lines();
let n = lines
.next()
.ok_or("Expected n")?
.parse()
.map_err(|e: ParseIntError| e.to_string())?;

let mut names = Vec::with_capacity(n);
let mut distances = Vec::with_capacity(n);

for line in lines.into_iter() {
let mut it = line.split_ascii_whitespace();
names.push(
it.next()
.ok_or("Line does not start with a name")?
.to_string(),
);
distances.push(it.map(|chars| chars.parse().unwrap()).collect());
}

Ok(Self::new(names, distances)?)
}
}

impl ToString for DistanceMatrix {
/// Convert a relaxed Phylip distance matrix to a String.
fn to_string(&self) -> String {
let mut s = String::new();
s += &self.len().to_string();
s += "\n";
for (name, ds) in zip(&self.names, &self.distances) {
s += &name;
for d in ds {
s += " ";
s += &d.to_string();
}
s += "\n";
}
s
}
}

/// Index access into the DistanceMatrix, taking into account the shape.
///
/// Indices should be used as if the matrix was square. For triangular matrices,
/// `i` and `j` must be distinct, and `(i,j)` and `(j,i)` represent the same
/// element.
impl Index<(usize, usize)> for DistanceMatrix {
type Output = f32;
fn index(&self, t: (usize, usize)) -> &Self::Output {
let (i, j) = self.get_index(t);
&self.distances[i][j]
}
}

/// Mutable index access into the DistanceMatrix, taking into account the shape, like `Index`.
impl IndexMut<(usize, usize)> for DistanceMatrix {
fn index_mut(&mut self, t: (usize, usize)) -> &mut Self::Output {
let (i, j) = self.get_index(t);
&mut self.distances[i][j]
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn distance_matrix_from_to_string() {
let square = "3
a 0 1 2
B 1 0 3
XYZ 2 3 0
";
let lower = "3
a
B 1
XYZ 2 3
";
let upper = "3
a 1 2
B 3
XYZ
";
for (s, matrix_type) in zip(
[square, lower, upper],
[MatrixType::Square, MatrixType::Lower, MatrixType::Upper],
) {
let t = s.parse::<DistanceMatrix>().unwrap();
assert_eq!(t.matrix_type, matrix_type);
assert_eq!(t.to_string(), s);
}
}

#[test]
fn floats_and_names() {
let s = "1
some_long_name?! 1.0
";
let t = s.parse::<DistanceMatrix>().unwrap();
assert_eq!(t[(0, 0)], 1.0);
assert_eq!(t.names[0], "some_long_name?!");

let s = "1
ABCabc 1.0001
";
let t = s.parse::<DistanceMatrix>().unwrap();
assert_eq!(t[(0, 0)], 1.0001);
assert_eq!(t.names[0], "ABCabc");
}

#[test]
fn indexing() {
let s = "1
a 0 1 2
b 3 4 5
c 6 7 8
";
let mut t = s.parse::<DistanceMatrix>().unwrap();
assert_eq!(t[(0, 0)], 0.);
assert_eq!(t[(1, 2)], 5.);
assert_eq!(t[(2, 1)], 7.);
t[(1, 2)] = -1.;
t[(2, 1)] = -2.;
assert_eq!(t[(1, 2)], -1.);
assert_eq!(t[(2, 1)], -2.);
}

#[test]
fn lower_indexing() {
let s = "1
a
b 5
c 7 8
";
let mut t = s.parse::<DistanceMatrix>().unwrap();
assert_eq!(t[(0, 2)], 7.);
assert_eq!(t[(1, 2)], 8.);
assert_eq!(t[(2, 1)], 8.);
t[(0, 2)] = -1.;
t[(2, 1)] = -2.;
t[(1, 2)] = -3.;
assert_eq!(t[(0, 2)], -1.);
assert_eq!(t[(2, 0)], -1.);
assert_eq!(t[(1, 2)], -3.);
assert_eq!(t[(2, 1)], -3.);
}

#[test]
fn upper_indexing() {
let s = "1
a 1 2
b 3
c
";
let mut t = s.parse::<DistanceMatrix>().unwrap();
assert_eq!(t[(0, 2)], 2.);
assert_eq!(t[(1, 2)], 3.);
assert_eq!(t[(2, 1)], 3.);
t[(0, 2)] = -1.;
t[(2, 1)] = -2.;
t[(1, 2)] = -3.;
assert_eq!(t[(0, 2)], -1.);
assert_eq!(t[(2, 0)], -1.);
assert_eq!(t[(1, 2)], -3.);
assert_eq!(t[(2, 1)], -3.);
}
}
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ extern crate petgraph;

pub mod alignment;
pub mod annot;
#[cfg(feature = "phylogeny")]
pub mod distancematrix;
pub mod genome;
#[cfg(feature = "phylogeny")]
pub mod phylogeny;
Expand Down

0 comments on commit 8a6f307

Please sign in to comment.