-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add DistanceMatrix type and conversion from/to Phylip matrix
- Loading branch information
1 parent
9a1c2b7
commit 8a6f307
Showing
4 changed files
with
288 additions
and
1 deletion.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,268 @@ | ||
//! A distance matrix in relaxed Phylip format, stored as a list of taxons and a matrix distances. | ||
|
||
use itertools::zip; | ||
use std::{ | ||
cmp::{max, min}, | ||
ffi::OsStr, | ||
fs, | ||
num::ParseIntError, | ||
ops::{Index, IndexMut}, | ||
path::Path, | ||
str::FromStr, | ||
}; | ||
|
||
/// The type of the matrix. Either square, lower triangular (excluding the | ||
/// diagonal), or upper triangular (excluding the diagonal). | ||
/// Square matrices are assumed to be symmetric, but this is not enforced nor checked. | ||
#[derive(Debug, PartialEq, Eq)] | ||
pub enum MatrixType { | ||
Square, | ||
Lower, | ||
Upper, | ||
} | ||
|
||
pub type Taxon = String; | ||
|
||
/// A distance matrix representing a Phylip format `.dist` file. | ||
#[derive(Debug)] | ||
pub struct DistanceMatrix { | ||
names: Vec<String>, | ||
distances: Vec<Vec<f32>>, | ||
matrix_type: MatrixType, | ||
} | ||
|
||
impl DistanceMatrix { | ||
/// The number of sequences (rows) in the matrix. | ||
pub fn len(&self) -> usize { | ||
self.names.len() | ||
} | ||
|
||
/// Create a new DistanceMatrix. | ||
/// | ||
/// `distances` must be a matrix with the same number of rows as `names`, and follow one of the accepted types. | ||
pub fn new(names: Vec<String>, distances: Vec<Vec<f32>>) -> Result<Self, String> { | ||
if names.len() != distances.len() { | ||
Err("Distances and Names have different length.".to_string()) | ||
} else { | ||
let n = names.len(); | ||
let square = distances.iter().all(|x| x.len() == n); | ||
let lower = distances.iter().enumerate().all(|(i, x)| x.len() == i); | ||
let upper = distances | ||
.iter() | ||
.enumerate() | ||
.all(|(i, x)| x.len() == n - 1 - i); | ||
let matrix_type = match (square, lower, upper) { | ||
(true, _, _) => MatrixType::Square, | ||
(_, true, _) => MatrixType::Lower, | ||
(_, _, true) => MatrixType::Upper, | ||
_ => Err("Could not determine shape of matrix.")?, | ||
}; | ||
Ok(Self { | ||
names, | ||
distances, | ||
matrix_type, | ||
}) | ||
} | ||
} | ||
|
||
// Converts an index (i,j) to the right index to use given the shape of the matrix. | ||
fn get_index(&self, (i, j): (usize, usize)) -> (usize, usize) { | ||
match self.matrix_type { | ||
MatrixType::Square => (i, j), | ||
MatrixType::Lower => { | ||
assert!(i != j); | ||
(max(i, j), min(i, j)) | ||
} | ||
MatrixType::Upper => { | ||
assert!(i != j); | ||
(min(i, j), max(i, j) - min(i, j) - 1) | ||
} | ||
} | ||
} | ||
|
||
/// Read a relaxed Phylip format `.dist`` file into a DistanceMatrix. | ||
pub fn from_file(p: &Path) -> Result<Self, String> { | ||
assert!(p.extension() == Some(OsStr::new("dist"))); | ||
fs::read_to_string(p).map_err(|e| e.to_string())?.parse() | ||
} | ||
|
||
/// Write the DistanceMatrix to a relaxed Phylip format `.dist`` file. | ||
pub fn to_file(self: &Self, p: &Path) -> Result<(), String> { | ||
assert!(p.extension() == Some(OsStr::new("dist"))); | ||
Ok(fs::write(p, self.to_string()).map_err(|e| e.to_string())?) | ||
} | ||
} | ||
|
||
impl FromStr for DistanceMatrix { | ||
type Err = String; | ||
|
||
/// Read a relaxed Phylib distance matrix. | ||
fn from_str(s: &str) -> Result<Self, String> { | ||
let mut lines = s.lines(); | ||
let n = lines | ||
.next() | ||
.ok_or("Expected n")? | ||
.parse() | ||
.map_err(|e: ParseIntError| e.to_string())?; | ||
|
||
let mut names = Vec::with_capacity(n); | ||
let mut distances = Vec::with_capacity(n); | ||
|
||
for line in lines.into_iter() { | ||
let mut it = line.split_ascii_whitespace(); | ||
names.push( | ||
it.next() | ||
.ok_or("Line does not start with a name")? | ||
.to_string(), | ||
); | ||
distances.push(it.map(|chars| chars.parse().unwrap()).collect()); | ||
} | ||
|
||
Ok(Self::new(names, distances)?) | ||
} | ||
} | ||
|
||
impl ToString for DistanceMatrix { | ||
/// Convert a relaxed Phylip distance matrix to a String. | ||
fn to_string(&self) -> String { | ||
let mut s = String::new(); | ||
s += &self.len().to_string(); | ||
s += "\n"; | ||
for (name, ds) in zip(&self.names, &self.distances) { | ||
s += &name; | ||
for d in ds { | ||
s += " "; | ||
s += &d.to_string(); | ||
} | ||
s += "\n"; | ||
} | ||
s | ||
} | ||
} | ||
|
||
/// Index access into the DistanceMatrix, taking into account the shape. | ||
/// | ||
/// Indices should be used as if the matrix was square. For triangular matrices, | ||
/// `i` and `j` must be distinct, and `(i,j)` and `(j,i)` represent the same | ||
/// element. | ||
impl Index<(usize, usize)> for DistanceMatrix { | ||
type Output = f32; | ||
fn index(&self, t: (usize, usize)) -> &Self::Output { | ||
let (i, j) = self.get_index(t); | ||
&self.distances[i][j] | ||
} | ||
} | ||
|
||
/// Mutable index access into the DistanceMatrix, taking into account the shape, like `Index`. | ||
impl IndexMut<(usize, usize)> for DistanceMatrix { | ||
fn index_mut(&mut self, t: (usize, usize)) -> &mut Self::Output { | ||
let (i, j) = self.get_index(t); | ||
&mut self.distances[i][j] | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
|
||
#[test] | ||
fn distance_matrix_from_to_string() { | ||
let square = "3 | ||
a 0 1 2 | ||
B 1 0 3 | ||
XYZ 2 3 0 | ||
"; | ||
let lower = "3 | ||
a | ||
B 1 | ||
XYZ 2 3 | ||
"; | ||
let upper = "3 | ||
a 1 2 | ||
B 3 | ||
XYZ | ||
"; | ||
for (s, matrix_type) in zip( | ||
[square, lower, upper], | ||
[MatrixType::Square, MatrixType::Lower, MatrixType::Upper], | ||
) { | ||
let t = s.parse::<DistanceMatrix>().unwrap(); | ||
assert_eq!(t.matrix_type, matrix_type); | ||
assert_eq!(t.to_string(), s); | ||
} | ||
} | ||
|
||
#[test] | ||
fn floats_and_names() { | ||
let s = "1 | ||
some_long_name?! 1.0 | ||
"; | ||
let t = s.parse::<DistanceMatrix>().unwrap(); | ||
assert_eq!(t[(0, 0)], 1.0); | ||
assert_eq!(t.names[0], "some_long_name?!"); | ||
|
||
let s = "1 | ||
ABCabc 1.0001 | ||
"; | ||
let t = s.parse::<DistanceMatrix>().unwrap(); | ||
assert_eq!(t[(0, 0)], 1.0001); | ||
assert_eq!(t.names[0], "ABCabc"); | ||
} | ||
|
||
#[test] | ||
fn indexing() { | ||
let s = "1 | ||
a 0 1 2 | ||
b 3 4 5 | ||
c 6 7 8 | ||
"; | ||
let mut t = s.parse::<DistanceMatrix>().unwrap(); | ||
assert_eq!(t[(0, 0)], 0.); | ||
assert_eq!(t[(1, 2)], 5.); | ||
assert_eq!(t[(2, 1)], 7.); | ||
t[(1, 2)] = -1.; | ||
t[(2, 1)] = -2.; | ||
assert_eq!(t[(1, 2)], -1.); | ||
assert_eq!(t[(2, 1)], -2.); | ||
} | ||
|
||
#[test] | ||
fn lower_indexing() { | ||
let s = "1 | ||
a | ||
b 5 | ||
c 7 8 | ||
"; | ||
let mut t = s.parse::<DistanceMatrix>().unwrap(); | ||
assert_eq!(t[(0, 2)], 7.); | ||
assert_eq!(t[(1, 2)], 8.); | ||
assert_eq!(t[(2, 1)], 8.); | ||
t[(0, 2)] = -1.; | ||
t[(2, 1)] = -2.; | ||
t[(1, 2)] = -3.; | ||
assert_eq!(t[(0, 2)], -1.); | ||
assert_eq!(t[(2, 0)], -1.); | ||
assert_eq!(t[(1, 2)], -3.); | ||
assert_eq!(t[(2, 1)], -3.); | ||
} | ||
|
||
#[test] | ||
fn upper_indexing() { | ||
let s = "1 | ||
a 1 2 | ||
b 3 | ||
c | ||
"; | ||
let mut t = s.parse::<DistanceMatrix>().unwrap(); | ||
assert_eq!(t[(0, 2)], 2.); | ||
assert_eq!(t[(1, 2)], 3.); | ||
assert_eq!(t[(2, 1)], 3.); | ||
t[(0, 2)] = -1.; | ||
t[(2, 1)] = -2.; | ||
t[(1, 2)] = -3.; | ||
assert_eq!(t[(0, 2)], -1.); | ||
assert_eq!(t[(2, 0)], -1.); | ||
assert_eq!(t[(1, 2)], -3.); | ||
assert_eq!(t[(2, 1)], -3.); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters