Skip to content

Commit

Permalink
Merge pull request cogent3#1827 from cogent3/refactor-of-map
Browse files Browse the repository at this point in the history
API: refactor of map into IndelMap and FeatureMap, fixes cogent3#1726
  • Loading branch information
GavinHuttley committed May 2, 2024
2 parents 407b927 + 3cf3910 commit 8f97f07
Show file tree
Hide file tree
Showing 22 changed files with 2,827 additions and 597 deletions.
55 changes: 55 additions & 0 deletions changelog.d/20240426_082600_Gavin.Huttley.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
<!--
A new scriv changelog fragment.
Uncomment the section that is right (remove the HTML comment wrapper).
-->

<!--
### Contributors
- A bullet item for the Contributors category.
-->

### ENH

- The new IndelMap class uses numpy arrays to store information about gap
locations for aligned sequences. This will greatly reduce the memory
overhead for aligned sequences. The class also provides explicit methods
for inter-converting between sequence and alignment coordinates. An important
difference to the original Map implementation is that IndelMap is memoryless,
meaning the history of slice operations is now fully delegated to the SeqView
class.
- The new FeatureMap class is a slightly modified version of the original Map
class. It is used solely for handling sequence feature mappings to sequences
and alignments. Like IndelMap, it is memoryless but it does record its
orientation with respect to the parent sequence.
- Note that both IndelMap and FeatureMap replace the spans attribute with a
generator.

<!--
### BUG
- A bullet item for the BUG category.
-->
<!--
### DOC
- A bullet item for the DOC category.
-->
<!--
### Deprecations
- A bullet item for the Deprecations category.
-->

### Discontinued

- The cogent3.core.location.Map class is now marked for deprecation. It is
being replaced by two classes, IndelMap and FeatureMap. The latter has
largely the same functionality of Map.


23 changes: 20 additions & 3 deletions src/cogent3/align/pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from cogent3.align.traceback import alignment_traceback, map_traceback
from cogent3.core.alignment import Aligned
from cogent3.core.location import IndelMap
from cogent3.evolve.likelihood_tree import LikelihoodTreeEdge
from cogent3.util.misc import ascontiguousarray

Expand Down Expand Up @@ -477,8 +478,20 @@ def _calcAligneds(self, children):
aligneds = []
for dim, child in enumerate(children):
for seq_name, aligned in child.aligneds:
aligned = aligned.remapped_to((maps[dim] * word_length).inverse())
aligneds.append((seq_name, aligned))
# if word_length != 1 then maps are forced to have
# sequences lengths that are modulo word_length
new_map = aligned.map.merge_maps(
maps[dim] * word_length,
parent_length=maps[dim].parent_length * word_length,
)
# Likewise, if the data is not modulo word_length,
# it is trimmed to match
data = (
aligned.data
if new_map.parent_length == len(aligned.data)
else aligned.data[: new_map.parent_length]
)
aligneds.append((seq_name, Aligned(new_map, data)))
return aligneds

def backward(self):
Expand Down Expand Up @@ -526,7 +539,11 @@ def __init__(self, leaf):
_Alignable.__init__(self, leaf)
if hasattr(leaf, "sequence"):
self.seq = leaf.sequence
aligned = Aligned([(0, len(self.seq))], self.seq, len(self.seq))
seqlen = len(self.seq)
imap = IndelMap.from_aligned_segments(
locations=[(0, seqlen)], aligned_length=seqlen
)
aligned = Aligned(imap, self.seq)
self.aligneds = [(self.leaf.edge_name, aligned)]
self.max_preds = 1
self._pog = None
Expand Down
31 changes: 21 additions & 10 deletions src/cogent3/align/traceback.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
#!/usr/bin/env python
"""Conversion of dynamic program results ("arrays of arrows") into gap vectors,
gapped sequences or Cogent Alignment objects"""
import typing

from cogent3.core.alignment import Aligned, Alignment
from cogent3.core.annotation import Map
from cogent3.core.location import IndelMap


IntOrNone = typing.Union[int, None]
IntListType = list[int]
CoordsListType = list[list[typing.Sequence[int]]]


def seq_traceback(s1, s2, aligned_positions, gap_value):
Expand All @@ -30,7 +35,9 @@ def seq_traceback(s1, s2, aligned_positions, gap_value):
return alignments


def gap_traceback(aligned_positions):
def gap_traceback(
aligned_positions: list[list[IntOrNone, IntOrNone]]
) -> tuple[IntListType, IntListType, CoordsListType, int]:
"""gap Vectors from state matrix and ending point"""
consuming = [False, False]
starts = [None, None]
Expand All @@ -52,15 +59,19 @@ def gap_traceback(aligned_positions):
if consuming[dimension]:
gv.append(a)
gap_vectors[dimension] = [(gv[i], gv[i + 1]) for i in range(0, len(gv), 2)]
return (starts, ends, gap_vectors, a)
return starts, ends, gap_vectors, a


def map_traceback(aligned_positions):
# using Map's to keep track of gaps for indel alignment
(starts, ends, gap_vectors, alignment_len) = gap_traceback(aligned_positions)
# print 'gv', gap_vectors
maps = [Map(gv, parent_length=alignment_len).inverse() for gv in gap_vectors]
return (starts, ends, maps)
def map_traceback(
aligned_positions: list[list[IntOrNone, IntOrNone]]
) -> tuple[IntListType, IntListType, list[IndelMap]]:
# using IndelMap's to keep track of gaps for indel alignment
starts, ends, gap_vectors, alignment_len = gap_traceback(aligned_positions)
maps = [
IndelMap.from_aligned_segments(locations=gv, aligned_length=alignment_len)
for gv in gap_vectors
]
return starts, ends, maps


def alignment_traceback(seqs, aligned_positions, word_length):
Expand Down
8 changes: 4 additions & 4 deletions src/cogent3/app/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def _merged_gaps(a_gaps: dict, b_gaps: dict) -> dict:
function to 'max'. Use 'sum' when the gaps derive from different
sequences.
"""

# todo convert to using IndelMap functions
if not a_gaps:
return b_gaps

Expand Down Expand Up @@ -251,7 +251,8 @@ def _gaps_for_injection(other_seq_gaps: dict, refseq_gaps: dict, seqlen: int) ->
# sequence coordinates
# we probably need to include the refseq gap union because we need to
# establish whether a refseq gap overlaps with a gap in other seq
# and
# todo convert these functions to using IndelMap and the numpy set
# operation functions
all_gaps = {}
all_gaps.update(other_seq_gaps)
for gap_pos, gap_length in sorted(refseq_gaps.items()):
Expand Down Expand Up @@ -304,8 +305,7 @@ def pairwise_to_multiple(pwise, ref_seq, moltype, info=None):
other_seq = aln.named_seqs[other_name]
other_gaps = dict(other_seq.map.get_gap_coordinates())
diff_gaps = _combined_refseq_gaps(curr_ref_gaps, ref_gaps)
inject = _gaps_for_injection(other_gaps, diff_gaps, len(other_seq.data))
if inject:
if inject := _gaps_for_injection(other_gaps, diff_gaps, len(other_seq.data)):
m = gap_coords_to_map(inject, len(other_seq.data))
other_seq = Aligned(m, other_seq.data)

Expand Down
Loading

0 comments on commit 8f97f07

Please sign in to comment.