From a1e0e0d7aed2670ec74cd3baf58271e1fcab3e97 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 20 Feb 2024 16:47:57 +1100 Subject: [PATCH 01/62] ENH: IndelMap a specialised map for indels [NEW] a MapABC base class to define the common API for map-like classes [NEW] new class is a simplification of current Map. It provides only those attributes needed to represent gaps in aligned sequences. The original Map will be reserved for Features. IndelMap does not store history, and it does not store strand / reversed information. As a result, methods like nucleic_reversed() always return spans with Span.start < Span.end --- src/cogent3/core/location.py | 487 ++++++++++++++++++++++++++++++- tests/test_core/test_location.py | 245 +++++++++++++++- 2 files changed, 715 insertions(+), 17 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index db6c9d67c..7bb56c3e6 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -20,11 +20,14 @@ former contains a and b while the latter contains c. 
""" import copy +import dataclasses +import inspect +from abc import ABC, abstractmethod from bisect import bisect_left, bisect_right from functools import total_ordering from itertools import chain -from typing import Union +from typing import List, Optional, Sequence, Tuple, Union from numpy import array, ndarray @@ -66,25 +69,24 @@ def _norm_slice(index, length): return (start, start + 1, 1) -def as_map(slice, length): +def as_map(slice, length, cls): """Take anything that might be used as a subscript: Integer, Slice, - or Map, and return a Map.""" + or MapABC, and return cls.""" if isinstance(slice, (list, tuple)): spans = [] for i in slice: - spans.extend(as_map(i, length).spans) - map = Map(spans=spans, parent_length=length) - elif isinstance(slice, Map): - map = slice + spans.extend(as_map(i, length).spans, cls) + return cls(spans=spans, parent_length=length) + elif isinstance(slice, cls): + return slice # TODO reasons for failure when the following is not commented out # should be checked further # assert map.parent_length == length, (map, length) else: - (lo, hi, step) = _norm_slice(slice, length) + lo, hi, step = _norm_slice(slice, length) assert (step or 1) == 1 - map = Map([(lo, hi)], parent_length=length) - return map + return cls(locations=[(lo, hi)], parent_length=length) class SpanI(object): @@ -323,13 +325,13 @@ def remap_with(self, map): C is a span of a feature on B which itself is a feature on A, so to place C on A return that part of B (map) covered by C (self)""" - (offsets, spans) = (map.offsets, map.spans) + offsets, spans = map.offsets, list(map.spans) map_length = offsets[-1] + spans[-1].length # don't try to remap any non-corresponding end region(s) # this won't matter if all spans lie properly within their # parent maps, but that might not be true of Display slices. 
- (zlo, zhi) = (max(0, self.start), min(map_length, self.end)) + zlo, zhi = max(0, self.start), min(map_length, self.end) # Find the right span(s) of the map first = bisect_right(offsets, zlo) - 1 @@ -450,7 +452,7 @@ def __eq__(self, other): return type(self) == type(other) -class _LostSpan(object): +class _LostSpan: """A placeholder span which doesn't exist in the underlying sequence""" __slots__ = ["length", "value", "_serialisable"] @@ -623,7 +625,7 @@ def __repr__(self): def __getitem__(self, slice): # A possible shorter map at the same level - slice = as_map(slice, len(self)) + slice = as_map(slice, len(self), self.__class__) new_parts = [] for span in slice.spans: new_parts.extend(span.remap_with(self)) @@ -1132,3 +1134,460 @@ def gap_coords_to_map(gaps_lengths: dict, seq_length: int) -> Map: spans.append(Span(last, seq_length)) return Map(spans=spans, parent_length=seq_length) + + +class MapABC(ABC): + """base class for genomic map objects""" + + def __new__(cls, *args, **kwargs): + obj = object.__new__(cls) + init_sig = inspect.signature(cls.__init__) + bargs = init_sig.bind_partial(cls, *args, **kwargs) + bargs.apply_defaults() + init_vals = bargs.arguments + init_vals.pop("self", None) + + obj._serialisable = init_vals + return obj + + @abstractmethod + def __len__(self): + ... + + @abstractmethod + def __add__(self, other): + ... + + @classmethod + @abstractmethod + def from_rich_dict(cls, data): + ... + + @abstractmethod + def gaps(self): + ... + + @abstractmethod + def nongap(self): + ... + + @abstractmethod + def get_coordinates(self): + ... + + @abstractmethod + def nucleic_reversed(self): + ... + + @abstractmethod + def reversed(self): + ... + + @abstractmethod + def to_rich_dict(self): + ... + + @abstractmethod + def inverse(self): + ... 
+ + +def _spans_from_locations( + locations, tidy, parent_length +) -> Tuple[Union[Span, _LostSpan]]: + spans = [] + for start, end in locations: + reverse = start > end + if max(start, end) < 0 or min(start, end) > parent_length: + raise RuntimeError( + f"located outside sequence: {(start, end, parent_length)}" + ) + if max(start, end) > parent_length and min(start, end) < 0: + l_diff = min(start, end) + r_diff = max(start, end) - parent_length + start, end = (0, parent_length) if start < end else (parent_length, 0) + spans += [ + LostSpan(abs(l_diff)), + Span(start, end, tidy, tidy, reverse=reverse), + LostSpan(abs(r_diff)), + ] + elif min(start, end) < 0: + diff = min(start, end) + start = max(start, 0) + end = max(end, 0) + spans += [ + LostSpan(abs(diff)), + Span(start, end, tidy, tidy, reverse=reverse), + ] + elif max(start, end) > parent_length: + diff = max(start, end) - parent_length + start = min(start, parent_length) + end = min(end, parent_length) + spans += [ + Span(start, end, tidy, tidy, reverse=reverse), + LostSpan(abs(diff)), + ] + else: + spans += [Span(start, end, tidy, tidy, reverse=reverse)] + return tuple(spans) + + +T = Union[List[int], Tuple[int]] + + +@dataclasses.dataclass +class IndelMap(MapABC): + """store locations of deletions in a Aligned sequence""" + + # todo design notes + # I think this object should never try and store "history", i.e. it + # should directly relate to the current sequence only. Let that sequence, + # which is represented by a SeqView, store history. + # Following ths, storing reverse is also a bad idea for this object, + # also done by the SeqView only. + # TODO reverse complement of Alignment -> Aligned -> SeqView, IndelMap + # should just do nucleic reverse. I think this is the next task. 
+ + spans: Tuple[Union[Span, _LostSpan, TerminalPadding]] = () + parent_length: int = 0 + locations: dataclasses.InitVar[Sequence[T]] = None + start: Optional[int] = dataclasses.field(init=False, default=0) + end: Optional[int] = dataclasses.field(init=False, default=0) + reverse: bool = dataclasses.field(init=False, default=False) + length: int = dataclasses.field(init=False, default=0) + _serialisable: dict = dataclasses.field(init=False, repr=False) + tidy: bool = True + termini_unknown: dataclasses.InitVar[bool] = dataclasses.field(default=False) + + def __post_init__(self, locations, termini_unknown): + if locations: + self.spans = _spans_from_locations( + locations, tidy=self.tidy, parent_length=self.parent_length + ) + + just_gaps = True + self.reverse = False + for span in self.spans: + self.length += len(span) + if span.lost: + continue + + if just_gaps: + self.start, self.end = span.start, span.end + self.reverse = span.reverse + just_gaps = False + else: + self.start = min(self.start, span.start) + self.end = max(self.end, span.end) + if self.reverse is not None and (span.reverse != self.reverse): + self.reverse = None + + if termini_unknown: + spans = list(self.spans) + if spans[0].lost: + spans[0] = TerminalPadding(spans[0].length) + if spans[-1].lost: + spans[-1] = TerminalPadding(spans[-1].length) + + self.spans = tuple(spans) + + def __getitem__(self, slice): + # A possible shorter map at the same level + new_map = as_map(slice, len(self), self.__class__) + new_parts = [] + for span in new_map.spans: + # we reset tidy start / end to false to avoid changes to + # default behaviour of tidy on IndelMap (which ignores these + # anyway) + old_tidy = span.tidy_start, span.tidy_end + span.tidy_start, span.tidy_end = False, False + new_parts.extend(span.remap_with(self)) + span.tidy_start, span.tidy_end = old_tidy + return self.__class__(spans=new_parts, parent_length=self.parent_length) + + def __len__(self): + return self.length + + def __add__(self, 
other): + if other.parent_length != self.parent_length: + raise ValueError("Those maps belong to different sequences") + return self.__class__( + spans=self.spans + other.spans, parent_length=self.parent_length + ) + + def __mul__(self, scale): + # For Protein -> DNA + new_parts = [] + for span in self.spans: + new_parts.append(span * scale) + return self.__class__(spans=new_parts, parent_length=self.parent_length * scale) + + def __repr__(self): + return repr(self.spans) + f"/{self.parent_length}" + + @property + def offsets(self): + return [0] + array([s.length for s in self.spans[:-1]]).cumsum().tolist() + + def gaps(self): + """The gaps (lost spans) in this map""" + locations = [] + offset = 0 + for s in self.spans: + if s.lost: + locations.append((offset, offset + s.length)) + offset += s.length + return self.__class__(locations=locations, parent_length=len(self)) + + def nongap(self): + """ungappeed segments in this map""" + locations = [] + offset = 0 + for s in self.spans: + if not s.lost: + locations.append((offset, offset + s.length)) + offset += s.length + return self.__class__(locations=locations, parent_length=len(self)) + + @property + def complete(self): + """whether any span represents a gap""" + return not any(span.lost for span in self.spans) + + @property + def useful(self): + return not all(span.lost for span in self.spans) + + def get_coordinates(self): + """returns span coordinates as [(v1, v2), ...] 
+ + v1/v2 are (start, end) unless the map is reversed, in which case it will + be (end, start) + """ + + order_func = (lambda x: (max(x), min(x))) if self.reverse else (lambda x: x) + return list( + map(order_func, [(s.start, s.end) for s in self.spans if not s.lost]) + ) + + def get_gap_coordinates(self): + """returns [(gap pos, gap length), ...]""" + gap_pos = [] + for i, span in enumerate(self.spans): + if not span.lost: + continue + + pos = self.spans[i - 1].end if i else 0 + gap_pos.append((pos, len(span))) + + return gap_pos + + def nucleic_reversed(self): + """Same location on reversed parent""" + spans = [s.reversed_relative_to(self.parent_length) for s in self.spans] + return self.__class__(spans=spans, parent_length=self.parent_length) + + def strict_nucleic_reversed(self): + """map for a sequence that has itself been reversed and complemented + + Notes + ----- + discards reverse attribute on both spans and self + """ + spans = [] + parent_length = self.parent_length + for s in self.spans: + if not s.lost: + start = parent_length - s.end + assert start >= 0 + end = start + s.length + s = Span(start=start, end=end) + spans.append(s) + + spans.reverse() + return self.__class__(spans=spans, parent_length=self.parent_length) + + def reversed(self): + """Reversed location on same parent""" + spans = [s.reversed() for s in self.spans] + spans.reverse() + return self.__class__(spans=spans, parent_length=self.parent_length) + + def to_rich_dict(self): + """returns dicts for contained spans [dict(), ..]""" + spans = [s.to_rich_dict() for s in self.spans] + data = copy.deepcopy(self._serialisable) + data.pop("locations") + data["spans"] = spans + data["type"] = get_object_provenance(self) + data["version"] = __version__ + return data + + @classmethod + def from_rich_dict(cls, map_element): + from cogent3.util.deserialise import _get_class + + map_element.pop("version", None) + type_ = map_element.pop("type") + assert _get_class(type_) == cls + spans = [] + for 
element in map_element["spans"]: + element.pop("version", None) + klass = _get_class(element.pop("type")) + instance = klass(**element) + spans.append(instance) + + map_element["spans"] = spans + return cls(**map_element) + + def with_termini_unknown(self): + """returns new instance with terminal gaps indicated as unknown""" + return self.__class__( + spans=self.spans[:], + parent_length=self.parent_length, + termini_unknown=True, + ) + + def inverse(self): + """returns instance with coordinates updated for aligned, unaligned""" + # is this only required for parse_out_gaps? + # NO also used in cogent3.align code + + # can't work if there are overlaps in the map + # tidy ends don't survive inversion + if self.parent_length is None: + raise ValueError("Uninvertable. parent length not known") + + cum_posn = 0 + temp = [] + for span in self.spans: + if not span.lost: + if span.reverse: + temp.append( + (span.start, span.end, cum_posn + span.length, cum_posn) + ) + else: + temp.append( + (span.start, span.end, cum_posn, cum_posn + span.length) + ) + cum_posn += span.length + + temp.sort() + new_spans = [] + last_start = 0 + for start, end, cum_start, cum_end in temp: + if start > last_start: + new_spans.append(LostSpan(start - last_start)) + elif start < last_start: + raise ValueError(f"Uninvertable. 
Overlap: {start} < {last_start}") + + # we force tidy_ to be same as self, attribute has no meaning + # for IndelMap, but retained for compatability for now + new_spans.append( + Span( + cum_start, + cum_end, + tidy_start=self.tidy, + tidy_end=self.tidy, + reverse=cum_start > cum_end, + ) + ) + last_start = end + + if self.parent_length > last_start: + new_spans.append(LostSpan(self.parent_length - last_start)) + + return self.__class__(spans=new_spans, parent_length=len(self)) + + T = Union[ndarray, int] + + def absolute_position(self, rel_pos: T) -> T: + """converts rel_pos into an absolute position + + Raises + ------ + raises ValueError if rel_pos < 0 + """ + check = array([rel_pos], dtype=int) if isinstance(rel_pos, int) else rel_pos + if check.min() < 0: + raise ValueError(f"must positive, not {rel_pos=}") + + if len(self) == self.parent_length: + # handle case of reversed here? + return rel_pos + + return self.start + rel_pos + + def relative_position(self, abs_pos: T) -> T: + """converts abs_pos into an relative position + + Raises + ------ + raises ValueError if abs_pos < 0 + """ + check = array([abs_pos], dtype=int) if isinstance(abs_pos, int) else abs_pos + if check.min() < 0: + raise ValueError(f"must positive, not {abs_pos=}") + return abs_pos - self.start + + def get_covering_span(self): + span = (self.end, self.start) if self.reverse else (self.start, self.end) + return self.__class__(locations=[span], parent_length=self.parent_length) + + def zeroed(self): + """returns a new instance with the first span starting at 0 + + Note + ---- + + Useful when an annotated sequence is sliced, but the connection to + the original parent is being deliberately broken as in the + Sequence.deepcopy(sliced=True) case. 
+ """ + # todo is this really required, best ifn we can rely on SeqView to + # store all relationship to underlying sequence + min_val = min(self.start, self.end) + spans = [] + for span in self.spans: + if span.lost: + spans.append(span) + continue + kwargs = span.to_rich_dict() + del kwargs["version"] + del kwargs["type"] + kwargs["start"] = kwargs["start"] - min_val + kwargs["end"] = kwargs["end"] - min_val + spans.append(Span(**kwargs)) + + kwargs = self.to_rich_dict() + del kwargs["version"] + del kwargs["type"] + kwargs["spans"] = spans + kwargs["parent_length"] = abs(self.start - self.end) + return self.__class__(**kwargs) + + def to_feature_map(self): + """returns a Map type, suited to Features""" + spans = [] + for span in self.spans: + if span.lost: + spans.append(span) + continue + kwargs = span.to_rich_dict() + del kwargs["version"] + del kwargs["type"] + spans.append(Span(**kwargs)) + + kwargs = self.to_rich_dict() + del kwargs["version"] + del kwargs["type"] + kwargs["spans"] = spans + return Map(**kwargs) + + def without_gaps(self): + # todo is this really required + # being used by Aligned.get_seq() + return self.__class__( + spans=[s for s in self.spans if not s.lost], + parent_length=self.parent_length, + ) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 6c940da11..2fa28fbe5 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -2,8 +2,17 @@ """ from unittest import TestCase +import pytest + from cogent3 import DNA -from cogent3.core.location import Map, Span, gap_coords_to_map +from cogent3.core.location import ( + IndelMap, + LostSpan, + Map, + Span, + TerminalPadding, + gap_coords_to_map, +) class SpanTests(TestCase): @@ -300,14 +309,15 @@ def test_gap_coords_to_map(self): got = gap_coords_to_map({20: 1}, len(seq)) -def test_map_plus_position(): +@pytest.mark.parametrize("cls", (IndelMap, Map)) +def test_map_plus_position(cls): # seq is 9 long # plus coords 012345678 # 
+slice ** # plus seq AAACCCTGG # orig = Aligned(*DNA.make_seq("AAACCCTGG", name="a").parse_out_gaps()) - orig = Map([(0, 9)], parent_length=9) + orig = cls(locations=[(0, 9)], parent_length=9) assert orig.absolute_position(2) == 2 assert orig.absolute_position(6) == 6 @@ -329,3 +339,232 @@ def test_map_plus_position(): rc = orig.nucleic_reversed() rcsubseq = rc[2:7] abs_coord = rcsubseq.absolute_position(0) + + +def test_indel_map_useful_complete(): + im = IndelMap(spans=[LostSpan(3)], parent_length=0) + assert not im.useful + assert not im.complete + assert len(im) == im.length == 3 + + +@pytest.mark.parametrize("cls", (Map, IndelMap)) +def test_map_nucleic_reversed(cls): + # seq is 9 long + # plus coords 012345678 + # +slice ** + # plus seq AAACCCTGG + + # orig = Aligned(*DNA.make_seq("AAACCCTGG", name="a").parse_out_gaps()) + orig = cls(locations=[(0, 9)], parent_length=9) + # minus coords 012345678 + # rel coord 01234 + # -slice ***** + # minus seq CCAGGGTTT + # plus coords 876543210 + rc = orig.nucleic_reversed() + coords = rc.get_coordinates() + assert coords == [(9, 0)] + assert rc.reverse + assert (rc.start, rc.end) == (0, 9) + + +@pytest.mark.parametrize("cls", (Map, IndelMap)) +def test_coordinate(cls): + # coordinates are for ungapped segments in underlying sequence + # 01 2 345 + seq = DNA.make_seq("AC---G-TAA--") + m, s = seq.parse_out_gaps() + m = cls(spans=m.spans, parent_length=m.parent_length) + got = m.get_coordinates() + assert got == [(0, 2), (2, 3), (3, 6)] + + +@pytest.mark.parametrize("cls", (Map, IndelMap)) +def test_gap_coordinate(cls): + seq = DNA.make_seq("AC---G-TAA--") + m, s = seq.parse_out_gaps() + m = cls(spans=m.spans, parent_length=m.parent_length) + got = m.get_gap_coordinates() + assert got == [(2, 3), (3, 1), (6, 2)] + + +@pytest.mark.parametrize("cls", (Map, IndelMap)) +def test_gaps(cls): + # returns spans corresponding to position on "aligned" seq of gaps + # 000000000011 + # 012345678901 + seq = DNA.make_seq("AC---G-TAA--") 
+ m, s = seq.parse_out_gaps() + m = cls(spans=m.spans, parent_length=m.parent_length) + got = [(g.start, g.end) for g in m.gaps().spans] + assert got == [(2, 5), (6, 7), (10, 12)] + + +@pytest.mark.parametrize("cls", (IndelMap, Map)) +def test_nongap(cls): + # returns spans corresponding to position on "aligned" seq of nongaps + # 000000000011 + # 012345678901 + seq = DNA.make_seq("AC---G-TAA--") + m, s = seq.parse_out_gaps() + m = cls(spans=m.spans, parent_length=m.parent_length) + + got = [(g.start, g.end) for g in m.nongap().spans] + assert got == [(0, 2), (5, 6), (7, 10)] + + +@pytest.mark.parametrize("cls", (Map, IndelMap)) +def test_reversed(cls): + seq = DNA.make_seq("AC---G-TAA--") + m, s = seq.parse_out_gaps() + m = cls(spans=m.spans, parent_length=m.parent_length) + # reversed() reverses the order of spans, but keeps their coordinates + # differs from nucleic reversed, which computes a new relative position + rev = m.reversed() + got = [s.length if s.lost else (s.start, s.end) for s in rev.spans] + expect = [s.length if s.lost else (s.start, s.end) for s in m.spans] + expect.reverse() + assert got == expect + + +def test_round_trip_rich_dict(): + seq = DNA.make_seq("AC---G-TAA--") + m, s = seq.parse_out_gaps() + # reversed() reverses the order of spans, but keeps their coordinates + # differs from nucleic reversed, which computes a new relative position + im = IndelMap(spans=m.spans, parent_length=m.parent_length) + got = IndelMap.from_rich_dict(im.to_rich_dict()) + assert im is not got + assert got.to_rich_dict() == im.to_rich_dict() + + +def test_serialisable_attr(): + im = IndelMap(locations=[(0, 2)], parent_length=20) + set_vals = {"locations": [(0, 2)], "parent_length": 20} + got = {k: im._serialisable[k] for k in set_vals} + assert got == set_vals + + +@pytest.mark.parametrize("cls", (Map, IndelMap)) +def test_terminal_unknown(cls): + # span idx 01 2 345 6 + seq = DNA.make_seq("-AC---G-TAA--") + m, s = seq.parse_out_gaps() + # not unknown, by 
default + assert m.spans[0].lost and not isinstance(m.spans[0], TerminalPadding) + # use the constructor arg + m = cls(spans=m.spans, parent_length=m.parent_length, termini_unknown=True) + assert isinstance(m.spans[0], TerminalPadding) + assert isinstance(m.spans[-1], TerminalPadding) + assert m.spans[2].lost and not isinstance(m.spans[1], TerminalPadding) + assert m.spans[4].lost and not isinstance(m.spans[2], TerminalPadding) + + # use the method + m, s = seq.parse_out_gaps() + m = cls(spans=m.spans, parent_length=m.parent_length).with_termini_unknown() + assert isinstance(m.spans[0], TerminalPadding) + assert isinstance(m.spans[-1], TerminalPadding) + # middle gap is not terminal, so... + assert not isinstance(m.spans[2], TerminalPadding) + + # no gaps, no effect + seq = DNA.make_seq("ACGTAA") + m, s = seq.parse_out_gaps() + # use the constructor arg + m = cls(spans=m.spans, parent_length=m.parent_length, termini_unknown=True) + assert not isinstance(m.spans[0], TerminalPadding) + # use the method + m = cls(spans=m.spans, parent_length=m.parent_length).with_termini_unknown() + assert not isinstance(m.spans[0], TerminalPadding) + + +@pytest.mark.parametrize("cls", (Map, IndelMap)) +def test_map_inverse(cls): + m = cls(locations=[(0, 2), (4, 6)], parent_length=6) + assert len(m) == 4 + mi = m.inverse() + assert len(mi) == 6 + assert mi.spans[1].lost and len(mi.spans[1]) == 2 + # invert the inversion, should give us back the original + re_inv = mi.inverse() + expect = m.to_rich_dict() + got = re_inv.to_rich_dict() + assert got == expect + + +@pytest.mark.parametrize("cls", (Map, IndelMap)) +def test_map_offsets(cls): + # offsets are absolute starts of spans + # 1 + # 01 3 678 1 + seq = DNA.make_seq("-AC---G-TAA--") + m, s = seq.parse_out_gaps() + got = m.offsets + assert got == [0, 1, 3, 6, 7, 8, 11] + + +@pytest.mark.parametrize("cls", (Map, IndelMap)) +def test_map_indexed(cls): + m = cls(locations=[(0, 2), (4, 6)], parent_length=6).inverse() + indexed = m[2] + 
assert len(indexed) == 1 + + +def test_compare_map_indexed(): + from cogent3.core.alignment import Aligned + + seq = DNA.make_seq("--AC-GTAA--".replace("-", "")) + spans = [LostSpan(2), Span(0, 2), LostSpan(2), Span(2, 6), LostSpan(2)] + kwargs = dict(spans=spans, parent_length=len(seq)) + mm = Map(**kwargs) + im = IndelMap(**kwargs) + ma = Aligned(mm, seq) + ia = Aligned(im, seq) + first = ia[0] + assert first == "-" + length = len(ma) + got = [str(ia[i]) for i in range(length)] + expect = [str(ma[i]) for i in range(length)] + assert got == expect + + +@pytest.mark.parametrize("slice_it", (True, False)) +def test_indel_map_zeroed(slice_it): + spans = [LostSpan(2), Span(2, 4), LostSpan(2), Span(4, 8), LostSpan(2)] + kwargs = dict(spans=spans, parent_length=6) + + mm = Map(**kwargs) + if slice_it: + mm = mm[:6] + mm_zeroed = mm.zeroed() + + im = IndelMap(**kwargs) + if slice_it: + im = im[:6] + + im_zeroed = im.zeroed() + assert im_zeroed.get_coordinates() == mm_zeroed.get_coordinates() + assert im_zeroed.parent_length == mm_zeroed.parent_length + + +def test_indelmap_to_feature_map(): + spans = [LostSpan(2), Span(2, 4), LostSpan(2), Span(4, 8), LostSpan(2)] + kwargs = dict(spans=spans, parent_length=6) + im = IndelMap(**kwargs) + mm = im.to_feature_map() + assert mm.get_coordinates() == im.get_coordinates() + + +def test_indelmap_strict_nucleic_reversed(): + spans = [LostSpan(2), Span(2, 4), LostSpan(2), Span(4, 8), LostSpan(2)] + kwargs = dict(spans=spans, parent_length=12) + orig = IndelMap(**kwargs) + rev = orig.strict_nucleic_reversed() + assert rev.spans[1].reverse == rev.spans[3].reverse == False + assert not rev.reverse + old = orig.nucleic_reversed() + assert old.spans[1].reverse == old.spans[3].reverse == True + assert rev.get_coordinates() == [ + tuple(sorted(a)) for a in reversed(old.get_coordinates()) + ] From 02858af656faedb354cd8fa5a097f8eefadd0c21 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 20 Feb 2024 16:49:08 +1100 Subject: [PATCH 
02/62] MAINT: replace Map with IndelMap for seqs and Aligned --- src/cogent3/align/traceback.py | 9 ++++++--- src/cogent3/core/alignment.py | 15 +++++++++------ src/cogent3/core/annotation.py | 5 +++++ src/cogent3/core/location.py | 5 +---- src/cogent3/core/sequence.py | 6 +++--- 5 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/cogent3/align/traceback.py b/src/cogent3/align/traceback.py index 626d30c09..16a8d0ceb 100644 --- a/src/cogent3/align/traceback.py +++ b/src/cogent3/align/traceback.py @@ -3,7 +3,7 @@ gapped sequences or Cogent Alignment objects""" from cogent3.core.alignment import Aligned, Alignment -from cogent3.core.annotation import Map +from cogent3.core.location import IndelMap def seq_traceback(s1, s2, aligned_positions, gap_value): @@ -56,10 +56,13 @@ def gap_traceback(aligned_positions): def map_traceback(aligned_positions): - # using Map's to keep track of gaps for indel alignment + # using IndelMap's to keep track of gaps for indel alignment (starts, ends, gap_vectors, alignment_len) = gap_traceback(aligned_positions) # print 'gv', gap_vectors - maps = [Map(gv, parent_length=alignment_len).inverse() for gv in gap_vectors] + maps = [ + IndelMap(locations=gv, parent_length=alignment_len).inverse() + for gv in gap_vectors + ] return (starts, ends, maps) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index 77a317b1a..f3501095d 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -56,7 +56,7 @@ import cogent3 # will use to get at cogent3.parse.fasta.MinimalFastaParser, from cogent3._version import __version__ -from cogent3.core.annotation import Feature, Map +from cogent3.core.annotation import Feature from cogent3.core.annotation_db import ( BasicAnnotationDb, FeatureDataType, @@ -66,6 +66,7 @@ ) from cogent3.core.genetic_code import get_code from cogent3.core.info import Info as InfoClass +from cogent3.core.location import IndelMap, Map from cogent3.core.profile import PSSM, 
MotifCountsArray from cogent3.core.sequence import ArraySequence, Sequence, frac_same # which is a circular import otherwise. @@ -2145,7 +2146,7 @@ def __init__(self, map, data, length=None): # Unlike the normal map constructor, here we take a list of pairs of # alignment coordinates, NOT a list of pairs of sequence coordinates if isinstance(map, list): - map = Map(map, parent_length=length).inverse() + map = IndelMap(locations=map, parent_length=length).inverse() self.map = map self.data = data if hasattr(data, "info"): @@ -2325,7 +2326,9 @@ def remapped_to(self, map): def make_feature(self, feature: FeatureDataType, alignment: "Alignment") -> Feature: """returns a feature, not written into annotation_db""" annot = self.data.make_feature(feature) - return annot.remapped_to(alignment, self.map.inverse()) + inverted = self.map.inverse() + # todo should indicate whether tidy or not + return annot.remapped_to(alignment, inverted) def gap_vector(self): """Returns gap_vector of GappedSeq, for omit_gap_pos.""" @@ -2826,7 +2829,7 @@ def sample( positions = [ (loc * motif_length, (loc + 1) * motif_length) for loc in locations ] - sample = Map(positions, parent_length=len(self)) + sample = IndelMap(locations=positions, parent_length=len(self)) return self.gapped_by_map(sample, info=self.info) def sliding_windows(self, window, step, start=None, end=None): @@ -4659,7 +4662,7 @@ def __getitem__(self, index): raise ValueError(f"feature.parent {index.seqid!r} is not self") return index.get_slice() - if isinstance(index, Map): + if isinstance(index, (Map, IndelMap)): new = self._mapped(index) elif isinstance(index, (int, slice)): @@ -4833,7 +4836,7 @@ def filtered(self, predicate, motif_length=1, drop_remainder=True, **kwargs): locations = [(gv[i], gv[i + 1]) for i in range(0, len(gv), 2)] - keep = Map(locations, parent_length=len(self)) + keep = IndelMap(locations=locations, parent_length=len(self)) return self.gapped_by_map(keep, info=self.info) def get_seq(self, seqname): diff 
--git a/src/cogent3/core/annotation.py b/src/cogent3/core/annotation.py index 5f8a4270b..e61d8af59 100644 --- a/src/cogent3/core/annotation.py +++ b/src/cogent3/core/annotation.py @@ -147,6 +147,11 @@ def __repr__(self): return f"{name}({txt})" def remapped_to(self, grandparent, gmap): + if not isinstance(gmap, Map): + # todo possibly create method on IndelMap to produce the FeatureMap? + # due to separation of IndelMap and Map, change class + gmap = Map(spans=gmap.spans, parent_length=gmap.parent_length) + seqid = grandparent.name or f"from {self.seqid!r}" kwargs = { **self._serialisable, diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 7bb56c3e6..bd53b4b39 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -78,11 +78,8 @@ def as_map(slice, length, cls): for i in slice: spans.extend(as_map(i, length).spans, cls) return cls(spans=spans, parent_length=length) - elif isinstance(slice, cls): + elif isinstance(slice, (Map, IndelMap)): return slice - # TODO reasons for failure when the following is not commented out - # should be checked further - # assert map.parent_length == length, (map, length) else: lo, hi, step = _norm_slice(slice, length) assert (step or 1) == 1 diff --git a/src/cogent3/core/sequence.py b/src/cogent3/core/sequence.py index 14279b9dd..4d504dedb 100644 --- a/src/cogent3/core/sequence.py +++ b/src/cogent3/core/sequence.py @@ -56,7 +56,7 @@ ) from cogent3.core.genetic_code import get_code from cogent3.core.info import Info as InfoClass -from cogent3.core.location import LostSpan, Map +from cogent3.core.location import IndelMap, LostSpan, Map from cogent3.format.fasta import alignment_to_fasta from cogent3.maths.stats.contingency import CategoryCounts from cogent3.maths.stats.number import CategoryCounter @@ -1344,7 +1344,7 @@ def __getitem__(self, index): if hasattr(index, "map"): index = index.map - if isinstance(index, Map): + if isinstance(index, (Map, IndelMap)): new = self._mapped(index) 
preserve_offset = not index.reverse @@ -1484,7 +1484,7 @@ def parse_out_gaps(self): for match in nongap.finditer(str(self)): segments.append(match.span()) gapless.append(match.group()) - map = Map(segments, parent_length=len(self)).inverse() + map = IndelMap(locations=segments, parent_length=len(self)).inverse() seq = self.__class__( "".join(gapless), name=self.get_name(), info=self.info, preserve_case=True ) From a1abae2133b59ce7c16470bfbf8998d7255e3638 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 20 Feb 2024 16:49:48 +1100 Subject: [PATCH 03/62] MAINT: use seqview to record reverse strand --- src/cogent3/core/alignment.py | 21 +++++++++++++-------- tests/test_core/test_alignment.py | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index f3501095d..a0741a5d4 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -637,11 +637,12 @@ def deepcopy(self, sliced: bool = True): annotations. """ if isinstance(self, Alignment): - reversed = self.seqs[0].map.reverse + *_, strand = self.seqs[0].data.parent_coordinates() + else: - # WARNING GAH this is tightly coupling to implementation of seq attribute! - # todo GAH make method on Sequence? 
- reversed = self.seqs[0]._seq.reverse + *_, strand = self.seqs[0].parent_coordinates() + + reversed = strand == "-" new_seqs = dict() db = None if reversed and sliced else deepcopy(self.annotation_db) for seq in self.seqs: @@ -2190,7 +2191,8 @@ def deepcopy(self, sliced=True, exclude_annotations=False): new_seq = self.data.copy(exclude_annotations=exclude_annotations, sliced=sliced) if sliced: db = new_seq.annotation_db - if self.map.reverse or exclude_annotations: + *_, strand = self.data.parent_coordinates() + if strand == "-" or exclude_annotations: new_seq.annotation_db = None else: new_seq.annotation_offset = self.map.start @@ -2249,13 +2251,16 @@ def __add__(self, other): return Aligned(map, seq) def __getitem__(self, slice): + # todo we need to get the sequence coordinates that slice corresponds to + # so we can update the self.data, plus we will need to zero new_map new_map = self.map[slice] data = ( - self.data[new_map.start : new_map.end] if new_map.useful else self.data[0:0] + self.data[new_map.start : new_map.end] if new_map.useful else self.data[:0] ) if new_map.reverse: # A reverse slice means we should have an empty sequence - new_map = type(new_map)(locations=(), parent_length=len(self.data)) + # todo this clause will be removed when a negative step is allowed + new_map = new_map.__class__(locations=(), parent_length=len(self.data)) elif new_map.useful: new_map = new_map.zeroed() return Aligned(new_map, data) @@ -2298,7 +2303,7 @@ def from_rich_dict(cls, data: dict): deserialise_seq, ) - map_ = deserialise_map_spans(data["map_init"]) + map_ = IndelMap.from_rich_dict(data["map_init"]) seq = deserialise_seq(data["seq_init"]) return cls(map_, seq) diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index 5dc949871..a7dd7b512 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -2827,7 +2827,7 @@ def test_aligned_rich_dict(reverse): rd = seq.to_rich_dict() got = 
Aligned.from_rich_dict(rd) - assert str(seq) == str(got) + assert str(got) == str(seq) @pytest.mark.parametrize("cls", (SequenceCollection, Alignment, ArrayAlignment)) From de3e299a530ba7fc833cb1f5243a911a46fdbd49 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 20 Feb 2024 16:50:38 +1100 Subject: [PATCH 04/62] API: removed IndelMap reversed --- src/cogent3/core/alignment.py | 9 ++-- src/cogent3/core/location.py | 77 +++++++++---------------------- tests/test_core/test_alignment.py | 7 --- tests/test_core/test_location.py | 28 +++++++---- 4 files changed, 45 insertions(+), 76 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index a0741a5d4..8e731f235 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -2257,9 +2257,9 @@ def __getitem__(self, slice): data = ( self.data[new_map.start : new_map.end] if new_map.useful else self.data[:0] ) - if new_map.reverse: - # A reverse slice means we should have an empty sequence - # todo this clause will be removed when a negative step is allowed + if new_map.useful and new_map.start > new_map.end: + # For now, a reverse slice means we should have an empty sequence + # todo modify this clause if a negative step is ever allowed new_map = new_map.__class__(locations=(), parent_length=len(self.data)) elif new_map.useful: new_map = new_map.zeroed() @@ -4849,8 +4849,7 @@ def get_seq(self, seqname): Note: always returns Sequence object, not ArraySequence. """ - seq = self.named_seqs[seqname] - return seq.data[seq.map.without_gaps()] + return self.named_seqs[seqname].data def get_gapped_seq(self, seq_name, recode_gaps=False): """Return a gapped Sequence object for the specified seqname. diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index bd53b4b39..28ddf4c83 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1176,10 +1176,6 @@ def get_coordinates(self): def nucleic_reversed(self): ... 
- @abstractmethod - def reversed(self): - ... - @abstractmethod def to_rich_dict(self): ... @@ -1189,9 +1185,7 @@ def inverse(self): ... -def _spans_from_locations( - locations, tidy, parent_length -) -> Tuple[Union[Span, _LostSpan]]: +def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSpan]]: spans = [] for start, end in locations: reverse = start > end @@ -1205,7 +1199,7 @@ def _spans_from_locations( start, end = (0, parent_length) if start < end else (parent_length, 0) spans += [ LostSpan(abs(l_diff)), - Span(start, end, tidy, tidy, reverse=reverse), + Span(start, end, reverse=reverse), LostSpan(abs(r_diff)), ] elif min(start, end) < 0: @@ -1214,18 +1208,18 @@ def _spans_from_locations( end = max(end, 0) spans += [ LostSpan(abs(diff)), - Span(start, end, tidy, tidy, reverse=reverse), + Span(start, end, reverse=reverse), ] elif max(start, end) > parent_length: diff = max(start, end) - parent_length start = min(start, parent_length) end = min(end, parent_length) spans += [ - Span(start, end, tidy, tidy, reverse=reverse), + Span(start, end, reverse=reverse), LostSpan(abs(diff)), ] else: - spans += [Span(start, end, tidy, tidy, reverse=reverse)] + spans += [Span(start, end, reverse=reverse)] return tuple(spans) @@ -1250,34 +1244,33 @@ class IndelMap(MapABC): locations: dataclasses.InitVar[Sequence[T]] = None start: Optional[int] = dataclasses.field(init=False, default=0) end: Optional[int] = dataclasses.field(init=False, default=0) - reverse: bool = dataclasses.field(init=False, default=False) length: int = dataclasses.field(init=False, default=0) - _serialisable: dict = dataclasses.field(init=False, repr=False) - tidy: bool = True termini_unknown: dataclasses.InitVar[bool] = dataclasses.field(default=False) + _serialisable: dict = dataclasses.field(init=False, repr=False) def __post_init__(self, locations, termini_unknown): if locations: self.spans = _spans_from_locations( - locations, tidy=self.tidy, parent_length=self.parent_length + 
locations, parent_length=self.parent_length ) - just_gaps = True - self.reverse = False - for span in self.spans: + last_not_lost = None + start, end = None, None + for i, span in enumerate(self.spans): self.length += len(span) if span.lost: continue + elif start is None: + # this ugly logic because we're using spans! + start = span.end if span.reverse else span.start - if just_gaps: - self.start, self.end = span.start, span.end - self.reverse = span.reverse - just_gaps = False - else: - self.start = min(self.start, span.start) - self.end = max(self.end, span.end) - if self.reverse is not None and (span.reverse != self.reverse): - self.reverse = None + last_not_lost = i + + if last_not_lost is not None: + span = self.spans[last_not_lost] + end = span.start if span.reverse else span.end + + self.start, self.end = start, end if termini_unknown: spans = list(self.spans) @@ -1356,16 +1349,8 @@ def useful(self): return not all(span.lost for span in self.spans) def get_coordinates(self): - """returns span coordinates as [(v1, v2), ...] 
- - v1/v2 are (start, end) unless the map is reversed, in which case it will - be (end, start) - """ - - order_func = (lambda x: (max(x), min(x))) if self.reverse else (lambda x: x) - return list( - map(order_func, [(s.start, s.end) for s in self.spans if not s.lost]) - ) + """returns span coordinates as [(start, end), ...]""" + return [(s.start, s.end) for s in self.spans if not s.lost] def get_gap_coordinates(self): """returns [(gap pos, gap length), ...]""" @@ -1404,12 +1389,6 @@ def strict_nucleic_reversed(self): spans.reverse() return self.__class__(spans=spans, parent_length=self.parent_length) - def reversed(self): - """Reversed location on same parent""" - spans = [s.reversed() for s in self.spans] - spans.reverse() - return self.__class__(spans=spans, parent_length=self.parent_length) - def to_rich_dict(self): """returns dicts for contained spans [dict(), ..]""" spans = [s.to_rich_dict() for s in self.spans] @@ -1484,8 +1463,6 @@ def inverse(self): Span( cum_start, cum_end, - tidy_start=self.tidy, - tidy_end=self.tidy, reverse=cum_start > cum_end, ) ) @@ -1528,7 +1505,7 @@ def relative_position(self, abs_pos: T) -> T: return abs_pos - self.start def get_covering_span(self): - span = (self.end, self.start) if self.reverse else (self.start, self.end) + span = (self.end, self.start) return self.__class__(locations=[span], parent_length=self.parent_length) def zeroed(self): @@ -1580,11 +1557,3 @@ def to_feature_map(self): del kwargs["type"] kwargs["spans"] = spans return Map(**kwargs) - - def without_gaps(self): - # todo is this really required - # being used by Aligned.get_seq() - return self.__class__( - spans=[s for s in self.spans if not s.lost], - parent_length=self.parent_length, - ) diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index a7dd7b512..dd2be94b8 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -3498,9 +3498,6 @@ def test_sliced_deepcopy(data, name, rev): if rev: aln = 
aln.rc() - # the map just mirrors the slice effect, it no longer has "memory" of rev complement - assert aln.named_seqs[name].map.reverse == False - notsliced = aln.deepcopy(sliced=False) # the annotation offsets should match original object assert {s.data.annotation_offset for s in notsliced.seqs} == { @@ -3525,10 +3522,6 @@ def test_sliced_deepcopy(data, name, rev): sliced.named_seqs[name].data._seq.seq is not orig.named_seqs[name].data._seq.seq ) - # the map just mirrors the slice effect, it has no "memory" of rev complement - assert notsliced.named_seqs[name].map.reverse == False - assert sliced.named_seqs[name].map.reverse == False - assert sliced.named_seqs[name].map.parent_length == len( str(sliced_seq).replace("-", "") ) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 2fa28fbe5..2129d54eb 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -348,8 +348,8 @@ def test_indel_map_useful_complete(): assert len(im) == im.length == 3 -@pytest.mark.parametrize("cls", (Map, IndelMap)) -def test_map_nucleic_reversed(cls): +@pytest.mark.parametrize("cls,expect", ((Map, [(9, 0)]), (IndelMap, [(0, 9)]))) +def test_map_nucleic_reversed(cls, expect): # seq is 9 long # plus coords 012345678 # +slice ** @@ -364,9 +364,7 @@ def test_map_nucleic_reversed(cls): # plus coords 876543210 rc = orig.nucleic_reversed() coords = rc.get_coordinates() - assert coords == [(9, 0)] - assert rc.reverse - assert (rc.start, rc.end) == (0, 9) + assert coords == expect @pytest.mark.parametrize("cls", (Map, IndelMap)) @@ -414,11 +412,10 @@ def test_nongap(cls): assert got == [(0, 2), (5, 6), (7, 10)] -@pytest.mark.parametrize("cls", (Map, IndelMap)) -def test_reversed(cls): +def test_reversed(): seq = DNA.make_seq("AC---G-TAA--") - m, s = seq.parse_out_gaps() - m = cls(spans=m.spans, parent_length=m.parent_length) + m, _ = seq.parse_out_gaps() + m = Map(spans=m.spans, parent_length=m.parent_length) # reversed() reverses 
the order of spans, but keeps their coordinates # differs from nucleic reversed, which computes a new relative position rev = m.reversed() @@ -562,9 +559,20 @@ def test_indelmap_strict_nucleic_reversed(): orig = IndelMap(**kwargs) rev = orig.strict_nucleic_reversed() assert rev.spans[1].reverse == rev.spans[3].reverse == False - assert not rev.reverse old = orig.nucleic_reversed() assert old.spans[1].reverse == old.spans[3].reverse == True assert rev.get_coordinates() == [ tuple(sorted(a)) for a in reversed(old.get_coordinates()) ] + + +def test_indelmap_with_reverse_span(): + spans = [ + LostSpan(2), + Span(8, 4, reverse=True), + LostSpan(2), + Span(4, 2, reverse=True), + LostSpan(2), + ] + imap = IndelMap(spans=spans, parent_length=12) + assert (imap.start, imap.end) == (8, 2) From 872c4bcd82fb4d2d879dcf8cd1fac08b328a8ec6 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 20 Feb 2024 16:56:30 +1100 Subject: [PATCH 05/62] API: IndelMap.spans is a generator --- src/cogent3/core/alignment.py | 4 +- src/cogent3/core/annotation.py | 8 +-- src/cogent3/core/location.py | 91 ++++++++++++++++++++------------ src/cogent3/core/sequence.py | 2 +- src/cogent3/parse/cigar.py | 10 ++-- src/cogent3/parse/gbseq.py | 6 +-- tests/test_core/test_location.py | 73 +++++++++++++++---------- 7 files changed, 116 insertions(+), 78 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index 8e731f235..b336baa99 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -2244,10 +2244,10 @@ def __len__(self): def __add__(self, other): if self.data is other.data: - (map, seq) = (self.map + other.map, self.data) + map, seq = self.map + other.map, self.data else: seq = self.get_gapped_seq() + other.get_gapped_seq() - (map, seq) = seq.parse_out_gaps() + map, seq = seq.parse_out_gaps() return Aligned(map, seq) def __getitem__(self, slice): diff --git a/src/cogent3/core/annotation.py b/src/cogent3/core/annotation.py index 
e61d8af59..6964faadc 100644 --- a/src/cogent3/core/annotation.py +++ b/src/cogent3/core/annotation.py @@ -147,10 +147,10 @@ def __repr__(self): return f"{name}({txt})" def remapped_to(self, grandparent, gmap): + # grandparent can be either a Sequence or an Alignment if not isinstance(gmap, Map): - # todo possibly create method on IndelMap to produce the FeatureMap? # due to separation of IndelMap and Map, change class - gmap = Map(spans=gmap.spans, parent_length=gmap.parent_length) + gmap = gmap.to_feature_map() seqid = grandparent.name or f"from {self.seqid!r}" kwargs = { @@ -218,13 +218,13 @@ def union(self, features: Iterable): ----- Overlapping spans are merged """ - combined = self.map.spans[:] + combined = list(self.map.spans) feat_names = [self.name] if self.name else set() biotypes = {self.biotype} if self.biotype else set() seqids = {self.seqid} if self.seqid else set() for feature in features: if feature.parent is not self.parent: - raise ValueError(f"cannot merge annotations from different objects") + raise ValueError("cannot merge annotations from different objects") combined.extend(feature.map.spans) if feature.name: diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 28ddf4c83..87fd3dd9b 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -27,7 +27,7 @@ from bisect import bisect_left, bisect_right from functools import total_ordering from itertools import chain -from typing import List, Optional, Sequence, Tuple, Union +from typing import Iterator, List, Optional, Sequence, Tuple, Union from numpy import array, ndarray @@ -604,12 +604,13 @@ def __init__( self.reverse = None if termini_unknown: + spans = list(spans) if spans[0].lost: spans[0] = TerminalPadding(spans[0].length) if spans[-1].lost: spans[-1] = TerminalPadding(spans[-1].length) - self.spans = spans + self.spans = tuple(spans) self.length = posn self.parent_length = parent_length self.__inverse = None @@ -618,7 +619,7 @@ def 
__len__(self): return self.length def __repr__(self): - return repr(self.spans) + f"/{self.parent_length}" + return repr(list(self.spans)) + f"/{self.parent_length}" def __getitem__(self, slice): # A possible shorter map at the same level @@ -1239,24 +1240,32 @@ class IndelMap(MapABC): # TODO reverse complement of Alignment -> Aligned -> SeqView, IndelMap # should just do nucleic reverse. I think this is the next task. - spans: Tuple[Union[Span, _LostSpan, TerminalPadding]] = () + spans: dataclasses.InitVar[Optional[tuple]] = () + locations: dataclasses.InitVar[Optional[Sequence[T]]] = None + termini_unknown: dataclasses.InitVar[bool] = dataclasses.field(default=False) parent_length: int = 0 - locations: dataclasses.InitVar[Sequence[T]] = None start: Optional[int] = dataclasses.field(init=False, default=0) end: Optional[int] = dataclasses.field(init=False, default=0) length: int = dataclasses.field(init=False, default=0) - termini_unknown: dataclasses.InitVar[bool] = dataclasses.field(default=False) _serialisable: dict = dataclasses.field(init=False, repr=False) + _spans: Tuple[Union[Span, _LostSpan, TerminalPadding]] = dataclasses.field( + default=(), init=False + ) - def __post_init__(self, locations, termini_unknown): + def __post_init__(self, spans, locations, termini_unknown): if locations: - self.spans = _spans_from_locations( - locations, parent_length=self.parent_length - ) - + spans = _spans_from_locations(locations, parent_length=self.parent_length) + elif isinstance(spans, property): + # This clause is due a known issue with dataclasses. 
+ # As we have a spans property, the default spans value is + # ignored, so we have to check for its value being a property + # and then set the default value here + spans = () + + spans = tuple(spans) last_not_lost = None start, end = None, None - for i, span in enumerate(self.spans): + for i, span in enumerate(spans): self.length += len(span) if span.lost: continue @@ -1267,19 +1276,25 @@ def __post_init__(self, locations, termini_unknown): last_not_lost = i if last_not_lost is not None: - span = self.spans[last_not_lost] + span = spans[last_not_lost] end = span.start if span.reverse else span.end + if start is None: + start = 0 + + if end is None: + end = self.parent_length + self.start, self.end = start, end if termini_unknown: - spans = list(self.spans) + spans = list(spans) if spans[0].lost: spans[0] = TerminalPadding(spans[0].length) if spans[-1].lost: spans[-1] = TerminalPadding(spans[-1].length) - self.spans = tuple(spans) + self._spans = tuple(spans) def __getitem__(self, slice): # A possible shorter map at the same level @@ -1302,28 +1317,28 @@ def __add__(self, other): if other.parent_length != self.parent_length: raise ValueError("Those maps belong to different sequences") return self.__class__( - spans=self.spans + other.spans, parent_length=self.parent_length + spans=self._spans + tuple(other.spans), parent_length=self.parent_length ) def __mul__(self, scale): # For Protein -> DNA new_parts = [] - for span in self.spans: + for span in self._spans: new_parts.append(span * scale) return self.__class__(spans=new_parts, parent_length=self.parent_length * scale) def __repr__(self): - return repr(self.spans) + f"/{self.parent_length}" + return repr(self._spans) + f"/{self.parent_length}" @property def offsets(self): - return [0] + array([s.length for s in self.spans[:-1]]).cumsum().tolist() + return [0] + array([s.length for s in self._spans[:-1]]).cumsum().tolist() def gaps(self): """The gaps (lost spans) in this map""" locations = [] offset = 0 - for s 
in self.spans: + for s in self._spans: if s.lost: locations.append((offset, offset + s.length)) offset += s.length @@ -1333,40 +1348,45 @@ def nongap(self): """ungappeed segments in this map""" locations = [] offset = 0 - for s in self.spans: + for s in self._spans: if not s.lost: locations.append((offset, offset + s.length)) offset += s.length return self.__class__(locations=locations, parent_length=len(self)) + @property + def spans(self) -> Iterator[Span]: + """generator of spans""" + yield from self._spans + @property def complete(self): """whether any span represents a gap""" - return not any(span.lost for span in self.spans) + return not any(span.lost for span in self._spans) @property def useful(self): - return not all(span.lost for span in self.spans) + return not all(span.lost for span in self._spans) def get_coordinates(self): """returns span coordinates as [(start, end), ...]""" - return [(s.start, s.end) for s in self.spans if not s.lost] + return [(s.start, s.end) for s in self._spans if not s.lost] def get_gap_coordinates(self): """returns [(gap pos, gap length), ...]""" gap_pos = [] - for i, span in enumerate(self.spans): + for i, span in enumerate(self._spans): if not span.lost: continue - pos = self.spans[i - 1].end if i else 0 + pos = self._spans[i - 1].end if i else 0 gap_pos.append((pos, len(span))) return gap_pos def nucleic_reversed(self): """Same location on reversed parent""" - spans = [s.reversed_relative_to(self.parent_length) for s in self.spans] + spans = [s.reversed_relative_to(self.parent_length) for s in self._spans] return self.__class__(spans=spans, parent_length=self.parent_length) def strict_nucleic_reversed(self): @@ -1378,7 +1398,7 @@ def strict_nucleic_reversed(self): """ spans = [] parent_length = self.parent_length - for s in self.spans: + for s in self._spans: if not s.lost: start = parent_length - s.end assert start >= 0 @@ -1391,8 +1411,11 @@ def strict_nucleic_reversed(self): def to_rich_dict(self): """returns dicts for 
contained spans [dict(), ..]""" - spans = [s.to_rich_dict() for s in self.spans] - data = copy.deepcopy(self._serialisable) + spans = [s.to_rich_dict() for s in self._spans] + # exclude spans from deep copy since being overwritten + data = copy.deepcopy( + {k: v for k, v in self._serialisable.items() if k != "spans"} + ) data.pop("locations") data["spans"] = spans data["type"] = get_object_provenance(self) @@ -1419,7 +1442,7 @@ def from_rich_dict(cls, map_element): def with_termini_unknown(self): """returns new instance with terminal gaps indicated as unknown""" return self.__class__( - spans=self.spans[:], + spans=self._spans[:], parent_length=self.parent_length, termini_unknown=True, ) @@ -1436,7 +1459,7 @@ def inverse(self): cum_posn = 0 temp = [] - for span in self.spans: + for span in self._spans: if not span.lost: if span.reverse: temp.append( @@ -1522,7 +1545,7 @@ def zeroed(self): # store all relationship to underlying sequence min_val = min(self.start, self.end) spans = [] - for span in self.spans: + for span in self._spans: if span.lost: spans.append(span) continue @@ -1543,7 +1566,7 @@ def zeroed(self): def to_feature_map(self): """returns a Map type, suited to Features""" spans = [] - for span in self.spans: + for span in self._spans: if span.lost: spans.append(span) continue diff --git a/src/cogent3/core/sequence.py b/src/cogent3/core/sequence.py index 4d504dedb..9013516bf 100644 --- a/src/cogent3/core/sequence.py +++ b/src/cogent3/core/sequence.py @@ -1063,7 +1063,7 @@ def make_feature(self, feature: FeatureDataType, *args) -> Feature: if pre or post: # create a lost span to represent the segment missing from # the instance - spans = fmap.spans + spans = list(fmap.spans) if pre: spans.insert(0, LostSpan(pre)) if post: diff --git a/src/cogent3/parse/cigar.py b/src/cogent3/parse/cigar.py index 77051e554..6c6dc28f6 100644 --- a/src/cogent3/parse/cigar.py +++ b/src/cogent3/parse/cigar.py @@ -18,14 +18,14 @@ import re from cogent3 import DNA, 
make_aligned_seqs -from cogent3.core.location import LostSpan, Map, Span +from cogent3.core.location import IndelMap, LostSpan, Span _pattern = re.compile("([0-9]*)([DM])") def map_to_cigar(map): - """convert a Map into a cigar string""" + """convert a IndelMap into a cigar string""" cigar = "" for span in map.spans: if isinstance(span, Span): @@ -39,7 +39,7 @@ def map_to_cigar(map): def cigar_to_map(cigar_text): - """convert cigar string into Map""" + """convert cigar string into IndelMap""" assert "I" not in cigar_text spans, posn = [], 0 for n, c in _pattern.findall(cigar_text): @@ -49,7 +49,7 @@ def cigar_to_map(cigar_text): posn += n else: spans.append(LostSpan(n)) - return Map(spans=spans, parent_length=posn) + return IndelMap(spans=spans, parent_length=posn) def aligned_from_cigar(cigar_text, seq, moltype=DNA): @@ -88,7 +88,7 @@ def _remap(map): span.end = span.end - start length = span.end spans.append(span) - new_map = Map(spans=spans, parent_length=length) + new_map = IndelMap(spans=spans, parent_length=length) return new_map diff --git a/src/cogent3/parse/gbseq.py b/src/cogent3/parse/gbseq.py index cf0b501f5..639d2243f 100644 --- a/src/cogent3/parse/gbseq.py +++ b/src/cogent3/parse/gbseq.py @@ -5,7 +5,7 @@ import io import xml.dom.minidom -from cogent3.core import annotation, moltype +from cogent3.core import location, moltype """ @@ -64,7 +64,7 @@ def GbSeqXmlParser(doc): seq = alphabet.make_seq(raw_string, name=name) - all = annotation.Map([(0, len(seq))], parent_length=len(seq)) + all = location.Map(locations=[(0, len(seq))], parent_length=len(seq)) seq.add_feature( biotype="source", name=name, spans=all.get_coordinates(), strand=all.reverse ) @@ -107,7 +107,7 @@ def GbSeqXmlParser(doc): .nodeValue ) spans.append((point - 1, point)) - if spans == []: + if not spans: spans = [(0, len(seq))] for qualifier in feature.getElementsByTagName("GBQualifier"): qname = ( diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 
2129d54eb..4d50409cc 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -372,8 +372,8 @@ def test_coordinate(cls): # coordinates are for ungapped segments in underlying sequence # 01 2 345 seq = DNA.make_seq("AC---G-TAA--") - m, s = seq.parse_out_gaps() - m = cls(spans=m.spans, parent_length=m.parent_length) + m, _ = seq.parse_out_gaps() + m = cls(spans=tuple(m.spans), parent_length=m.parent_length) got = m.get_coordinates() assert got == [(0, 2), (2, 3), (3, 6)] @@ -381,8 +381,8 @@ def test_coordinate(cls): @pytest.mark.parametrize("cls", (Map, IndelMap)) def test_gap_coordinate(cls): seq = DNA.make_seq("AC---G-TAA--") - m, s = seq.parse_out_gaps() - m = cls(spans=m.spans, parent_length=m.parent_length) + m, _ = seq.parse_out_gaps() + m = cls(spans=tuple(m.spans), parent_length=m.parent_length) got = m.get_gap_coordinates() assert got == [(2, 3), (3, 1), (6, 2)] @@ -394,8 +394,8 @@ def test_gaps(cls): # 012345678901 seq = DNA.make_seq("AC---G-TAA--") m, s = seq.parse_out_gaps() - m = cls(spans=m.spans, parent_length=m.parent_length) - got = [(g.start, g.end) for g in m.gaps().spans] + m = cls(spans=tuple(m.spans), parent_length=m.parent_length) + got = [(g.start, g.end) for g in tuple(m.gaps().spans)] assert got == [(2, 5), (6, 7), (10, 12)] @@ -405,8 +405,8 @@ def test_nongap(cls): # 000000000011 # 012345678901 seq = DNA.make_seq("AC---G-TAA--") - m, s = seq.parse_out_gaps() - m = cls(spans=m.spans, parent_length=m.parent_length) + m, _ = seq.parse_out_gaps() + m = cls(spans=tuple(m.spans), parent_length=m.parent_length) got = [(g.start, g.end) for g in m.nongap().spans] assert got == [(0, 2), (5, 6), (7, 10)] @@ -431,7 +431,8 @@ def test_round_trip_rich_dict(): # reversed() reverses the order of spans, but keeps their coordinates # differs from nucleic reversed, which computes a new relative position im = IndelMap(spans=m.spans, parent_length=m.parent_length) - got = IndelMap.from_rich_dict(im.to_rich_dict()) + rd = 
im.to_rich_dict() + got = IndelMap.from_rich_dict(rd) assert im is not got assert got.to_rich_dict() == im.to_rich_dict() @@ -447,33 +448,38 @@ def test_serialisable_attr(): def test_terminal_unknown(cls): # span idx 01 2 345 6 seq = DNA.make_seq("-AC---G-TAA--") - m, s = seq.parse_out_gaps() + m, _ = seq.parse_out_gaps() # not unknown, by default - assert m.spans[0].lost and not isinstance(m.spans[0], TerminalPadding) + m_spans = tuple(m.spans) + assert m_spans[0].lost and not isinstance(m_spans[0], TerminalPadding) # use the constructor arg - m = cls(spans=m.spans, parent_length=m.parent_length, termini_unknown=True) - assert isinstance(m.spans[0], TerminalPadding) - assert isinstance(m.spans[-1], TerminalPadding) - assert m.spans[2].lost and not isinstance(m.spans[1], TerminalPadding) - assert m.spans[4].lost and not isinstance(m.spans[2], TerminalPadding) + m = cls(spans=tuple(m.spans), parent_length=m.parent_length, termini_unknown=True) + m_spans = tuple(m.spans) + assert isinstance(m_spans[0], TerminalPadding) + assert isinstance(m_spans[-1], TerminalPadding) + assert m_spans[2].lost and not isinstance(m_spans[1], TerminalPadding) + assert m_spans[4].lost and not isinstance(m_spans[2], TerminalPadding) # use the method - m, s = seq.parse_out_gaps() - m = cls(spans=m.spans, parent_length=m.parent_length).with_termini_unknown() - assert isinstance(m.spans[0], TerminalPadding) - assert isinstance(m.spans[-1], TerminalPadding) + m, _ = seq.parse_out_gaps() + m = cls(spans=tuple(m.spans), parent_length=m.parent_length).with_termini_unknown() + m_spans = tuple(m.spans) + assert isinstance(m_spans[0], TerminalPadding) + assert isinstance(m_spans[-1], TerminalPadding) # middle gap is not terminal, so... 
- assert not isinstance(m.spans[2], TerminalPadding) + assert not isinstance(m_spans[2], TerminalPadding) # no gaps, no effect seq = DNA.make_seq("ACGTAA") - m, s = seq.parse_out_gaps() + m, _ = seq.parse_out_gaps() # use the constructor arg - m = cls(spans=m.spans, parent_length=m.parent_length, termini_unknown=True) - assert not isinstance(m.spans[0], TerminalPadding) + m = cls(spans=tuple(m.spans), parent_length=m.parent_length, termini_unknown=True) + m_spans = tuple(m.spans) + assert not isinstance(m_spans[0], TerminalPadding) # use the method - m = cls(spans=m.spans, parent_length=m.parent_length).with_termini_unknown() - assert not isinstance(m.spans[0], TerminalPadding) + m = cls(spans=tuple(m.spans), parent_length=m.parent_length).with_termini_unknown() + m_spans = tuple(m.spans) + assert not isinstance(m_spans[0], TerminalPadding) @pytest.mark.parametrize("cls", (Map, IndelMap)) @@ -482,7 +488,8 @@ def test_map_inverse(cls): assert len(m) == 4 mi = m.inverse() assert len(mi) == 6 - assert mi.spans[1].lost and len(mi.spans[1]) == 2 + mi_spans = tuple(mi.spans) + assert mi_spans[1].lost and len(mi_spans[1]) == 2 # invert the inversion, should give us back the original re_inv = mi.inverse() expect = m.to_rich_dict() @@ -558,9 +565,11 @@ def test_indelmap_strict_nucleic_reversed(): kwargs = dict(spans=spans, parent_length=12) orig = IndelMap(**kwargs) rev = orig.strict_nucleic_reversed() - assert rev.spans[1].reverse == rev.spans[3].reverse == False + rev_spans = tuple(rev.spans) + assert rev_spans[1].reverse == rev_spans[3].reverse == False old = orig.nucleic_reversed() - assert old.spans[1].reverse == old.spans[3].reverse == True + old_spans = tuple(old.spans) + assert old_spans[1].reverse == old_spans[3].reverse == True assert rev.get_coordinates() == [ tuple(sorted(a)) for a in reversed(old.get_coordinates()) ] @@ -576,3 +585,9 @@ def test_indelmap_with_reverse_span(): ] imap = IndelMap(spans=spans, parent_length=12) assert (imap.start, imap.end) == (8, 
2) + + +def test_indelmap_no_gaps(): + imap = IndelMap(locations=(), parent_length=6) + gaps = imap.gaps() + assert not gaps From 1dad7c51f8720c2ca79ed76399b5916cc8691039 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 20 Feb 2024 16:59:54 +1100 Subject: [PATCH 06/62] API: replace Map with FeatureMap in Features --- src/cogent3/core/alignment.py | 6 +- src/cogent3/core/annotation.py | 15 +- src/cogent3/core/location.py | 459 +++++++++++++++++++++++--- src/cogent3/core/sequence.py | 10 +- src/cogent3/parse/gbseq.py | 2 +- tests/test_core/test_annotation.py | 6 +- tests/test_core/test_annotation_db.py | 2 +- tests/test_core/test_location.py | 49 +-- tests/test_core/test_maps.py | 4 +- 9 files changed, 474 insertions(+), 79 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index b336baa99..b8e982ba3 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -66,7 +66,7 @@ ) from cogent3.core.genetic_code import get_code from cogent3.core.info import Info as InfoClass -from cogent3.core.location import IndelMap, Map +from cogent3.core.location import FeatureMap, IndelMap from cogent3.core.profile import PSSM, MotifCountsArray from cogent3.core.sequence import ArraySequence, Sequence, frac_same # which is a circular import otherwise. @@ -4667,7 +4667,7 @@ def __getitem__(self, index): raise ValueError(f"feature.parent {index.seqid!r} is not self") return index.get_slice() - if isinstance(index, (Map, IndelMap)): + if isinstance(index, (FeatureMap, IndelMap)): new = self._mapped(index) elif isinstance(index, (int, slice)): @@ -5239,7 +5239,7 @@ def make_feature( # in Sequence? 
revd = feature.pop("strand", None) == "-" feature["strand"] = "-" if revd else "+" - fmap = Map(parent_length=len(self), locations=feature.pop("spans")) + fmap = FeatureMap(parent_length=len(self), locations=feature.pop("spans")) if revd: fmap = fmap.nucleic_reversed() return Feature(parent=self, map=fmap, **feature) diff --git a/src/cogent3/core/annotation.py b/src/cogent3/core/annotation.py index 6964faadc..8ddc86a6e 100644 --- a/src/cogent3/core/annotation.py +++ b/src/cogent3/core/annotation.py @@ -2,7 +2,7 @@ from numpy import array -from .location import Map +from .location import FeatureMap # todo gah write docstrings! @@ -23,7 +23,14 @@ class Feature: # todo gah implement a __new__ to trap args for serialisation purposes? def __init__( - self, *, parent, seqid: str, map: Map, biotype: str, name: str, strand: str + self, + *, + parent, + seqid: str, + map: FeatureMap, + biotype: str, + name: str, + strand: str, ): # _serialisable is used for creating derivative instances d = locals() @@ -148,7 +155,7 @@ def __repr__(self): def remapped_to(self, grandparent, gmap): # grandparent can be either a Sequence or an Alignment - if not isinstance(gmap, Map): + if not isinstance(gmap, FeatureMap): # due to separation of IndelMap and Map, change class gmap = gmap.to_feature_map() @@ -234,7 +241,7 @@ def union(self, features: Iterable): if feature.biotype: biotypes.add(feature.biotype) name = ", ".join(feat_names) - map = Map(spans=combined, parent_length=len(self.parent)) + map = FeatureMap(spans=combined, parent_length=len(self.parent)) map = map.covered() # No overlaps # the covered method drops reversed status so we need to # resurrect that, but noting we've not checked consistency diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 87fd3dd9b..d8ce1066a 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -78,7 +78,7 @@ def as_map(slice, length, cls): for i in slice: spans.extend(as_map(i, length).spans, cls) 
return cls(spans=spans, parent_length=length) - elif isinstance(slice, (Map, IndelMap)): + elif isinstance(slice, (FeatureMap, IndelMap)): return slice else: lo, hi, step = _norm_slice(slice, length) @@ -528,7 +528,7 @@ def __repr__(self): return f"?{self.length}?" -class Map: +class Map: # pragma: no cover """A map holds a list of spans.""" def __init__( @@ -551,7 +551,7 @@ def __init__( reverse = start > end if max(start, end) < 0 or min(start, end) > parent_length: raise RuntimeError( - f"located outside sequence: {str((start, end, parent_length))}" + f"located outside sequence: {(start, end, parent_length)}" ) if max(start, end) > parent_length and min(start, end) < 0: l_diff = min(start, end) @@ -1099,41 +1099,6 @@ def RangeFromString(string, delimiter=","): # pragma: no cover return result -def gap_coords_to_map(gaps_lengths: dict, seq_length: int) -> Map: - """ - Parameters - ---------- - gaps_lengths - {gap insertion pos: gap length, ...} - seq_length : int - length of unaligned sequence - - Returns - ------- - Map - """ - - if not gaps_lengths: - return Map([(0, seq_length)], parent_length=seq_length) - - spans = [] - last = pos = 0 - for pos in sorted(gaps_lengths): - if pos > seq_length: - raise ValueError( - f"cannot have gap at position {pos} beyond seq_length= {seq_length}" - ) - - gap = LostSpan(length=gaps_lengths[pos]) - spans.extend([gap] if pos == 0 else [Span(last, pos), gap]) - last = pos - - if pos < seq_length: - spans.append(Span(last, seq_length)) - - return Map(spans=spans, parent_length=seq_length) - - class MapABC(ABC): """base class for genomic map objects""" @@ -1579,4 +1544,420 @@ def to_feature_map(self): del kwargs["version"] del kwargs["type"] kwargs["spans"] = spans - return Map(**kwargs) + return FeatureMap(**kwargs) + + +class FeatureMap: + """A map holds a list of spans.""" + + def __init__( + self, + locations=None, + spans=None, + tidy=False, + parent_length=None, + termini_unknown=False, + ): + assert parent_length is not 
None + d = locals() + exclude = ("self", "__class__", "__slots__") + self._serialisable = {k: v for k, v in d.items() if k not in exclude} + + if spans is None: + spans = [] + for start, end in locations: + diff = 0 + reverse = start > end + if max(start, end) < 0 or min(start, end) > parent_length: + raise RuntimeError( + f"located outside sequence: {(start, end, parent_length)}" + ) + if max(start, end) > parent_length and min(start, end) < 0: + l_diff = min(start, end) + r_diff = max(start, end) - parent_length + start, end = ( + (0, parent_length) if start < end else (parent_length, 0) + ) + spans += [ + LostSpan(abs(l_diff)), + Span(start, end, tidy, tidy, reverse=reverse), + LostSpan(abs(r_diff)), + ] + elif min(start, end) < 0: + diff = min(start, end) + start = 0 if start < 0 else start + end = 0 if end < 0 else end + spans += [ + LostSpan(abs(diff)), + Span(start, end, tidy, tidy, reverse=reverse), + ] + elif max(start, end) > parent_length: + diff = max(start, end) - parent_length + start = parent_length if start > parent_length else start + end = parent_length if end > parent_length else end + spans += [ + Span(start, end, tidy, tidy, reverse=reverse), + LostSpan(abs(diff)), + ] + else: + spans += [Span(start, end, tidy, tidy, reverse=reverse)] + + self.offsets = [] + self.useful = False + self.complete = True + self.reverse = None + posn = 0 + for span in spans: + self.offsets.append(posn) + posn += span.length + if span.lost: + self.complete = False + elif not self.useful: + self.useful = True + (self.start, self.end) = (span.start, span.end) + self.reverse = span.reverse + else: + self.start = min(self.start, span.start) + self.end = max(self.end, span.end) + if self.reverse is not None and (span.reverse != self.reverse): + self.reverse = None + + if termini_unknown: + spans = list(spans) + if spans[0].lost: + spans[0] = TerminalPadding(spans[0].length) + if spans[-1].lost: + spans[-1] = TerminalPadding(spans[-1].length) + + self.spans = tuple(spans) 
+ self.length = posn + self.parent_length = parent_length + + def __len__(self): + return self.length + + def __repr__(self): + return repr(list(self.spans)) + f"/{self.parent_length}" + + def __getitem__(self, new_map): + # A possible shorter map at the same level + new_map = as_map(new_map, len(self), self.__class__) + new_parts = [] + for span in new_map.spans: + new_parts.extend(span.remap_with(self)) + return self.__class__(spans=new_parts, parent_length=self.parent_length) + + def __mul__(self, scale): + new_parts = [span * scale for span in self.spans] + return self.__class__(spans=new_parts, parent_length=self.parent_length * scale) + + def __div__(self, scale): + new_parts = [span / scale for span in self.spans] + return self.__class__( + spans=new_parts, parent_length=self.parent_length // scale + ) + + def __add__(self, other): + if other.parent_length != self.parent_length: + raise ValueError("Those maps belong to different sequences") + return self.__class__( + spans=self.spans + other.spans, parent_length=self.parent_length + ) + + def _with_termini_unknown(self): + return self.__class__( + self, + spans=self.spans[:], + parent_length=self.parent_length, + termini_unknown=True, + ) + + def get_covering_span(self): + if self.reverse: + span = (self.end, self.start) + else: + span = (self.start, self.end) + return self.__class__([span], parent_length=self.parent_length) + + def covered(self): + """>>> Map([(10,20), (15, 25), (80, 90)]).covered().spans + [Span(10,25), Span(80, 90)]""" + + delta = {} + for span in self.spans: + if span.lost: + continue + delta[span.start] = delta.get(span.start, 0) + 1 + delta[span.end] = delta.get(span.end, 0) - 1 + positions = sorted(delta.keys()) + last_y = y = 0 + last_x = start = None + result = [] + for x in positions: + y += delta[x] + if x == last_x: + continue + if y and not last_y: + assert start is None + start = x + elif last_y and not y: + result.append((start, x)) + start = None + last_x = x + last_y = y + 
assert y == 0 + return self.__class__(locations=result, parent_length=self.parent_length) + + def reversed(self): + """Reversed location on same parent""" + spans = [s.reversed() for s in self.spans] + spans.reverse() + return self.__class__(spans=spans, parent_length=self.parent_length) + + def nucleic_reversed(self): + """Same location on reversed parent""" + spans = [s.reversed_relative_to(self.parent_length) for s in self.spans] + return self.__class__(spans=spans, parent_length=self.parent_length) + + def get_gap_coordinates(self): + """returns [(gap pos, gap length), ...]""" + gap_pos = [] + for i, span in enumerate(self.spans): + if not span.lost: + continue + + pos = self.spans[i - 1].end if i else 0 + gap_pos.append((pos, len(span))) + + return gap_pos + + def gaps(self): + """The gaps (lost spans) in this map""" + locations = [] + offset = 0 + for s in self.spans: + if s.lost: + locations.append((offset, offset + s.length)) + offset += s.length + return self.__class__(locations, parent_length=len(self)) + + def shadow(self): + """The 'negative' map of the spans not included in this map""" + return self.inverse().gaps() + + def nongap(self): + locations = [] + offset = 0 + for s in self.spans: + if not s.lost: + locations.append((offset, offset + s.length)) + offset += s.length + return self.__class__(locations, parent_length=len(self)) + + def without_gaps(self): + return self.__class__( + spans=[s for s in self.spans if not s.lost], + parent_length=self.parent_length, + ) + + def inverse(self): + """returns instance with coordinates updated for aligned, unaligned""" + # is this only required for parse_out_gaps? + # NO also used in cogent3.align code + + # can't work if there are overlaps in the map + # tidy ends don't survive inversion + if self.parent_length is None: + raise ValueError("Uninvertable. 
parent length not known") + + cum_posn = 0 + temp = [] + for span in self.spans: + if not span.lost: + if span.reverse: + temp.append( + (span.start, span.end, cum_posn + span.length, cum_posn) + ) + else: + temp.append( + (span.start, span.end, cum_posn, cum_posn + span.length) + ) + cum_posn += span.length + + temp.sort() + new_spans = [] + last_start = 0 + for start, end, cum_start, cum_end in temp: + if start > last_start: + new_spans.append(LostSpan(start - last_start)) + elif start < last_start: + raise ValueError(f"Uninvertable. Overlap: {start} < {last_start}") + + # we force tidy_ to be same as self, attribute has no meaning + # for IndelMap, but retained for compatability for now + new_spans.append( + Span( + cum_start, + cum_end, + reverse=cum_start > cum_end, + ) + ) + last_start = end + + if self.parent_length > last_start: + new_spans.append(LostSpan(self.parent_length - last_start)) + + return self.__class__(spans=new_spans, parent_length=len(self)) + + def _inverse(self): + # can't work if there are overlaps in the map + # tidy ends don't survive inversion + if self.parent_length is None: + raise ValueError("Uninvertable. parent length not known") + posn = 0 + temp = [] + for span in self.spans: + if not span.lost: + if span.reverse: + temp.append((span.start, span.end, posn + span.length, posn)) + else: + temp.append((span.start, span.end, posn, posn + span.length)) + posn += span.length + + temp.sort() + new_spans = [] + last_hi = 0 + for lo, hi, start, end in temp: + if lo > last_hi: + new_spans.append(LostSpan(lo - last_hi)) + elif lo < last_hi: + raise ValueError(f"Uninvertable. Overlap: {lo} < {last_hi}") + new_spans.append(Span(start, end, reverse=start > end)) + last_hi = hi + if self.parent_length > last_hi: + new_spans.append(LostSpan(self.parent_length - last_hi)) + return self.__class__(spans=new_spans, parent_length=len(self)) + + def get_coordinates(self): + """returns span coordinates as [(v1, v2), ...] 
+ + v1/v2 are (start, end) unless the map is reversed, in which case it will + be (end, start)""" + + if self.reverse: + order_func = lambda x: (max(x), min(x)) + else: + order_func = lambda x: x + + coords = list( + map(order_func, [(s.start, s.end) for s in self.spans if not s.lost]) + ) + + return coords + + def to_rich_dict(self): + """returns dicts for contained spans [dict(), ..]""" + spans = [s.to_rich_dict() for s in self.spans] + data = copy.deepcopy(self._serialisable) + data.pop("locations") + data["spans"] = spans + data["type"] = get_object_provenance(self) + data["version"] = __version__ + return data + + def zeroed(self): + """returns a new instance with the first span starting at 0 + + Note + ---- + + Useful when an Annotatable object is sliced, but the connection to + the original parent is being deliberately broken as in the + Sequence.deepcopy(sliced=True) case. + """ + # todo there's probably a more efficient way to do this + # create the new instance + from cogent3.util.deserialise import deserialise_map_spans + + data = self.to_rich_dict() + zeroed = deserialise_map_spans(data) + zeroed.parent_length = len(zeroed.get_covering_span()) + shift = min(zeroed.start, zeroed.end) + new_end = 0 + for span in zeroed.spans: + if span.lost: + continue + span.start -= shift + span.end -= shift + new_end = max(new_end, span.end) + + zeroed.start = 0 + zeroed.end = new_end + + return zeroed + + T = Union[ndarray, int] + + def absolute_position(self, rel_pos: T) -> T: + """converts rel_pos into an absolute position + + Raises + ------ + raises ValueError if rel_pos < 0 + """ + check = array([rel_pos], dtype=int) if isinstance(rel_pos, int) else rel_pos + if check.min() < 0: + raise ValueError(f"must positive, not {rel_pos=}") + + if len(self) == self.parent_length: + # handle case of reversed here? 
+ return rel_pos + + return self.start + rel_pos + + def relative_position(self, abs_pos: T) -> T: + """converts abs_pos into an relative position + + Raises + ------ + raises ValueError if abs_pos < 0 + """ + check = array([abs_pos], dtype=int) if isinstance(abs_pos, int) else abs_pos + if check.min() < 0: + raise ValueError(f"must positive, not {abs_pos=}") + return abs_pos - self.start + + +def gap_coords_to_map(gaps_lengths: dict, seq_length: int) -> FeatureMap: + """ + Parameters + ---------- + gaps_lengths + {gap insertion pos: gap length, ...} + seq_length : int + length of unaligned sequence + + Returns + ------- + Map + """ + + if not gaps_lengths: + return FeatureMap([(0, seq_length)], parent_length=seq_length) + + spans = [] + last = pos = 0 + for pos in sorted(gaps_lengths): + if pos > seq_length: + raise ValueError( + f"cannot have gap at position {pos} beyond seq_length= {seq_length}" + ) + + gap = LostSpan(length=gaps_lengths[pos]) + spans.extend([gap] if pos == 0 else [Span(last, pos), gap]) + last = pos + + if pos < seq_length: + spans.append(Span(last, seq_length)) + + return FeatureMap(spans=spans, parent_length=seq_length) diff --git a/src/cogent3/core/sequence.py b/src/cogent3/core/sequence.py index 9013516bf..0a5c6459b 100644 --- a/src/cogent3/core/sequence.py +++ b/src/cogent3/core/sequence.py @@ -56,7 +56,7 @@ ) from cogent3.core.genetic_code import get_code from cogent3.core.info import Info as InfoClass -from cogent3.core.location import IndelMap, LostSpan, Map +from cogent3.core.location import FeatureMap, IndelMap, LostSpan from cogent3.format.fasta import alignment_to_fasta from cogent3.maths.stats.contingency import CategoryCounts from cogent3.maths.stats.number import CategoryCounter @@ -1059,7 +1059,7 @@ def make_feature(self, feature: FeatureDataType, *args) -> Feature: continue new_spans.append(new.tolist()) - fmap = Map(locations=new_spans, parent_length=len(self)) + fmap = FeatureMap(locations=new_spans, parent_length=len(self)) 
if pre or post: # create a lost span to represent the segment missing from # the instance @@ -1068,7 +1068,7 @@ def make_feature(self, feature: FeatureDataType, *args) -> Feature: spans.insert(0, LostSpan(pre)) if post: spans.append(LostSpan(post)) - fmap = Map(spans=spans, parent_length=len(self)) + fmap = FeatureMap(spans=spans, parent_length=len(self)) if revd and not seq_rced: # the sequence is on the plus strand, and the @@ -1268,7 +1268,7 @@ def with_masked_annotations( seqid=self.name, name=None, biotype=None, - map=Map(locations=[], parent_length=len(self)), + map=FeatureMap(locations=[], parent_length=len(self)), strand="+", ) else: @@ -1344,7 +1344,7 @@ def __getitem__(self, index): if hasattr(index, "map"): index = index.map - if isinstance(index, (Map, IndelMap)): + if isinstance(index, (FeatureMap, IndelMap)): new = self._mapped(index) preserve_offset = not index.reverse diff --git a/src/cogent3/parse/gbseq.py b/src/cogent3/parse/gbseq.py index 639d2243f..bc2712b70 100644 --- a/src/cogent3/parse/gbseq.py +++ b/src/cogent3/parse/gbseq.py @@ -64,7 +64,7 @@ def GbSeqXmlParser(doc): seq = alphabet.make_seq(raw_string, name=name) - all = location.Map(locations=[(0, len(seq))], parent_length=len(seq)) + all = location.FeatureMap(locations=[(0, len(seq))], parent_length=len(seq)) seq.add_feature( biotype="source", name=name, spans=all.get_coordinates(), strand=all.reverse ) diff --git a/tests/test_core/test_annotation.py b/tests/test_core/test_annotation.py index e8f384cb8..bddd156ff 100644 --- a/tests/test_core/test_annotation.py +++ b/tests/test_core/test_annotation.py @@ -4,7 +4,7 @@ from cogent3 import DNA, load_seq, make_aligned_seqs, make_unaligned_seqs from cogent3.core.alignment import Alignment, SequenceCollection -from cogent3.core.location import Map, Span +from cogent3.core.location import FeatureMap, Span def makeSampleSequence(name, with_gaps=False): @@ -95,10 +95,10 @@ def test_map(self): """reversing a map with multiple spans should preserve 
span relative order""" forward = [Span(20, 30), Span(40, 50)] - fmap = Map(spans=forward, parent_length=100) + fmap = FeatureMap(spans=forward, parent_length=100) fmap_reversed = fmap.nucleic_reversed() reverse = [Span(70, 80, reverse=True), Span(50, 60, reverse=True)] - rmap = Map(spans=reverse, parent_length=100) + rmap = FeatureMap(spans=reverse, parent_length=100) for i in range(2): self.assertEqual(fmap_reversed.spans[i], rmap.spans[i]) diff --git a/tests/test_core/test_annotation_db.py b/tests/test_core/test_annotation_db.py index 28e45da99..ac081d1a0 100644 --- a/tests/test_core/test_annotation_db.py +++ b/tests/test_core/test_annotation_db.py @@ -406,7 +406,7 @@ def test_feature_nucleic(): from cogent3.core import location as loc seq = make_seq("AACCTTTGGGGAATTT", moltype="dna") - mmap = loc.Map(locations=[(4, 7), (11, 13)], parent_length=16) + mmap = loc.FeatureMap(locations=[(4, 7), (11, 13)], parent_length=16) expect = seq[mmap.reversed()] rcseq = seq.rc() diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 4d50409cc..fc415a4c1 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -6,9 +6,9 @@ from cogent3 import DNA from cogent3.core.location import ( + FeatureMap, IndelMap, LostSpan, - Map, Span, TerminalPadding, gap_coords_to_map, @@ -266,13 +266,13 @@ class MapTests(TestCase): def test_get_coords(self): """get_coordinates should return raw coordinates matching input""" spans = [(0, 9), (20, 32)] - map = Map(spans, parent_length=100) + map = FeatureMap(spans, parent_length=100) coords = map.get_coordinates() self.assertEqual(coords, spans) # should work for reversed Maps too spans = [(32, 20), (9, 0)] - map = Map(spans, parent_length=100) + map = FeatureMap(spans, parent_length=100) coords = map.get_coordinates() self.assertEqual(coords, spans) @@ -309,7 +309,7 @@ def test_gap_coords_to_map(self): got = gap_coords_to_map({20: 1}, len(seq)) -@pytest.mark.parametrize("cls", (IndelMap, 
Map)) +@pytest.mark.parametrize("cls", (IndelMap, FeatureMap)) def test_map_plus_position(cls): # seq is 9 long # plus coords 012345678 @@ -348,7 +348,7 @@ def test_indel_map_useful_complete(): assert len(im) == im.length == 3 -@pytest.mark.parametrize("cls,expect", ((Map, [(9, 0)]), (IndelMap, [(0, 9)]))) +@pytest.mark.parametrize("cls,expect", ((FeatureMap, [(9, 0)]), (IndelMap, [(0, 9)]))) def test_map_nucleic_reversed(cls, expect): # seq is 9 long # plus coords 012345678 @@ -367,7 +367,7 @@ def test_map_nucleic_reversed(cls, expect): assert coords == expect -@pytest.mark.parametrize("cls", (Map, IndelMap)) +@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def test_coordinate(cls): # coordinates are for ungapped segments in underlying sequence # 01 2 345 @@ -378,7 +378,7 @@ def test_coordinate(cls): assert got == [(0, 2), (2, 3), (3, 6)] -@pytest.mark.parametrize("cls", (Map, IndelMap)) +@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def test_gap_coordinate(cls): seq = DNA.make_seq("AC---G-TAA--") m, _ = seq.parse_out_gaps() @@ -387,7 +387,7 @@ def test_gap_coordinate(cls): assert got == [(2, 3), (3, 1), (6, 2)] -@pytest.mark.parametrize("cls", (Map, IndelMap)) +@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def test_gaps(cls): # returns spans corresponding to position on "aligned" seq of gaps # 000000000011 @@ -399,7 +399,7 @@ def test_gaps(cls): assert got == [(2, 5), (6, 7), (10, 12)] -@pytest.mark.parametrize("cls", (IndelMap, Map)) +@pytest.mark.parametrize("cls", (IndelMap, FeatureMap)) def test_nongap(cls): # returns spans corresponding to position on "aligned" seq of nongaps # 000000000011 @@ -415,7 +415,7 @@ def test_nongap(cls): def test_reversed(): seq = DNA.make_seq("AC---G-TAA--") m, _ = seq.parse_out_gaps() - m = Map(spans=m.spans, parent_length=m.parent_length) + m = FeatureMap(spans=m.spans, parent_length=m.parent_length) # reversed() reverses the order of spans, but keeps their coordinates # differs from nucleic 
reversed, which computes a new relative position rev = m.reversed() @@ -444,8 +444,7 @@ def test_serialisable_attr(): assert got == set_vals -@pytest.mark.parametrize("cls", (Map, IndelMap)) -def test_terminal_unknown(cls): +def test_terminal_unknown(): # span idx 01 2 345 6 seq = DNA.make_seq("-AC---G-TAA--") m, _ = seq.parse_out_gaps() @@ -453,7 +452,9 @@ def test_terminal_unknown(cls): m_spans = tuple(m.spans) assert m_spans[0].lost and not isinstance(m_spans[0], TerminalPadding) # use the constructor arg - m = cls(spans=tuple(m.spans), parent_length=m.parent_length, termini_unknown=True) + m = IndelMap( + spans=tuple(m.spans), parent_length=m.parent_length, termini_unknown=True + ) m_spans = tuple(m.spans) assert isinstance(m_spans[0], TerminalPadding) assert isinstance(m_spans[-1], TerminalPadding) @@ -462,7 +463,9 @@ def test_terminal_unknown(cls): # use the method m, _ = seq.parse_out_gaps() - m = cls(spans=tuple(m.spans), parent_length=m.parent_length).with_termini_unknown() + m = IndelMap( + spans=tuple(m.spans), parent_length=m.parent_length + ).with_termini_unknown() m_spans = tuple(m.spans) assert isinstance(m_spans[0], TerminalPadding) assert isinstance(m_spans[-1], TerminalPadding) @@ -473,16 +476,20 @@ def test_terminal_unknown(cls): seq = DNA.make_seq("ACGTAA") m, _ = seq.parse_out_gaps() # use the constructor arg - m = cls(spans=tuple(m.spans), parent_length=m.parent_length, termini_unknown=True) + m = IndelMap( + spans=tuple(m.spans), parent_length=m.parent_length, termini_unknown=True + ) m_spans = tuple(m.spans) assert not isinstance(m_spans[0], TerminalPadding) # use the method - m = cls(spans=tuple(m.spans), parent_length=m.parent_length).with_termini_unknown() + m = IndelMap( + spans=tuple(m.spans), parent_length=m.parent_length + ).with_termini_unknown() m_spans = tuple(m.spans) assert not isinstance(m_spans[0], TerminalPadding) -@pytest.mark.parametrize("cls", (Map, IndelMap)) +@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def 
test_map_inverse(cls): m = cls(locations=[(0, 2), (4, 6)], parent_length=6) assert len(m) == 4 @@ -497,7 +504,7 @@ def test_map_inverse(cls): assert got == expect -@pytest.mark.parametrize("cls", (Map, IndelMap)) +@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def test_map_offsets(cls): # offsets are absolute starts of spans # 1 @@ -508,7 +515,7 @@ def test_map_offsets(cls): assert got == [0, 1, 3, 6, 7, 8, 11] -@pytest.mark.parametrize("cls", (Map, IndelMap)) +@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def test_map_indexed(cls): m = cls(locations=[(0, 2), (4, 6)], parent_length=6).inverse() indexed = m[2] @@ -521,7 +528,7 @@ def test_compare_map_indexed(): seq = DNA.make_seq("--AC-GTAA--".replace("-", "")) spans = [LostSpan(2), Span(0, 2), LostSpan(2), Span(2, 6), LostSpan(2)] kwargs = dict(spans=spans, parent_length=len(seq)) - mm = Map(**kwargs) + mm = FeatureMap(**kwargs) im = IndelMap(**kwargs) ma = Aligned(mm, seq) ia = Aligned(im, seq) @@ -538,7 +545,7 @@ def test_indel_map_zeroed(slice_it): spans = [LostSpan(2), Span(2, 4), LostSpan(2), Span(4, 8), LostSpan(2)] kwargs = dict(spans=spans, parent_length=6) - mm = Map(**kwargs) + mm = FeatureMap(**kwargs) if slice_it: mm = mm[:6] mm_zeroed = mm.zeroed() diff --git a/tests/test_core/test_maps.py b/tests/test_core/test_maps.py index 6ccb21b31..275876598 100644 --- a/tests/test_core/test_maps.py +++ b/tests/test_core/test_maps.py @@ -1,7 +1,7 @@ import unittest from cogent3 import DNA, make_aligned_seqs -from cogent3.core.location import Map, Span +from cogent3.core.location import FeatureMap, Span class MapTest(unittest.TestCase): @@ -9,7 +9,7 @@ class MapTest(unittest.TestCase): def test_spans(self): # a simple two part map of length 10 - map = Map([(0, 5), (5, 10)], parent_length=10) + map = FeatureMap([(0, 5), (5, 10)], parent_length=10) # try different spans on the above map for (start, end), expected in [ ((0, 4), "[0:4]"), From 877769ddc2d80352971a3e005e10d56ac00e7a18 Mon Sep 17 
00:00:00 2001 From: Gavin Huttley Date: Wed, 21 Feb 2024 08:02:08 +1100 Subject: [PATCH 07/62] MAINT: fix issue caught by codacy --- src/cogent3/core/location.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index d8ce1066a..9803c7830 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -76,7 +76,7 @@ def as_map(slice, length, cls): if isinstance(slice, (list, tuple)): spans = [] for i in slice: - spans.extend(as_map(i, length).spans, cls) + spans.extend(as_map(i, length, cls).spans) return cls(spans=spans, parent_length=length) elif isinstance(slice, (FeatureMap, IndelMap)): return slice From b336be071f45d8ff21bec1621c70c47024c6b889 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Wed, 6 Mar 2024 08:43:06 +1100 Subject: [PATCH 08/62] API: simplify FeatureMap constuctor --- src/cogent3/core/location.py | 50 +++--------------------------------- 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 9803c7830..6b708af89 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1543,6 +1543,7 @@ def to_feature_map(self): kwargs = self.to_rich_dict() del kwargs["version"] del kwargs["type"] + del kwargs["termini_unknown"] # not used for FeatureMap kwargs["spans"] = spans return FeatureMap(**kwargs) @@ -1554,9 +1555,7 @@ def __init__( self, locations=None, spans=None, - tidy=False, parent_length=None, - termini_unknown=False, ): assert parent_length is not None d = locals() @@ -1564,43 +1563,7 @@ def __init__( self._serialisable = {k: v for k, v in d.items() if k not in exclude} if spans is None: - spans = [] - for start, end in locations: - diff = 0 - reverse = start > end - if max(start, end) < 0 or min(start, end) > parent_length: - raise RuntimeError( - f"located outside sequence: {(start, end, parent_length)}" - ) - if max(start, end) > parent_length and 
min(start, end) < 0: - l_diff = min(start, end) - r_diff = max(start, end) - parent_length - start, end = ( - (0, parent_length) if start < end else (parent_length, 0) - ) - spans += [ - LostSpan(abs(l_diff)), - Span(start, end, tidy, tidy, reverse=reverse), - LostSpan(abs(r_diff)), - ] - elif min(start, end) < 0: - diff = min(start, end) - start = 0 if start < 0 else start - end = 0 if end < 0 else end - spans += [ - LostSpan(abs(diff)), - Span(start, end, tidy, tidy, reverse=reverse), - ] - elif max(start, end) > parent_length: - diff = max(start, end) - parent_length - start = parent_length if start > parent_length else start - end = parent_length if end > parent_length else end - spans += [ - Span(start, end, tidy, tidy, reverse=reverse), - LostSpan(abs(diff)), - ] - else: - spans += [Span(start, end, tidy, tidy, reverse=reverse)] + spans = _spans_from_locations(locations, parent_length=parent_length) self.offsets = [] self.useful = False @@ -1614,7 +1577,7 @@ def __init__( self.complete = False elif not self.useful: self.useful = True - (self.start, self.end) = (span.start, span.end) + self.start, self.end = span.start, span.end self.reverse = span.reverse else: self.start = min(self.start, span.start) @@ -1622,13 +1585,6 @@ def __init__( if self.reverse is not None and (span.reverse != self.reverse): self.reverse = None - if termini_unknown: - spans = list(spans) - if spans[0].lost: - spans[0] = TerminalPadding(spans[0].length) - if spans[-1].lost: - spans[-1] = TerminalPadding(spans[-1].length) - self.spans = tuple(spans) self.length = posn self.parent_length = parent_length From 9840db96300c804bdbaa0ea070fe63197c389e59 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Wed, 6 Mar 2024 08:43:42 +1100 Subject: [PATCH 09/62] API: FeatureMap drops reverse [CHANGED] Use Feature.reversed property where orientation is required [CHANGED] raise an exception if start > end within locations values provided to FeatureMap constructor. 
[NEW] added FeatureMap.strict_nucleic_reversed() method, reflecting that FeatureMap requires new Span's constructed instead of the more complex setting / checking of .reverse properties. --- src/cogent3/core/alignment.py | 11 ++++- src/cogent3/core/annotation.py | 39 +++++++++------ src/cogent3/core/location.py | 71 +++++++++++++++------------ src/cogent3/core/sequence.py | 35 +++++-------- src/cogent3/draw/drawable.py | 11 ++--- src/cogent3/parse/gbseq.py | 8 ++- tests/test_core/test_alignment.py | 51 ++++++++++--------- tests/test_core/test_annotation.py | 13 ++++- tests/test_core/test_annotation_db.py | 10 ++-- tests/test_core/test_features.py | 2 +- tests/test_core/test_location.py | 48 ++++++++---------- 11 files changed, 159 insertions(+), 140 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index b8e982ba3..feef5f077 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -2834,8 +2834,15 @@ def sample( positions = [ (loc * motif_length, (loc + 1) * motif_length) for loc in locations ] - sample = IndelMap(locations=positions, parent_length=len(self)) - return self.gapped_by_map(sample, info=self.info) + make_seq = self.moltype.make_seq + new_seqs = [] + for seq in self.seqs: + seq = make_seq( + "".join(str(seq[x1:x2]) for x1, x2 in positions), name=seq.name + ) + new_seqs.append(seq) + + return self.__class__(new_seqs, info=self.info, moltype=self.moltype) def sliding_windows(self, window, step, start=None, end=None): """Generator yielding new alignments of given length and interval. 
diff --git a/src/cogent3/core/annotation.py b/src/cogent3/core/annotation.py index 8ddc86a6e..0c70f8263 100644 --- a/src/cogent3/core/annotation.py +++ b/src/cogent3/core/annotation.py @@ -81,7 +81,7 @@ def get_slice(self, complete: bool = False, allow_gaps: bool = False): Parameters ---------- complete - if feature not complete on parent,causes an exception to be + if feature not complete on parent, causes an exception to be raised. If False, gaps are removed. allow_gaps if on an alignment, includes the gap positions @@ -95,21 +95,17 @@ def get_slice(self, complete: bool = False, allow_gaps: bool = False): If 'complete' is true and the full length of this feature is not present in the sequence, then this method will fail. """ - # todo gah set allow_gaps=True as the default - map = self.map - if not (complete or map.complete): - map = map.without_gaps() + fmap = self.map + if not (complete or fmap.complete): + fmap = fmap.without_gaps() if not allow_gaps: - if self.reversed: - map = map.reversed() - result = self.parent[map] + result = self.parent[fmap] if self.reversed: result = result.rc() return result # all slicing now requires start < end - start, end = min(map.start, map.end), max(map.start, map.end) - result = self.parent[start:end] + result = self.parent[fmap.start : fmap.end] if self.reversed: result = result.rc() return result @@ -225,14 +221,21 @@ def union(self, features: Iterable): ----- Overlapping spans are merged """ + # spans always on the plus strand, irrespective of whether + # a feature is reversed combined = list(self.map.spans) feat_names = [self.name] if self.name else set() biotypes = {self.biotype} if self.biotype else set() seqids = {self.seqid} if self.seqid else set() + + same_orientation = True for feature in features: if feature.parent is not self.parent: raise ValueError("cannot merge annotations from different objects") + if same_orientation and feature.reversed != self.reversed: + same_orientation = False + 
combined.extend(feature.map.spans) if feature.name: feat_names.append(feature.name) @@ -240,19 +243,25 @@ def union(self, features: Iterable): seqids.add(feature.seqid) if feature.biotype: biotypes.add(feature.biotype) + name = ", ".join(feat_names) - map = FeatureMap(spans=combined, parent_length=len(self.parent)) - map = map.covered() # No overlaps + fmap = FeatureMap(spans=combined, parent_length=len(self.parent)) + fmap = fmap.covered() # No overlaps # the covered method drops reversed status so we need to # resurrect that, but noting we've not checked consistency # across the features - if self.map.reverse != map.reverse: - map = map.reversed() + strand = self._strand if same_orientation else "+" seqid = ", ".join(seqids) if seqids else None biotype = ", ".join(biotypes) kwargs = { **self._serialisable, - **{"map": map, "seqid": seqid, "biotype": biotype, "name": name}, + **{ + "map": fmap, + "seqid": seqid, + "biotype": biotype, + "name": name, + "strand": strand, + }, } return self.__class__(**kwargs) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 6b708af89..df8b70ee6 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -83,7 +83,9 @@ def as_map(slice, length, cls): else: lo, hi, step = _norm_slice(slice, length) assert (step or 1) == 1 - return cls(locations=[(lo, hi)], parent_length=length) + # since we disallow step, a reverse slice means an empty series + locations = [] if lo > hi else [(lo, hi)] + return cls(locations=locations, parent_length=length) class SpanI(object): @@ -1152,9 +1154,17 @@ def inverse(self): def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSpan]]: + if not len(locations): + # using len() because locations can be a numpy array + return () + + if locations[0][0] > locations[-1][1]: + raise ValueError("locations must be ordered smallest-> largest") + spans = [] for start, end in locations: - reverse = start > end + if start > end: + raise 
ValueError("locations must be ordered smallest-> largest") if max(start, end) < 0 or min(start, end) > parent_length: raise RuntimeError( f"located outside sequence: {(start, end, parent_length)}" @@ -1165,7 +1175,7 @@ def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSp start, end = (0, parent_length) if start < end else (parent_length, 0) spans += [ LostSpan(abs(l_diff)), - Span(start, end, reverse=reverse), + Span(start, end), LostSpan(abs(r_diff)), ] elif min(start, end) < 0: @@ -1174,18 +1184,19 @@ def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSp end = max(end, 0) spans += [ LostSpan(abs(diff)), - Span(start, end, reverse=reverse), + Span(start, end), ] elif max(start, end) > parent_length: diff = max(start, end) - parent_length start = min(start, parent_length) end = min(end, parent_length) spans += [ - Span(start, end, reverse=reverse), + Span(start, end), LostSpan(abs(diff)), ] else: - spans += [Span(start, end, reverse=reverse)] + spans += [Span(start, end)] + return tuple(spans) @@ -1493,7 +1504,7 @@ def relative_position(self, abs_pos: T) -> T: return abs_pos - self.start def get_covering_span(self): - span = (self.end, self.start) + span = self.start, self.end return self.__class__(locations=[span], parent_length=self.parent_length) def zeroed(self): @@ -1568,7 +1579,6 @@ def __init__( self.offsets = [] self.useful = False self.complete = True - self.reverse = None posn = 0 for span in spans: self.offsets.append(posn) @@ -1578,12 +1588,9 @@ def __init__( elif not self.useful: self.useful = True self.start, self.end = span.start, span.end - self.reverse = span.reverse else: self.start = min(self.start, span.start) self.end = max(self.end, span.end) - if self.reverse is not None and (span.reverse != self.reverse): - self.reverse = None self.spans = tuple(spans) self.length = posn @@ -1629,10 +1636,7 @@ def _with_termini_unknown(self): ) def get_covering_span(self): - if self.reverse: - span = 
(self.end, self.start) - else: - span = (self.start, self.end) + span = (self.start, self.end) return self.__class__([span], parent_length=self.parent_length) def covered(self): @@ -1664,17 +1668,31 @@ def covered(self): assert y == 0 return self.__class__(locations=result, parent_length=self.parent_length) - def reversed(self): - """Reversed location on same parent""" - spans = [s.reversed() for s in self.spans] - spans.reverse() - return self.__class__(spans=spans, parent_length=self.parent_length) - def nucleic_reversed(self): """Same location on reversed parent""" spans = [s.reversed_relative_to(self.parent_length) for s in self.spans] return self.__class__(spans=spans, parent_length=self.parent_length) + def strict_nucleic_reversed(self): + """map for a sequence that has itself been reversed and complemented + + Notes + ----- + discards reverse attribute on both spans and self + """ + spans = [] + parent_length = self.parent_length + for s in self.spans: + if not s.lost: + start = parent_length - s.end + assert start >= 0 + end = start + s.length + s = Span(start=start, end=end) + spans.append(s) + + spans.reverse() + return self.__class__(spans=spans, parent_length=self.parent_length) + def get_gap_coordinates(self): """returns [(gap pos, gap length), ...]""" gap_pos = [] @@ -1800,16 +1818,7 @@ def get_coordinates(self): v1/v2 are (start, end) unless the map is reversed, in which case it will be (end, start)""" - if self.reverse: - order_func = lambda x: (max(x), min(x)) - else: - order_func = lambda x: x - - coords = list( - map(order_func, [(s.start, s.end) for s in self.spans if not s.lost]) - ) - - return coords + return [(s.start, s.end) for s in self.spans if not s.lost] def to_rich_dict(self): """returns dicts for contained spans [dict(), ..]""" diff --git a/src/cogent3/core/sequence.py b/src/cogent3/core/sequence.py index 0a5c6459b..1e4ac8100 100644 --- a/src/cogent3/core/sequence.py +++ b/src/cogent3/core/sequence.py @@ -1070,21 +1070,8 @@ def 
make_feature(self, feature: FeatureDataType, *args) -> Feature: spans.append(LostSpan(post)) fmap = FeatureMap(spans=spans, parent_length=len(self)) - if revd and not seq_rced: - # the sequence is on the plus strand, and the - # feature coordinates are also for the plus strand - # but their order needs to be changed to indicate - # reverse complement is required - fmap = fmap.reversed() - elif seq_rced and not revd: - # plus strand feature, but the sequence reverse complemented - # so we need to nucleic-reverse the map - fmap = fmap.nucleic_reversed() - elif seq_rced: - # sequence is rc'ed, the feature was minus strand of - # original, so needs to be both nucleic reversed and - # then reversed - fmap = fmap.nucleic_reversed().reversed() + if seq_rced: + fmap = fmap.strict_nucleic_reversed() feature.pop("on_alignment", None) feature.pop("seqid", None) @@ -1279,8 +1266,8 @@ def with_masked_annotations( i = 0 segments = [] - fmap = region.map.reversed() if region.map.reverse else region.map - for b, e in fmap.get_coordinates(): + coords = region.map.get_coordinates() + for b, e in coords: segments.extend((str(self[i:b]), mask_char * (e - b))) i = e segments.append(str(self[i:])) @@ -1291,11 +1278,17 @@ def with_masked_annotations( new.annotation_db = self.annotation_db return new - def gapped_by_map_segment_iter(self, map, allow_gaps=True, recode_gaps=False): + def gapped_by_map_segment_iter( + self, map, allow_gaps=True, recode_gaps=False + ) -> str: if not allow_gaps and not map.complete: raise ValueError(f"gap(s) in map {map}") - complement = self.moltype.complement + # leave reorienting segments (e.g. if on rev strand) to the calling method + # it seems like that's the best bet, since it preserves given order of + # segments. + # + # todo make these methods private. 
for span in map.spans: if span.lost: @@ -1303,8 +1296,6 @@ def gapped_by_map_segment_iter(self, map, allow_gaps=True, recode_gaps=False): seg = unknown * span.length else: seg = str(self[span.start : span.end]) - if span.reverse: - seg = "".join(complement(seg[::-1])) yield seg @@ -1346,7 +1337,7 @@ def __getitem__(self, index): if isinstance(index, (FeatureMap, IndelMap)): new = self._mapped(index) - preserve_offset = not index.reverse + preserve_offset = True elif isinstance(index, slice) or _is_int(index): new = self.__class__( diff --git a/src/cogent3/draw/drawable.py b/src/cogent3/draw/drawable.py index 2544f35fe..d919b46a1 100644 --- a/src/cogent3/draw/drawable.py +++ b/src/cogent3/draw/drawable.py @@ -777,19 +777,16 @@ def __call__(self, type_=None, name=None, coords=None, **kwargs): name = type_.name if type_.map.complete else f"{type_.name} (incomplete)" coords = type_.map.get_coordinates() - reverse = type_.map.get_covering_span().reverse + reverse = type_.reversed type_ = type_.biotype else: - if coords[0][0] > coords[-1][1]: - reverse = True - else: - reverse = False + reverse = coords[0][0] > coords[-1][1] if coords is None: raise ValueError("No coordinates defined") - kwargs.update(dict(reverse=reverse)) + kwargs |= dict(reverse=reverse) klass = self._shapes.get(type_.lower(), Rectangle) - color = self._colors.get(type_.lower(), None) + color = self._colors.get(type_.lower()) if klass != Arrow: kwargs.pop("reverse", None) diff --git a/src/cogent3/parse/gbseq.py b/src/cogent3/parse/gbseq.py index bc2712b70..3e3476599 100644 --- a/src/cogent3/parse/gbseq.py +++ b/src/cogent3/parse/gbseq.py @@ -64,10 +64,8 @@ def GbSeqXmlParser(doc): seq = alphabet.make_seq(raw_string, name=name) - all = location.FeatureMap(locations=[(0, len(seq))], parent_length=len(seq)) - seq.add_feature( - biotype="source", name=name, spans=all.get_coordinates(), strand=all.reverse - ) + feat = location.FeatureMap(locations=[(0, len(seq))], parent_length=len(seq)) + 
seq.add_feature(biotype="source", name=name, spans=feat.get_coordinates()) organism = str( record.getElementsByTagName("GBSeq_organism")[0].childNodes[0].nodeValue @@ -122,7 +120,7 @@ def GbSeqXmlParser(doc): .nodeValue ) seq.add_feature(biotype=key, name=feature_name, spans=spans) - yield (name, seq) + yield name, seq def parse(*args): diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index dd2be94b8..838daaed4 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -439,13 +439,6 @@ def test_to_nexus(self): got = align_norm.to_nexus("protein") self.assertEqual(got, expect) - def test_to_json(self): - """roundtrip of to_json produces correct dict""" - aln = self.Class({"seq1": "ACGG", "seq2": "CGCA", "seq3": "CCG-"}) - got = json.loads(aln.to_json()) - expect = aln.to_rich_dict() - self.assertEqual(got, expect) - def test_num_seqs(self): """SequenceCollection.num_seqs should count seqs.""" aln = self.Class({"seq1": "ACGU", "seq2": "CGUA", "seq3": "CCGU"}) @@ -1345,19 +1338,6 @@ def test_uncertainties(self): obs = aln.entropy_per_pos() assert_allclose(obs, [0, 0, 0]) - def test_sample_info(self): - """Alignment.sample should preserver info attribute""" - alignment = self.Class( - {"seq1": "ABCDEFGHIJKLMNOP", "seq2": "ABCDEFGHIJKLMNOP"}, - info={"key": "value"}, - ) - # effectively permute columns, preserving length - shuffled = alignment.sample() - self.assertEqual(shuffled.info["key"], "value") - # ensure length correct - sample = alignment.sample(10) - self.assertEqual(sample.info["key"], "value") - def test_sample_with_replacement(self): # test with replacement -- just verify that it rnus alignment = self.Class({"seq1": "gatc", "seq2": "gatc"}) @@ -2816,7 +2796,7 @@ def test_get_gap_array_equivalence(): assert_allclose(array_aln.get_gap_array(), aln.get_gap_array()) -@pytest.mark.parametrize("reverse", (False, True)) +@pytest.mark.parametrize("reverse", (False, True)[1:]) def 
test_aligned_rich_dict(reverse): map_, s = make_seq( "TTGAAGAATATGT------GAAAGAG", name="s1", moltype="dna" @@ -3059,10 +3039,10 @@ def test_aln_rev_slice(name, array_align): assert not got seq = got.get_gapped_seq(name) - assert str(seq) == "" + assert not str(seq) seq = got.get_seq(name) - assert str(seq) == "" + assert not str(seq) @pytest.mark.parametrize("cls", (SequenceCollection, Alignment, ArrayAlignment)) @@ -3607,3 +3587,28 @@ def test_construct_unaligned_seq_propogates_seqid(cls): seq = cls(data) got = _construct_unaligned_seq(seq, name="seq1", moltype=DNA) assert got._seq.seqid == "seq1" + + +@pytest.mark.parametrize("cls", (Alignment, ArrayAlignment)) +def test_sample_info(cls): + """Alignment.sample should preserver info attribute""" + alignment = cls( + {"seq1": "ABCDEFGHIJKLMNOP", "seq2": "ABCDEFGHIJKLMNOP"}, + info={"key": "value"}, + ) + # effectively permute columns, preserving length + shuffled = alignment.sample() + assert shuffled.info["key"] == "value" + # ensure length correct + sample = alignment.sample(10) + assert sample.info["key"] == "value" + + +@pytest.mark.parametrize("cls", (SequenceCollection, Alignment, ArrayAlignment)) +def test_to_json(cls): + """roundtrip of to_json produces correct dict""" + aln = cls({"seq1": "ACGG", "seq2": "CGCA", "seq3": "CCG-"}) + txt = aln.to_json() + got = json.loads(txt) + expect = aln.to_rich_dict() + assert got == expect diff --git a/tests/test_core/test_annotation.py b/tests/test_core/test_annotation.py index bddd156ff..d39bc7797 100644 --- a/tests/test_core/test_annotation.py +++ b/tests/test_core/test_annotation.py @@ -308,16 +308,25 @@ def test_features_survives_seq_rename(rev): seq = DNA.make_seq("".join(segments), name="original") gene = seq.add_feature(biotype="gene", name="gene1", spans=[(10, 20), (25, 30)]) + gene_expect = str(seq[10:20]) + str(seq[25:30]) + assert str(gene.get_slice()) == gene_expect domain = seq.add_feature( biotype="domain", name="domain1", spans=[(20, 25)], strand="-" ) 
+ domain_expect = str(seq[20:25].rc()) + domain_got = domain.get_slice() + assert str(domain_got) == domain_expect sliced = seq[5:-3] sliced.name = "sliced" sliced = sliced.rc() if rev else sliced + got = list(sliced.get_features(name="gene1"))[0] - assert str(got.get_slice()) == str(gene.get_slice()) + got = got.get_slice() + assert str(got) == gene_expect + got = list(sliced.get_features(name="domain1"))[0] - assert str(got.get_slice()) == str(domain.get_slice()) + got = got.get_slice() + assert str(got) == domain_expect @pytest.mark.parametrize("rev", (False, True)) diff --git a/tests/test_core/test_annotation_db.py b/tests/test_core/test_annotation_db.py index ac081d1a0..8ef5ad45f 100644 --- a/tests/test_core/test_annotation_db.py +++ b/tests/test_core/test_annotation_db.py @@ -405,13 +405,15 @@ def test_feature_nucleic(): from cogent3 import make_seq from cogent3.core import location as loc + # 111111 + # 0123456789012345 seq = make_seq("AACCTTTGGGGAATTT", moltype="dna") - mmap = loc.FeatureMap(locations=[(4, 7), (11, 13)], parent_length=16) - expect = seq[mmap.reversed()] + mmap = loc.FeatureMap(locations=[(4, 7), (10, 12)], parent_length=len(seq)) + expect = seq[mmap] rcseq = seq.rc() - rmap = mmap.nucleic_reversed().reversed() - got = rcseq[rmap] + rmap = mmap.strict_nucleic_reversed() + got = rcseq[rmap].rc() assert str(got) == str(expect) diff --git a/tests/test_core/test_features.py b/tests/test_core/test_features.py index d51dea406..5c1952762 100755 --- a/tests/test_core/test_features.py +++ b/tests/test_core/test_features.py @@ -698,7 +698,7 @@ def test_feature_not_equal_attr(ann_seq, attr): biotype=f1.biotype, map=f1.map, name=f1.name, - strand="-" if f1.map.reverse else "+", + strand="-" if f1.reversed else "+", ) value = attrs["map"][:4] if attr == "map" else "different" attrs[attr] = value diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index fc415a4c1..c8a91f79a 100644 --- a/tests/test_core/test_location.py +++ 
b/tests/test_core/test_location.py @@ -263,19 +263,6 @@ def test_ends_inside(self): class MapTests(TestCase): """tests of the Map class""" - def test_get_coords(self): - """get_coordinates should return raw coordinates matching input""" - spans = [(0, 9), (20, 32)] - map = FeatureMap(spans, parent_length=100) - coords = map.get_coordinates() - self.assertEqual(coords, spans) - - # should work for reversed Maps too - spans = [(32, 20), (9, 0)] - map = FeatureMap(spans, parent_length=100) - coords = map.get_coordinates() - self.assertEqual(coords, spans) - def test_get_gap_coords(self): """returns gap start and lengths""" m, seq = DNA.make_seq("-AC--GT-TTA--").parse_out_gaps() @@ -348,8 +335,9 @@ def test_indel_map_useful_complete(): assert len(im) == im.length == 3 -@pytest.mark.parametrize("cls,expect", ((FeatureMap, [(9, 0)]), (IndelMap, [(0, 9)]))) -def test_map_nucleic_reversed(cls, expect): +@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) +def test_map_nucleic_reversed(cls): + expect = [(0, 9)] # seq is 9 long # plus coords 012345678 # +slice ** @@ -412,19 +400,6 @@ def test_nongap(cls): assert got == [(0, 2), (5, 6), (7, 10)] -def test_reversed(): - seq = DNA.make_seq("AC---G-TAA--") - m, _ = seq.parse_out_gaps() - m = FeatureMap(spans=m.spans, parent_length=m.parent_length) - # reversed() reverses the order of spans, but keeps their coordinates - # differs from nucleic reversed, which computes a new relative position - rev = m.reversed() - got = [s.length if s.lost else (s.start, s.end) for s in rev.spans] - expect = [s.length if s.lost else (s.start, s.end) for s in m.spans] - expect.reverse() - assert got == expect - - def test_round_trip_rich_dict(): seq = DNA.make_seq("AC---G-TAA--") m, s = seq.parse_out_gaps() @@ -598,3 +573,20 @@ def test_indelmap_no_gaps(): imap = IndelMap(locations=(), parent_length=6) gaps = imap.gaps() assert not gaps + + +def test_get_coords(): + """get_coordinates should return raw coordinates matching input""" + spans = 
[(0, 9), (20, 32)] + map = FeatureMap(spans, parent_length=100) + coords = map.get_coordinates() + assert coords == spans + + +def test_get_coords_invalid_order(): + """get_coordinates should return raw coordinates matching input""" + + # should work for reversed Maps too + spans = [(32, 20), (9, 0)] + with pytest.raises(ValueError): + FeatureMap(spans, parent_length=100) From 7953c7bbe9aa7f1e3d27b5eefdac44208cbd1b04 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Wed, 6 Mar 2024 08:47:04 +1100 Subject: [PATCH 10/62] API: update FeatureMap.nucleic_reversed [CHANGED] delete the old FeatureMap.nucleic_reversed() implementation and rename FeatureMap.strict_nucleic_reversed() to FeatureMap.nucleic_reversed() [CHANGED] FeatureMap now inherits from MapABC [CHANGED] FeatureMap.spans is now a generator, as per IndelMap --- src/cogent3/core/alignment.py | 8 --- src/cogent3/core/location.py | 90 +++++++++++++-------------- src/cogent3/core/sequence.py | 17 +---- tests/test_core/test_annotation.py | 21 +++---- tests/test_core/test_annotation_db.py | 2 +- tests/test_core/test_location.py | 13 ++-- tests/test_core/test_maps.py | 2 +- tests/test_core/test_sequence.py | 1 - 8 files changed, 62 insertions(+), 92 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index feef5f077..76f0c293f 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -2272,12 +2272,6 @@ def rc(self): # in this new approach, the map is always plus strand, so # the following comprehension ensures the order of spans # and the span start/end satisfy this - spans = [s if s.lost else s.reversed() for s in new_map.spans] - spans.reverse() - new_map = type(new_map)( - spans=spans, - parent_length=new_map.parent_length, - ) return Aligned(new_map, self.data.rc()) def to_rna(self): @@ -5242,8 +5236,6 @@ def make_feature( feature["seqid"] = feature.get("seqid", None) # there's no sequence to bind to, the feature is directly on self - # todo gah check 
handling of strand etc..., maybe reuse code - # in Sequence? revd = feature.pop("strand", None) == "-" feature["strand"] = "-" if revd else "+" fmap = FeatureMap(parent_length=len(self), locations=feature.pop("spans")) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index df8b70ee6..c01b55bb7 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1123,11 +1123,6 @@ def __len__(self): def __add__(self, other): ... - @classmethod - @abstractmethod - def from_rich_dict(cls, data): - ... - @abstractmethod def gaps(self): ... @@ -1207,15 +1202,6 @@ def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSp class IndelMap(MapABC): """store locations of deletions in a Aligned sequence""" - # todo design notes - # I think this object should never try and store "history", i.e. it - # should directly relate to the current sequence only. Let that sequence, - # which is represented by a SeqView, store history. - # Following ths, storing reverse is also a bad idea for this object, - # also done by the SeqView only. - # TODO reverse complement of Alignment -> Aligned -> SeqView, IndelMap - # should just do nucleic reverse. I think this is the next task. - spans: dataclasses.InitVar[Optional[tuple]] = () locations: dataclasses.InitVar[Optional[Sequence[T]]] = None termini_unknown: dataclasses.InitVar[bool] = dataclasses.field(default=False) @@ -1229,7 +1215,7 @@ class IndelMap(MapABC): ) def __post_init__(self, spans, locations, termini_unknown): - if locations: + if locations is not None and len(locations): spans = _spans_from_locations(locations, parent_length=self.parent_length) elif isinstance(spans, property): # This clause is due a known issue with dataclasses. 
@@ -1297,14 +1283,11 @@ def __add__(self, other): ) def __mul__(self, scale): - # For Protein -> DNA - new_parts = [] - for span in self._spans: - new_parts.append(span * scale) + new_parts = [span * scale for span in self._spans] return self.__class__(spans=new_parts, parent_length=self.parent_length * scale) def __repr__(self): - return repr(self._spans) + f"/{self.parent_length}" + return f"{self._spans!r}/{self.parent_length}" @property def offsets(self): @@ -1365,7 +1348,7 @@ def nucleic_reversed(self): spans = [s.reversed_relative_to(self.parent_length) for s in self._spans] return self.__class__(spans=spans, parent_length=self.parent_length) - def strict_nucleic_reversed(self): + def nucleic_reversed(self): """map for a sequence that has itself been reversed and complemented Notes @@ -1559,22 +1542,34 @@ def to_feature_map(self): return FeatureMap(**kwargs) -class FeatureMap: +@dataclasses.dataclass +class FeatureMap(MapABC): """A map holds a list of spans.""" - def __init__( - self, - locations=None, - spans=None, - parent_length=None, - ): - assert parent_length is not None - d = locals() - exclude = ("self", "__class__", "__slots__") - self._serialisable = {k: v for k, v in d.items() if k not in exclude} + spans: dataclasses.InitVar[Optional[tuple]] = () + locations: dataclasses.InitVar[Optional[Sequence[T]]] = None + parent_length: int = 0 + offsets: list[int] = dataclasses.field(init=False, repr=False) + useful: bool = dataclasses.field(init=False, repr=False, default=False) + complete: bool = dataclasses.field(init=False, repr=False, default=True) + _serialisable: dict = dataclasses.field(init=False, repr=False) + _spans: Tuple[Union[Span, _LostSpan, TerminalPadding]] = dataclasses.field( + default=(), init=False + ) - if spans is None: - spans = _spans_from_locations(locations, parent_length=parent_length) + def __post_init__(self, spans, locations): + assert self.parent_length is not None + + if locations is not None and len(locations): + spans = 
_spans_from_locations(locations, parent_length=self.parent_length) + elif isinstance(spans, property): + # This clause is due a known issue with dataclasses. + # As we have a spans property, the default spans value is + # ignored, so we have to check for its value being a property + # and then set the default value here + spans = () + + spans = tuple(spans) self.offsets = [] self.useful = False @@ -1592,15 +1587,14 @@ def __init__( self.start = min(self.start, span.start) self.end = max(self.end, span.end) - self.spans = tuple(spans) + self._spans = tuple(spans) self.length = posn - self.parent_length = parent_length def __len__(self): return self.length def __repr__(self): - return repr(list(self.spans)) + f"/{self.parent_length}" + return f"{list(self.spans)!r}/{self.parent_length}" def __getitem__(self, new_map): # A possible shorter map at the same level @@ -1627,6 +1621,10 @@ def __add__(self, other): spans=self.spans + other.spans, parent_length=self.parent_length ) + @property + def spans(self): + yield from self._spans + def _with_termini_unknown(self): return self.__class__( self, @@ -1637,7 +1635,7 @@ def _with_termini_unknown(self): def get_covering_span(self): span = (self.start, self.end) - return self.__class__([span], parent_length=self.parent_length) + return self.__class__(locations=[span], parent_length=self.parent_length) def covered(self): """>>> Map([(10,20), (15, 25), (80, 90)]).covered().spans @@ -1669,11 +1667,6 @@ def covered(self): return self.__class__(locations=result, parent_length=self.parent_length) def nucleic_reversed(self): - """Same location on reversed parent""" - spans = [s.reversed_relative_to(self.parent_length) for s in self.spans] - return self.__class__(spans=spans, parent_length=self.parent_length) - - def strict_nucleic_reversed(self): """map for a sequence that has itself been reversed and complemented Notes @@ -1696,11 +1689,12 @@ def strict_nucleic_reversed(self): def get_gap_coordinates(self): """returns [(gap pos, 
gap length), ...]""" gap_pos = [] - for i, span in enumerate(self.spans): + spans = list(self.spans) + for i, span in enumerate(spans): if not span.lost: continue - pos = self.spans[i - 1].end if i else 0 + pos = spans[i - 1].end if i else 0 gap_pos.append((pos, len(span))) return gap_pos @@ -1713,7 +1707,7 @@ def gaps(self): if s.lost: locations.append((offset, offset + s.length)) offset += s.length - return self.__class__(locations, parent_length=len(self)) + return self.__class__(locations=locations, parent_length=len(self)) def shadow(self): """The 'negative' map of the spans not included in this map""" @@ -1726,7 +1720,7 @@ def nongap(self): if not s.lost: locations.append((offset, offset + s.length)) offset += s.length - return self.__class__(locations, parent_length=len(self)) + return self.__class__(locations=locations, parent_length=len(self)) def without_gaps(self): return self.__class__( @@ -1908,7 +1902,7 @@ def gap_coords_to_map(gaps_lengths: dict, seq_length: int) -> FeatureMap: """ if not gaps_lengths: - return FeatureMap([(0, seq_length)], parent_length=seq_length) + return FeatureMap(locations=[(0, seq_length)], parent_length=seq_length) spans = [] last = pos = 0 diff --git a/src/cogent3/core/sequence.py b/src/cogent3/core/sequence.py index 1e4ac8100..938caba55 100644 --- a/src/cogent3/core/sequence.py +++ b/src/cogent3/core/sequence.py @@ -1071,7 +1071,7 @@ def make_feature(self, feature: FeatureDataType, *args) -> Feature: fmap = FeatureMap(spans=spans, parent_length=len(self)) if seq_rced: - fmap = fmap.strict_nucleic_reversed() + fmap = fmap.nucleic_reversed() feature.pop("on_alignment", None) feature.pop("seqid", None) @@ -1284,12 +1284,6 @@ def gapped_by_map_segment_iter( if not allow_gaps and not map.complete: raise ValueError(f"gap(s) in map {map}") - # leave reorienting segments (e.g. if on rev strand) to the calling method - # it seems like that's the best bet, since it preserves given order of - # segments. 
- # - # todo make these methods private. - for span in map.spans: if span.lost: unknown = "?" if span.terminal or recode_gaps else "-" @@ -1304,12 +1298,10 @@ def gapped_by_map_motif_iter(self, map): yield from segment def gapped_by_map(self, map, recode_gaps=False): - # todo gah do we propagate annotations here? segments = self.gapped_by_map_segment_iter(map, True, recode_gaps) - new = self.__class__( + return self.__class__( "".join(segments), name=self.name, check=False, info=self.info ) - return new def _mapped(self, map): # Called by generic __getitem__ @@ -1319,10 +1311,7 @@ def _mapped(self, map): def __repr__(self): myclass = f"{self.__class__.__name__}" myclass = myclass.split(".")[-1] - if len(self) > 10: - seq = f"{str(self)[:7]}... {len(self):,}" - else: - seq = str(self) + seq = f"{str(self)[:7]}... {len(self):,}" if len(self) > 10 else str(self) return f"{myclass}({seq})" def __getitem__(self, index): diff --git a/tests/test_core/test_annotation.py b/tests/test_core/test_annotation.py index d39bc7797..a1420f37c 100644 --- a/tests/test_core/test_annotation.py +++ b/tests/test_core/test_annotation.py @@ -91,17 +91,6 @@ def test_span(self): assert forward.reversed_relative_to(100) == reverse assert reverse.reversed_relative_to(100) == forward - def test_map(self): - """reversing a map with multiple spans should preserve span relative - order""" - forward = [Span(20, 30), Span(40, 50)] - fmap = FeatureMap(spans=forward, parent_length=100) - fmap_reversed = fmap.nucleic_reversed() - reverse = [Span(70, 80, reverse=True), Span(50, 60, reverse=True)] - rmap = FeatureMap(spans=reverse, parent_length=100) - for i in range(2): - self.assertEqual(fmap_reversed.spans[i], rmap.spans[i]) - @pytest.mark.parametrize("alignment", (False, True)) def test_constructing_collections(alignment): @@ -362,3 +351,13 @@ def test_features_invalid_seqid(cls): with pytest.raises(ValueError): # seqid does not exist list(seqs.get_features(name="gene1", seqid="blah")) + + +def 
test_map(): + """reversing a map with multiple spans should match hand-crafted""" + forward = [Span(20, 30), Span(40, 50)] + fmap = FeatureMap(spans=forward, parent_length=100) + fmap_reversed = fmap.nucleic_reversed() + reverse = [Span(50, 60), Span(70, 80)] + rmap = FeatureMap(spans=reverse, parent_length=100) + assert fmap_reversed.get_coordinates() == rmap.get_coordinates() diff --git a/tests/test_core/test_annotation_db.py b/tests/test_core/test_annotation_db.py index 8ef5ad45f..8a4e815ba 100644 --- a/tests/test_core/test_annotation_db.py +++ b/tests/test_core/test_annotation_db.py @@ -412,7 +412,7 @@ def test_feature_nucleic(): expect = seq[mmap] rcseq = seq.rc() - rmap = mmap.strict_nucleic_reversed() + rmap = mmap.nucleic_reversed() got = rcseq[rmap].rc() assert str(got) == str(expect) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index c8a91f79a..d0327f23c 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -542,19 +542,16 @@ def test_indelmap_to_feature_map(): assert mm.get_coordinates() == im.get_coordinates() -def test_indelmap_strict_nucleic_reversed(): +def test_indelmap_nucleic_reversed(): spans = [LostSpan(2), Span(2, 4), LostSpan(2), Span(4, 8), LostSpan(2)] kwargs = dict(spans=spans, parent_length=12) orig = IndelMap(**kwargs) - rev = orig.strict_nucleic_reversed() + rev = orig.nucleic_reversed() rev_spans = tuple(rev.spans) assert rev_spans[1].reverse == rev_spans[3].reverse == False old = orig.nucleic_reversed() old_spans = tuple(old.spans) - assert old_spans[1].reverse == old_spans[3].reverse == True - assert rev.get_coordinates() == [ - tuple(sorted(a)) for a in reversed(old.get_coordinates()) - ] + assert rev.get_coordinates() == old.get_coordinates() def test_indelmap_with_reverse_span(): @@ -578,7 +575,7 @@ def test_indelmap_no_gaps(): def test_get_coords(): """get_coordinates should return raw coordinates matching input""" spans = [(0, 9), (20, 32)] - map = 
FeatureMap(spans, parent_length=100) + map = FeatureMap(locations=spans, parent_length=100) coords = map.get_coordinates() assert coords == spans @@ -589,4 +586,4 @@ def test_get_coords_invalid_order(): # should work for reversed Maps too spans = [(32, 20), (9, 0)] with pytest.raises(ValueError): - FeatureMap(spans, parent_length=100) + FeatureMap(locations=spans, parent_length=100) diff --git a/tests/test_core/test_maps.py b/tests/test_core/test_maps.py index 275876598..54c3e4829 100644 --- a/tests/test_core/test_maps.py +++ b/tests/test_core/test_maps.py @@ -9,7 +9,7 @@ class MapTest(unittest.TestCase): def test_spans(self): # a simple two part map of length 10 - map = FeatureMap([(0, 5), (5, 10)], parent_length=10) + map = FeatureMap(locations=[(0, 5), (5, 10)], parent_length=10) # try different spans on the above map for (start, end), expected in [ ((0, 4), "[0:4]"), diff --git a/tests/test_core/test_sequence.py b/tests/test_core/test_sequence.py index a88a9ffbc..784b1f16e 100644 --- a/tests/test_core/test_sequence.py +++ b/tests/test_core/test_sequence.py @@ -2595,7 +2595,6 @@ def test_gapped_by_map_segment_iter(): moltype = get_moltype("dna") m, seq = moltype.make_seq("-TCC--AG").parse_out_gaps() g = list(seq.gapped_by_map_segment_iter(m, allow_gaps=True, recode_gaps=False)) - print(g) @pytest.mark.parametrize("rev", (False, True)) From 97acd2caf5fd1e16ccca3eab08e6e3bc5dea451d Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Wed, 6 Mar 2024 10:56:28 +1100 Subject: [PATCH 11/62] MAINT: deleted duplicated method plus other issues raised by codacy --- src/cogent3/core/location.py | 5 ----- src/cogent3/core/sequence.py | 8 ++++---- tests/test_core/test_location.py | 4 ++-- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index c01b55bb7..476b93e7c 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1343,11 +1343,6 @@ def get_gap_coordinates(self): return 
gap_pos - def nucleic_reversed(self): - """Same location on reversed parent""" - spans = [s.reversed_relative_to(self.parent_length) for s in self._spans] - return self.__class__(spans=spans, parent_length=self.parent_length) - def nucleic_reversed(self): """map for a sequence that has itself been reversed and complemented diff --git a/src/cogent3/core/sequence.py b/src/cogent3/core/sequence.py index 938caba55..b2e782ccb 100644 --- a/src/cogent3/core/sequence.py +++ b/src/cogent3/core/sequence.py @@ -1279,12 +1279,12 @@ def with_masked_annotations( return new def gapped_by_map_segment_iter( - self, map, allow_gaps=True, recode_gaps=False + self, segment_map, allow_gaps=True, recode_gaps=False ) -> str: - if not allow_gaps and not map.complete: - raise ValueError(f"gap(s) in map {map}") + if not allow_gaps and not segment_map.complete: + raise ValueError(f"gap(s) in map {segment_map}") - for span in map.spans: + for span in segment_map.spans: if span.lost: unknown = "?" if span.terminal or recode_gaps else "-" seg = unknown * span.length diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index d0327f23c..61273e9d3 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -575,8 +575,8 @@ def test_indelmap_no_gaps(): def test_get_coords(): """get_coordinates should return raw coordinates matching input""" spans = [(0, 9), (20, 32)] - map = FeatureMap(locations=spans, parent_length=100) - coords = map.get_coordinates() + fmap = FeatureMap(locations=spans, parent_length=100) + coords = fmap.get_coordinates() assert coords == spans From a2bc51c0cae977c5bc00950689b9f8c9af33470c Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Mon, 22 Apr 2024 07:14:18 +1000 Subject: [PATCH 12/62] TST: convert to pytest some tests test related to Maps --- tests/test_core/test_alignment.py | 271 ++++++++++++++++-------------- tests/test_core/test_location.py | 52 +++--- 2 files changed, 175 insertions(+), 148 deletions(-) diff 
--git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index f2821df59..268578a64 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -1070,93 +1070,6 @@ def is_list(x): {"a": "AA", "b": "A-", "c": "AA"}, ) - def test_no_degenerates(self): - """no_degenerates correctly excludes columns containing IUPAC ambiguity codes""" - data = { - "s1": "AAA CCC GGG TTT".replace(" ", ""), - "s2": "CCC GGG T-T AAA".replace(" ", ""), - "s3": "GGR YTT AAA CCC".replace(" ", ""), - } - aln = self.Class(data=data, moltype=DNA) - - # motif length of 1, defaults - no gaps allowed - result = aln.no_degenerates().to_dict() - expect = { - "s1": "AA CC GG TTT".replace(" ", ""), - "s2": "CC GG TT AAA".replace(" ", ""), - "s3": "GG TT AA CCC".replace(" ", ""), - } - self.assertEqual(result, expect) - - # allow gaps - result = aln.no_degenerates(allow_gap=True).to_dict() - expect = { - "s1": "AA CC GGG TTT".replace(" ", ""), - "s2": "CC GG T-T AAA".replace(" ", ""), - "s3": "GG TT AAA CCC".replace(" ", ""), - } - self.assertEqual(result, expect) - - # motif length of 3, defaults - no gaps allowed - result = aln.no_degenerates(motif_length=3).to_dict() - expect = { - "s1": "TTT".replace(" ", ""), - "s2": "AAA".replace(" ", ""), - "s3": "CCC".replace(" ", ""), - } - self.assertEqual(result, expect) - - # allow gaps - result = aln.no_degenerates(motif_length=3, allow_gap=True).to_dict() - expect = { - "s1": "GGG TTT".replace(" ", ""), - "s2": "T-T AAA".replace(" ", ""), - "s3": "AAA CCC".replace(" ", ""), - } - self.assertEqual(result, expect) - - # raises ValueError if a default moltype -- with no - # degen characters -- is used - aln = self.Class(data=data) - self.assertRaises(ValueError, aln.no_degenerates) - - def test_omit_gap_pos(self): - """Alignment omit_gap_pos should return alignment w/o positions of gaps""" - aln = self.end_gaps - # first, check behavior when we're just acting on the cols (and not - # trying to delete the 
naughty seqs). - - # default should strip out cols that are 100% gaps - result = aln.omit_gap_pos() - self.assertEqual(result.to_dict(), {"a": "-ABC", "b": "CBA-", "c": "-DEF"}) - # if allowed_gap_frac is 1, shouldn't delete anything - self.assertEqual( - aln.omit_gap_pos(1).to_dict(), - {"a": "--A-BC-", "b": "-CB-A--", "c": "--D-EF-"}, - ) - # if allowed_gap_frac is 0, should strip out any cols containing gaps - self.assertEqual( - aln.omit_gap_pos(0).to_dict(), {"a": "AB", "b": "BA", "c": "DE"} - ) - # intermediate numbers should work as expected - self.assertEqual( - aln.omit_gap_pos(0.4).to_dict(), {"a": "ABC", "b": "BA-", "c": "DEF"} - ) - self.assertEqual( - aln.omit_gap_pos(0.7).to_dict(), {"a": "-ABC", "b": "CBA-", "c": "-DEF"} - ) - - # when we increase the number of sequences to 6, more differences - # start to appear. - new_aln_data = aln.named_seqs.copy() - new_aln_data["d"] = "-------" - new_aln_data["e"] = "XYZXYZX" - new_aln_data["f"] = "AB-CDEF" - aln = self.Class(new_aln_data) - - # if no gaps are allowed, we get None - self.assertEqual(aln.omit_gap_pos(0), None) - def test_omit_gap_pos2(self): """consistency with different motif_length values""" data = { @@ -1416,41 +1329,6 @@ def test_variable_positions(self): self.assertEqual(aln.variable_positions(include_gap_motif=True), []) self.assertEqual(aln.variable_positions(include_gap_motif=False), []) - def test_to_type(self): - """correctly interconvert between alignment types""" - new_seqs = {"seq1": "ACGTACGTA", "seq2": "ACCGAA---", "seq3": "ACGTACGTT"} - array_align = self.Class == ArrayAlignment - # when array_align arg matches instance class, no conversion - # and get back self - aln = self.Class(data=new_seqs) - new = aln.to_type(array_align=array_align) - self.assertEqual(id(aln), id(new)) - - # when array_align arg does not match, should get back the opposite type - new = aln.to_type(array_align=not array_align) - self.assertFalse(isinstance(new, self.Class)) - - # we should be able to 
specify moltype and alignment - new = aln.to_type(array_align=not array_align, moltype=DNA) - self.assertEqual(new.to_dict(), new_seqs) - # and translate - self.assertEqual( - new.get_translation().to_dict(), - {"seq1": "TYV", "seq3": "TYV", "seq2": "TE-"}, - ) - - # should work on ArrayAlign when just moltype changes - new_seqs = {"seq1": "ACGTACGTA", "seq2": "ACCGAA---"} - aln = self.Class(data=new_seqs) - new = aln.to_type(array_align=array_align, moltype=DNA) - new = new.no_degenerates() # this should not fail! - self.assertEqual(len(new), len(aln) - 3) - - # should correctly apply to existing moltype - aln = self.Class(data=new_seqs, moltype=DNA) - new = aln.to_type(array_align=not array_align) - self.assertEqual(aln.moltype, new.moltype) - def test_to_type_info(self): """interconverting between alignment types preserves info attribute""" new_seqs = {"seq1": "ACGTACGTA", "seq2": "ACCGAA---", "seq3": "ACGTACGTT"} @@ -2166,6 +2044,57 @@ def test_model_aln_to_model_aln(self): self.assertEqual(self.r1.name, "x") +def test_featuremap_slice_aligned(): + from cogent3.core.alignment import Aligned + from cogent3.core.location import FeatureMap, Span + + raw_seq = "ACGGTAAAG" + im, seq = DNA.make_seq(raw_seq).parse_out_gaps() + ia = Aligned(im, seq) + length = len(raw_seq) + fmap = FeatureMap(spans=[Span(2, 4), Span(5, 8)], parent_length=length) + got = ia[fmap] + assert str(got) == "GGAAA" + + +@pytest.mark.parametrize("cls", (ArrayAlignment, Alignment)) +def test_to_type(cls): + """correctly interconvert between alignment types""" + new_seqs = {"seq1": "ACGTACGTA", "seq2": "ACCGAA---", "seq3": "ACGTACGTT"} + array_align = cls == ArrayAlignment + # when array_align arg matches instance class, no conversion + # and get back self + aln = cls(data=new_seqs) + new = aln.to_type(array_align=array_align) + assert id(aln) == id(new) + + # when array_align arg does not match, should get back the opposite type + new = aln.to_type(array_align=not array_align) + assert not 
isinstance(new, cls) + + # we should be able to specify moltype and alignment + new = aln.to_type(array_align=not array_align, moltype=DNA) + assert new.to_dict() == new_seqs + # and translate + assert new.get_translation().to_dict() == { + "seq1": "TYV", + "seq3": "TYV", + "seq2": "TE-", + } + + # should work on ArrayAlign when just moltype changes + new_seqs = {"seq1": "ACGTACGTA", "seq2": "ACCGAA---"} + aln = cls(data=new_seqs) + new = aln.to_type(array_align=array_align, moltype=DNA) + new = new.no_degenerates() # this should not fail! + assert len(new) == len(aln) - 3 + + # should correctly apply to existing moltype + aln = cls(data=new_seqs, moltype=DNA) + new = aln.to_type(array_align=not array_align) + assert aln.moltype == new.moltype + + @pytest.mark.parametrize( "moltype,array_align", tuple(itertools.product(["rna", "dna", "protein"], [True, False])), @@ -3606,6 +3535,104 @@ def test_array_align_error_with_mixed_length(): make_aligned_seqs(data=data) +@pytest.mark.parametrize("cls", (Alignment, ArrayAlignment)) +def test_omit_gap_pos(cls): + """Alignment omit_gap_pos should return alignment w/o positions of gaps""" + aln = cls({"a": "--A-BC-", "b": "-CB-A--", "c": "--D-EF-"}, names=["a", "b", "c"]) + # first, check behavior when we're just acting on the cols (and not + # trying to delete the naughty seqs). 
+ + # default should strip out cols that are 100% gaps + result = aln.omit_gap_pos() + assert result.to_dict() == {"a": "-ABC", "b": "CBA-", "c": "-DEF"} + # if allowed_gap_frac is 1, shouldn't delete anything + assert aln.omit_gap_pos(1).to_dict() == { + "a": "--A-BC-", + "b": "-CB-A--", + "c": "--D-EF-", + } + + # if allowed_gap_frac is 0, should strip out any cols containing gaps + assert aln.omit_gap_pos(0).to_dict() == {"a": "AB", "b": "BA", "c": "DE"} + # intermediate numbers should work as expected + assert aln.omit_gap_pos(0.4).to_dict() == {"a": "ABC", "b": "BA-", "c": "DEF"} + assert aln.omit_gap_pos(0.7).to_dict() == {"a": "-ABC", "b": "CBA-", "c": "-DEF"} + + # when we increase the number of sequences to 6, more differences + # start to appear. + new_aln_data = aln.named_seqs.copy() + new_aln_data["d"] = "-------" + new_aln_data["e"] = "XYZXYZX" + new_aln_data["f"] = "AB-CDEF" + aln = cls(new_aln_data) + + # if no gaps are allowed, we get None + assert aln.omit_gap_pos(0) is None + + +@pytest.mark.parametrize("cls", (Alignment, ArrayAlignment)) +def test_no_degenerates(cls): + """no_degenerates correctly excludes columns containing IUPAC ambiguity codes""" + data = { + "s1": "AAA CCC GGG TTT".replace(" ", ""), + "s2": "CCC GGG T-T AAA".replace(" ", ""), + "s3": "GGR YTT AAA CCC".replace(" ", ""), + } + aln = cls(data=data, moltype=DNA) + + # motif length of 1, defaults - no gaps allowed + result = aln.no_degenerates().to_dict() + expect = { + "s1": "AA CC GG TTT".replace(" ", ""), + "s2": "CC GG TT AAA".replace(" ", ""), + "s3": "GG TT AA CCC".replace(" ", ""), + } + assert result == expect + + # allow gaps + result = aln.no_degenerates(allow_gap=True).to_dict() + expect = { + "s1": "AA CC GGG TTT".replace(" ", ""), + "s2": "CC GG T-T AAA".replace(" ", ""), + "s3": "GG TT AAA CCC".replace(" ", ""), + } + assert result == expect + + # motif length of 3, defaults - no gaps allowed + result = aln.no_degenerates(motif_length=3).to_dict() + expect = { + 
"s1": "TTT".replace(" ", ""), + "s2": "AAA".replace(" ", ""), + "s3": "CCC".replace(" ", ""), + } + assert result == expect + + # allow gaps + result = aln.no_degenerates(motif_length=3, allow_gap=True).to_dict() + expect = { + "s1": "GGG TTT".replace(" ", ""), + "s2": "T-T AAA".replace(" ", ""), + "s3": "AAA CCC".replace(" ", ""), + } + assert result == expect + + +@pytest.mark.parametrize("cls", (Alignment, ArrayAlignment)) +@pytest.mark.parametrize("moltype", ("bytes", "text")) +def test_no_degenerates_invalid_moltype(cls, moltype): + # raises ValueError if a default moltype -- with no + # degen characters -- is used + data = { + "s1": "AAA CCC GGG TTT".replace(" ", ""), + "s2": "CCC GGG T-T AAA".replace(" ", ""), + "s3": "GGR YTT AAA CCC".replace(" ", ""), + } + + aln = cls(data=data, moltype=moltype) + with pytest.raises(ValueError): + aln.no_degenerates() + + @pytest.mark.parametrize("cls", (Alignment, ArrayAlignment)) @pytest.mark.parametrize("calc", ("hamming", None)) def test_quick_tree(cls, calc, brca1_data): diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 61273e9d3..96ccd390b 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -269,32 +269,6 @@ def test_get_gap_coords(self): got = m.get_gap_coordinates() self.assertEqual(dict(got), {0: 1, 2: 2, 4: 1, 7: 2}) - def test_gap_coords_to_map(self): - """construct a Map from coordinates of gap alone""" - m, seq = DNA.make_seq("-AC--GT-TTA--").parse_out_gaps() - gap_coords = {0: 1, 2: 2, 4: 1, 7: 2} - seqlen = 70 - got = gap_coords_to_map(gap_coords, seqlen) - self.assertEqual(len(got), seqlen + sum(gap_coords.values())) - - gap_coords = {5: 2, 17: 3, 10: 2} - seqlen = 20 - got = gap_coords_to_map(gap_coords, seqlen) - self.assertEqual(len(got), sum(gap_coords.values()) + seqlen) - - # roundtrip from Map.get_gap_coordinates() - self.assertEqual(dict(got.get_gap_coordinates()), gap_coords) - - # and no gaps - m, seq = 
DNA.make_seq("ACGTTTA").parse_out_gaps() - got = gap_coords_to_map({}, len(seq)) - self.assertEqual(len(got), len(m)) - self.assertEqual(got.get_coordinates(), m.get_coordinates()) - - # and gaps outside sequence - with self.assertRaises(ValueError): - got = gap_coords_to_map({20: 1}, len(seq)) - @pytest.mark.parametrize("cls", (IndelMap, FeatureMap)) def test_map_plus_position(cls): @@ -587,3 +561,29 @@ def test_get_coords_invalid_order(): spans = [(32, 20), (9, 0)] with pytest.raises(ValueError): FeatureMap(locations=spans, parent_length=100) + + +def test_gap_coords_to_map(): + """construct a Map from coordinates of gap alone""" + gap_coords = {0: 1, 2: 2, 4: 1, 7: 2} + seqlen = 70 + got = gap_coords_to_map(gap_coords, seqlen) + assert len(got) == seqlen + sum(gap_coords.values()) + + gap_coords = {5: 2, 17: 3, 10: 2} + seqlen = 20 + got = gap_coords_to_map(gap_coords, seqlen) + assert len(got) == sum(gap_coords.values()) + seqlen + + # roundtrip from Map.get_gap_coordinates() + assert dict(got.get_gap_coordinates()) == gap_coords + + # and no gaps + m, seq = DNA.make_seq("ACGTTTA").parse_out_gaps() + got = gap_coords_to_map({}, len(seq)) + assert len(got) == len(m) + assert got.get_coordinates() == m.get_coordinates() + + # and gaps outside sequence + with pytest.raises(ValueError): + gap_coords_to_map({20: 1}, len(seq)) From 1c393e4e681f7bae04c8d3f3ec7a664fb7ad79ba Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Mon, 22 Apr 2024 07:14:26 +1000 Subject: [PATCH 13/62] TST: convert a test to pytest --- tests/test_core/test_alignment.py | 98 ++++++++++++++++--------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index 268578a64..1f6c394d3 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -455,54 +455,6 @@ def test_add_info(self): align = align1 + align2 self.assertEqual(align.info["key"], "foo") - def test_add_seqs(self): - 
"""add_seqs should return an alignment with the new sequences appended or inserted""" - data = [("name1", "AAA"), ("name2", "AAA"), ("name3", "AAA"), ("name4", "AAA")] - data1 = [("name1", "AAA"), ("name2", "AAA")] - data2 = [("name3", "AAA"), ("name4", "AAA")] - data3 = [("name5", "BBB"), ("name6", "CCC")] - aln = self.Class(data) - aln3 = self.Class(data3) - - out_aln = aln.add_seqs(aln3) - # test append at the end - self.assertEqual(str(out_aln), str(self.Class(data + data3))) - - out_aln = aln.add_seqs(aln3, before_name="name3") - self.assertEqual( - str(out_aln), str(self.Class(data1 + data3 + data2)) - ) # test insert before - - out_aln = aln.add_seqs(aln3, after_name="name2") - self.assertEqual( - str(out_aln), str(self.Class(data1 + data3 + data2)) - ) # test insert after - - out_aln = aln.add_seqs(aln3, before_name="name1") - # test if insert before first seq works - self.assertEqual(str(out_aln), str(self.Class(data3 + data))) - - out_aln = aln.add_seqs(aln3, after_name="name4") - # test if insert after last seq works - self.assertEqual(str(out_aln), str(self.Class(data + data3))) - - self.assertRaises( - ValueError, aln.add_seqs, aln3, before_name="name5" - ) # wrong after/before name - self.assertRaises( - ValueError, aln.add_seqs, aln3, after_name="name5" - ) # wrong after/before name - - if isinstance(aln, Alignment) or isinstance(aln, ArrayAlignment): - self.assertRaises((DataError, ValueError), aln.add_seqs, aln3 + aln3) - else: - exp = set([seq for name, seq in data]) - exp.update([seq + seq for name, seq in data3]) - got = set() - for seq in aln.add_seqs(aln3 + aln3).seqs: - got.update([str(seq).strip()]) - self.assertEqual(got, exp) - def test_add_seqs_info(self): """add_seqs should preserve info attribute""" data = [("name1", "AAA"), ("name2", "AAA"), ("name3", "AAA"), ("name4", "AAA")] @@ -3535,6 +3487,56 @@ def test_array_align_error_with_mixed_length(): make_aligned_seqs(data=data) +@pytest.mark.parametrize("cls", (Alignment, ArrayAlignment, 
SequenceCollection)) +def test_add_seqs(cls): + """add_seqs should return an alignment with the new sequences appended or inserted""" + data = [("name1", "AAA"), ("name2", "AAA"), ("name3", "AAA"), ("name4", "AAA")] + data1 = [("name1", "AAA"), ("name2", "AAA")] + data2 = [("name3", "AAA"), ("name4", "AAA")] + data3 = [("name5", "BBB"), ("name6", "CCC")] + aln = cls(data) + aln3 = cls(data3) + + out_aln = aln.add_seqs(aln3) + # test append at the end + assert str(out_aln) == str(cls(data + data3)) + + out_aln = aln.add_seqs(aln3, before_name="name3") + assert str(out_aln) == str(cls(data1 + data3 + data2)) + + # test insert before + + out_aln = aln.add_seqs(aln3, after_name="name2") + assert str(out_aln) == str(cls(data1 + data3 + data2)) # test insert after + + out_aln = aln.add_seqs(aln3, before_name="name1") + # test if insert before first seq works + assert str(out_aln) == str(cls(data3 + data)) + + out_aln = aln.add_seqs(aln3, after_name="name4") + # test if insert after last seq works + assert str(out_aln) == str(cls(data + data3)) + + with pytest.raises(ValueError): + # wrong after/before name + aln.add_seqs(aln3, before_name="name5") + + with pytest.raises(ValueError): + # wrong after/before name + aln.add_seqs(aln3, after_name="name5") + + if isinstance(aln, Alignment) or isinstance(aln, ArrayAlignment): + with pytest.raises((DataError, ValueError)): + aln.add_seqs(aln3 + aln3) + else: + exp = set([seq for name, seq in data]) + exp.update([seq + seq for name, seq in data3]) + got = set() + for seq in aln.add_seqs(aln3 + aln3).seqs: + got.update([str(seq).strip()]) + assert got == exp + + @pytest.mark.parametrize("cls", (Alignment, ArrayAlignment)) def test_omit_gap_pos(cls): """Alignment omit_gap_pos should return alignment w/o positions of gaps""" From e844a259c226370d74945872cb2cdc5de4558cad Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Mon, 22 Apr 2024 07:22:12 +1000 Subject: [PATCH 14/62] API: delete IndelMap relative and absolute position methods 
--- src/cogent3/core/alignment.py | 2 +- src/cogent3/core/location.py | 31 ------------------------------- tests/test_core/test_location.py | 5 ++--- 3 files changed, 3 insertions(+), 35 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index 82cd6d6d9..3043e7dd3 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -5367,7 +5367,7 @@ def get_features( if feature["seqid"]: raise RuntimeError(f"{on_alignment=} {feature=}") if seq_map is None: - seq_map = self.seqs[0].map + seq_map = self.seqs[0].map.to_feature_map() *_, strand = self.seqs[0].data.parent_coordinates() spans = numpy.array(feature["spans"]) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 476b93e7c..25409130f 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1450,37 +1450,6 @@ def inverse(self): return self.__class__(spans=new_spans, parent_length=len(self)) - T = Union[ndarray, int] - - def absolute_position(self, rel_pos: T) -> T: - """converts rel_pos into an absolute position - - Raises - ------ - raises ValueError if rel_pos < 0 - """ - check = array([rel_pos], dtype=int) if isinstance(rel_pos, int) else rel_pos - if check.min() < 0: - raise ValueError(f"must positive, not {rel_pos=}") - - if len(self) == self.parent_length: - # handle case of reversed here? 
- return rel_pos - - return self.start + rel_pos - - def relative_position(self, abs_pos: T) -> T: - """converts abs_pos into an relative position - - Raises - ------ - raises ValueError if abs_pos < 0 - """ - check = array([abs_pos], dtype=int) if isinstance(abs_pos, int) else abs_pos - if check.min() < 0: - raise ValueError(f"must positive, not {abs_pos=}") - return abs_pos - self.start - def get_covering_span(self): span = self.start, self.end return self.__class__(locations=[span], parent_length=self.parent_length) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 96ccd390b..bcb9f1db1 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -270,15 +270,14 @@ def test_get_gap_coords(self): self.assertEqual(dict(got), {0: 1, 2: 2, 4: 1, 7: 2}) -@pytest.mark.parametrize("cls", (IndelMap, FeatureMap)) -def test_map_plus_position(cls): +def test_map_plus_position(): # seq is 9 long # plus coords 012345678 # +slice ** # plus seq AAACCCTGG # orig = Aligned(*DNA.make_seq("AAACCCTGG", name="a").parse_out_gaps()) - orig = cls(locations=[(0, 9)], parent_length=9) + orig = FeatureMap(locations=[(0, 9)], parent_length=9) assert orig.absolute_position(2) == 2 assert orig.absolute_position(6) == 6 From 85440d45d9365b5e1cfc4003eab791a63900a49e Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Mon, 22 Apr 2024 07:32:43 +1000 Subject: [PATCH 15/62] API: Add class methods to new map classes for construction from old types [NEW] .from_locations() and .from_spans() methods to both IndelMap and FeatureMap classes. 
--- src/cogent3/align/traceback.py | 2 +- src/cogent3/core/alignment.py | 8 +- src/cogent3/core/location.py | 125 ++++++++++++++++++++------ src/cogent3/core/sequence.py | 8 +- src/cogent3/parse/gbseq.py | 4 +- tests/test_core/test_annotation_db.py | 4 +- tests/test_core/test_location.py | 18 ++-- tests/test_core/test_maps.py | 2 +- 8 files changed, 123 insertions(+), 48 deletions(-) diff --git a/src/cogent3/align/traceback.py b/src/cogent3/align/traceback.py index 16a8d0ceb..032750ad9 100644 --- a/src/cogent3/align/traceback.py +++ b/src/cogent3/align/traceback.py @@ -60,7 +60,7 @@ def map_traceback(aligned_positions): (starts, ends, gap_vectors, alignment_len) = gap_traceback(aligned_positions) # print 'gv', gap_vectors maps = [ - IndelMap(locations=gv, parent_length=alignment_len).inverse() + IndelMap.from_locations(locations=gv, parent_length=alignment_len).inverse() for gv in gap_vectors ] return (starts, ends, maps) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index 3043e7dd3..7178ad048 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -2140,7 +2140,7 @@ def __init__(self, map, data, length=None): # Unlike the normal map constructor, here we take a list of pairs of # alignment coordinates, NOT a list of pairs of sequence coordinates if isinstance(map, list): - map = IndelMap(locations=map, parent_length=length).inverse() + map = IndelMap.from_locations(locations=map, parent_length=length).inverse() self.map = map self.data = data if hasattr(data, "info"): @@ -4835,7 +4835,7 @@ def filtered(self, predicate, motif_length=1, drop_remainder=True, **kwargs): locations = [(gv[i], gv[i + 1]) for i in range(0, len(gv), 2)] - keep = IndelMap(locations=locations, parent_length=len(self)) + keep = IndelMap.from_locations(locations=locations, parent_length=len(self)) return self.gapped_by_map(keep, info=self.info) def get_seq(self, seqname): @@ -5231,7 +5231,9 @@ def make_feature( # there's no sequence to bind 
to, the feature is directly on self revd = feature.pop("strand", None) == "-" feature["strand"] = "-" if revd else "+" - fmap = FeatureMap(parent_length=len(self), locations=feature.pop("spans")) + fmap = FeatureMap.from_locations( + locations=feature.pop("spans"), parent_length=len(self) + ) if revd: fmap = fmap.nucleic_reversed() return Feature(parent=self, map=fmap, **feature) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 25409130f..eee588243 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -29,7 +29,7 @@ from itertools import chain from typing import Iterator, List, Optional, Sequence, Tuple, Union -from numpy import array, ndarray +import numpy from cogent3._version import __version__ from cogent3.util import warning as c3warn @@ -85,6 +85,8 @@ def as_map(slice, length, cls): assert (step or 1) == 1 # since we disallow step, a reverse slice means an empty series locations = [] if lo > hi else [(lo, hi)] + # IndelMap has this class method + cls = getattr(cls, "from_locations", cls) return cls(locations=locations, parent_length=length) @@ -802,7 +804,7 @@ def to_rich_dict(self): """returns dicts for contained spans [dict(), ..]""" spans = [s.to_rich_dict() for s in self.spans] data = copy.deepcopy(self._serialisable) - data.pop("locations") + data.pop("locations", None) data["spans"] = spans data["type"] = get_object_provenance(self) data["version"] = __version__ @@ -839,7 +841,7 @@ def zeroed(self): return zeroed - T = Union[ndarray, int] + T = Union[numpy.ndarray, int] def absolute_position(self, rel_pos: T) -> T: """converts rel_pos into an absolute position @@ -848,7 +850,9 @@ def absolute_position(self, rel_pos: T) -> T: ------ raises ValueError if rel_pos < 0 """ - check = array([rel_pos], dtype=int) if isinstance(rel_pos, int) else rel_pos + check = ( + numpy.array([rel_pos], dtype=int) if isinstance(rel_pos, int) else rel_pos + ) if check.min() < 0: raise ValueError(f"must positive, not 
{rel_pos=}") @@ -865,7 +869,9 @@ def relative_position(self, abs_pos: T) -> T: ------ raises ValueError if abs_pos < 0 """ - check = array([abs_pos], dtype=int) if isinstance(abs_pos, int) else abs_pos + check = ( + numpy.array([abs_pos], dtype=int) if isinstance(abs_pos, int) else abs_pos + ) if check.min() < 0: raise ValueError(f"must positive, not {abs_pos=}") return abs_pos - self.start @@ -1147,6 +1153,20 @@ def to_rich_dict(self): def inverse(self): ... + @classmethod + def from_locations(cls, locations, parent_length, **kwargs): + if len(locations): + spans = _spans_from_locations(locations, parent_length=parent_length) + else: + spans = () + + return cls.from_spans(spans=spans, parent_length=parent_length, **kwargs) + + @classmethod + @abstractmethod + def from_spans(cls, spans, parent_length, **kwargs): + ... + def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSpan]]: if not len(locations): @@ -1195,6 +1215,29 @@ def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSp return tuple(spans) +O = tuple[numpy.ndarray, Sequence] + + +def map_to_gap_coords( + indel_map: Map, dtype: Union[type, str] = numpy.int32 +) -> numpy.ndarray: + """returns coordinates of sequence gaps + + Parameters + ---------- + indel_map + old style Map object + dtype + string or numpy type, default to 32-bit integer + + Returns + ------- + numpy.array[[gap index, gap length],...]) + """ + # Assuming the maximum integer is < 2^31 + return numpy.array(indel_map.get_gap_coordinates(), dtype=dtype) + + T = Union[List[int], Tuple[int]] @@ -1203,7 +1246,6 @@ class IndelMap(MapABC): """store locations of deletions in a Aligned sequence""" spans: dataclasses.InitVar[Optional[tuple]] = () - locations: dataclasses.InitVar[Optional[Sequence[T]]] = None termini_unknown: dataclasses.InitVar[bool] = dataclasses.field(default=False) parent_length: int = 0 start: Optional[int] = dataclasses.field(init=False, default=0) @@ -1214,10 +1256,8 @@ class 
IndelMap(MapABC): default=(), init=False ) - def __post_init__(self, spans, locations, termini_unknown): - if locations is not None and len(locations): - spans = _spans_from_locations(locations, parent_length=self.parent_length) - elif isinstance(spans, property): + def __post_init__(self, spans, termini_unknown): + if isinstance(spans, property): # This clause is due a known issue with dataclasses. # As we have a spans property, the default spans value is # ignored, so we have to check for its value being a property @@ -1258,6 +1298,12 @@ def __post_init__(self, spans, locations, termini_unknown): self._spans = tuple(spans) + @classmethod + def from_spans(cls, spans, parent_length, termini_unknown=False): + return cls( + spans=spans, parent_length=parent_length, termini_unknown=termini_unknown + ) + def __getitem__(self, slice): # A possible shorter map at the same level new_map = as_map(slice, len(self), self.__class__) @@ -1291,7 +1337,7 @@ def __repr__(self): @property def offsets(self): - return [0] + array([s.length for s in self._spans[:-1]]).cumsum().tolist() + return [0] + numpy.array([s.length for s in self._spans[:-1]]).cumsum().tolist() def gaps(self): """The gaps (lost spans) in this map""" @@ -1301,7 +1347,9 @@ def gaps(self): if s.lost: locations.append((offset, offset + s.length)) offset += s.length - return self.__class__(locations=locations, parent_length=len(self)) + return self.__class__.from_locations( + locations=locations, parent_length=len(self) + ) def nongap(self): """ungappeed segments in this map""" @@ -1311,7 +1359,9 @@ def nongap(self): if not s.lost: locations.append((offset, offset + s.length)) offset += s.length - return self.__class__(locations=locations, parent_length=len(self)) + return self.__class__.from_locations( + locations=locations, parent_length=len(self) + ) @property def spans(self) -> Iterator[Span]: @@ -1370,7 +1420,7 @@ def to_rich_dict(self): data = copy.deepcopy( {k: v for k, v in self._serialisable.items() if k != 
"spans"} ) - data.pop("locations") + data.pop("locations", None) data["spans"] = spans data["type"] = get_object_provenance(self) data["version"] = __version__ @@ -1452,7 +1502,9 @@ def inverse(self): def get_covering_span(self): span = self.start, self.end - return self.__class__(locations=[span], parent_length=self.parent_length) + return self.__class__.from_locations( + locations=[span], parent_length=self.parent_length + ) def zeroed(self): """returns a new instance with the first span starting at 0 @@ -1511,7 +1563,6 @@ class FeatureMap(MapABC): """A map holds a list of spans.""" spans: dataclasses.InitVar[Optional[tuple]] = () - locations: dataclasses.InitVar[Optional[Sequence[T]]] = None parent_length: int = 0 offsets: list[int] = dataclasses.field(init=False, repr=False) useful: bool = dataclasses.field(init=False, repr=False, default=False) @@ -1521,12 +1572,10 @@ class FeatureMap(MapABC): default=(), init=False ) - def __post_init__(self, spans, locations): + def __post_init__(self, spans): assert self.parent_length is not None - if locations is not None and len(locations): - spans = _spans_from_locations(locations, parent_length=self.parent_length) - elif isinstance(spans, property): + if isinstance(spans, property): # This clause is due a known issue with dataclasses. 
# As we have a spans property, the default spans value is # ignored, so we have to check for its value being a property @@ -1554,6 +1603,10 @@ def __post_init__(self, spans, locations): self._spans = tuple(spans) self.length = posn + @classmethod + def from_spans(cls, spans, parent_length): + return cls(spans=spans, parent_length=parent_length) + def __len__(self): return self.length @@ -1599,7 +1652,9 @@ def _with_termini_unknown(self): def get_covering_span(self): span = (self.start, self.end) - return self.__class__(locations=[span], parent_length=self.parent_length) + return self.__class__.from_locations( + locations=[span], parent_length=self.parent_length + ) def covered(self): """>>> Map([(10,20), (15, 25), (80, 90)]).covered().spans @@ -1628,7 +1683,9 @@ def covered(self): last_x = x last_y = y assert y == 0 - return self.__class__(locations=result, parent_length=self.parent_length) + return self.__class__.from_locations( + locations=result, parent_length=self.parent_length + ) def nucleic_reversed(self): """map for a sequence that has itself been reversed and complemented @@ -1671,7 +1728,9 @@ def gaps(self): if s.lost: locations.append((offset, offset + s.length)) offset += s.length - return self.__class__(locations=locations, parent_length=len(self)) + return self.__class__.from_locations( + locations=locations, parent_length=len(self) + ) def shadow(self): """The 'negative' map of the spans not included in this map""" @@ -1684,7 +1743,9 @@ def nongap(self): if not s.lost: locations.append((offset, offset + s.length)) offset += s.length - return self.__class__(locations=locations, parent_length=len(self)) + return self.__class__.from_locations( + locations=locations, parent_length=len(self) + ) def without_gaps(self): return self.__class__( @@ -1782,7 +1843,7 @@ def to_rich_dict(self): """returns dicts for contained spans [dict(), ..]""" spans = [s.to_rich_dict() for s in self.spans] data = copy.deepcopy(self._serialisable) - data.pop("locations") + 
data.pop("locations", None) data["spans"] = spans data["type"] = get_object_provenance(self) data["version"] = __version__ @@ -1819,7 +1880,7 @@ def zeroed(self): return zeroed - T = Union[ndarray, int] + T = Union[numpy.ndarray, int] def absolute_position(self, rel_pos: T) -> T: """converts rel_pos into an absolute position @@ -1828,7 +1889,9 @@ def absolute_position(self, rel_pos: T) -> T: ------ raises ValueError if rel_pos < 0 """ - check = array([rel_pos], dtype=int) if isinstance(rel_pos, int) else rel_pos + check = ( + numpy.array([rel_pos], dtype=int) if isinstance(rel_pos, int) else rel_pos + ) if check.min() < 0: raise ValueError(f"must positive, not {rel_pos=}") @@ -1845,7 +1908,9 @@ def relative_position(self, abs_pos: T) -> T: ------ raises ValueError if abs_pos < 0 """ - check = array([abs_pos], dtype=int) if isinstance(abs_pos, int) else abs_pos + check = ( + numpy.array([abs_pos], dtype=int) if isinstance(abs_pos, int) else abs_pos + ) if check.min() < 0: raise ValueError(f"must positive, not {abs_pos=}") return abs_pos - self.start @@ -1866,7 +1931,9 @@ def gap_coords_to_map(gaps_lengths: dict, seq_length: int) -> FeatureMap: """ if not gaps_lengths: - return FeatureMap(locations=[(0, seq_length)], parent_length=seq_length) + return FeatureMap.from_locations( + locations=[(0, seq_length)], parent_length=seq_length + ) spans = [] last = pos = 0 diff --git a/src/cogent3/core/sequence.py b/src/cogent3/core/sequence.py index 022fd5a57..f601d876e 100644 --- a/src/cogent3/core/sequence.py +++ b/src/cogent3/core/sequence.py @@ -1060,7 +1060,7 @@ def make_feature(self, feature: FeatureDataType, *args) -> Feature: continue new_spans.append(new.tolist()) - fmap = FeatureMap(locations=new_spans, parent_length=len(self)) + fmap = FeatureMap.from_locations(locations=new_spans, parent_length=len(self)) if pre or post: # create a lost span to represent the segment missing from # the instance @@ -1256,7 +1256,7 @@ def with_masked_annotations( seqid=self.name, 
name=None, biotype=None, - map=FeatureMap(locations=[], parent_length=len(self)), + map=FeatureMap.from_locations(locations=[], parent_length=len(self)), strand="+", ) else: @@ -1465,7 +1465,9 @@ def parse_out_gaps(self): for match in nongap.finditer(str(self)): segments.append(match.span()) gapless.append(match.group()) - map = IndelMap(locations=segments, parent_length=len(self)).inverse() + map = IndelMap.from_locations( + locations=segments, parent_length=len(self) + ).inverse() seq = self.__class__( "".join(gapless), name=self.get_name(), info=self.info, preserve_case=True ) diff --git a/src/cogent3/parse/gbseq.py b/src/cogent3/parse/gbseq.py index 3e3476599..a917dbd2c 100644 --- a/src/cogent3/parse/gbseq.py +++ b/src/cogent3/parse/gbseq.py @@ -64,7 +64,9 @@ def GbSeqXmlParser(doc): seq = alphabet.make_seq(raw_string, name=name) - feat = location.FeatureMap(locations=[(0, len(seq))], parent_length=len(seq)) + feat = location.FeatureMap.from_locations( + locations=[(0, len(seq))], parent_length=len(seq) + ) seq.add_feature(biotype="source", name=name, spans=feat.get_coordinates()) organism = str( diff --git a/tests/test_core/test_annotation_db.py b/tests/test_core/test_annotation_db.py index 05eb53188..a0910bb13 100644 --- a/tests/test_core/test_annotation_db.py +++ b/tests/test_core/test_annotation_db.py @@ -413,7 +413,9 @@ def test_feature_nucleic(): # 111111 # 0123456789012345 seq = make_seq("AACCTTTGGGGAATTT", moltype="dna") - mmap = loc.FeatureMap(locations=[(4, 7), (10, 12)], parent_length=len(seq)) + mmap = loc.FeatureMap.from_locations( + locations=[(4, 7), (10, 12)], parent_length=len(seq) + ) expect = seq[mmap] rcseq = seq.rc() diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index bcb9f1db1..83ad23e75 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -277,7 +277,7 @@ def test_map_plus_position(): # plus seq AAACCCTGG # orig = Aligned(*DNA.make_seq("AAACCCTGG", 
name="a").parse_out_gaps()) - orig = FeatureMap(locations=[(0, 9)], parent_length=9) + orig = FeatureMap.from_locations(locations=[(0, 9)], parent_length=9) assert orig.absolute_position(2) == 2 assert orig.absolute_position(6) == 6 @@ -317,7 +317,7 @@ def test_map_nucleic_reversed(cls): # plus seq AAACCCTGG # orig = Aligned(*DNA.make_seq("AAACCCTGG", name="a").parse_out_gaps()) - orig = cls(locations=[(0, 9)], parent_length=9) + orig = cls.from_locations(locations=[(0, 9)], parent_length=9) # minus coords 012345678 # rel coord 01234 # -slice ***** @@ -386,8 +386,8 @@ def test_round_trip_rich_dict(): def test_serialisable_attr(): - im = IndelMap(locations=[(0, 2)], parent_length=20) - set_vals = {"locations": [(0, 2)], "parent_length": 20} + set_vals = {"spans": [Span(0, 2)], "parent_length": 20} + im = IndelMap.from_spans(**set_vals) got = {k: im._serialisable[k] for k in set_vals} assert got == set_vals @@ -439,7 +439,7 @@ def test_terminal_unknown(): @pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def test_map_inverse(cls): - m = cls(locations=[(0, 2), (4, 6)], parent_length=6) + m = cls.from_locations(locations=[(0, 2), (4, 6)], parent_length=6) assert len(m) == 4 mi = m.inverse() assert len(mi) == 6 @@ -465,7 +465,7 @@ def test_map_offsets(cls): @pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def test_map_indexed(cls): - m = cls(locations=[(0, 2), (4, 6)], parent_length=6).inverse() + m = cls.from_locations(locations=[(0, 2), (4, 6)], parent_length=6).inverse() indexed = m[2] assert len(indexed) == 1 @@ -540,7 +540,7 @@ def test_indelmap_with_reverse_span(): def test_indelmap_no_gaps(): - imap = IndelMap(locations=(), parent_length=6) + imap = IndelMap.from_locations(locations=(), parent_length=6) gaps = imap.gaps() assert not gaps @@ -548,7 +548,7 @@ def test_indelmap_no_gaps(): def test_get_coords(): """get_coordinates should return raw coordinates matching input""" spans = [(0, 9), (20, 32)] - fmap = FeatureMap(locations=spans, 
parent_length=100) + fmap = FeatureMap.from_locations(locations=spans, parent_length=100) coords = fmap.get_coordinates() assert coords == spans @@ -559,7 +559,7 @@ def test_get_coords_invalid_order(): # should work for reversed Maps too spans = [(32, 20), (9, 0)] with pytest.raises(ValueError): - FeatureMap(locations=spans, parent_length=100) + FeatureMap.from_locations(locations=spans, parent_length=100) def test_gap_coords_to_map(): diff --git a/tests/test_core/test_maps.py b/tests/test_core/test_maps.py index 54c3e4829..5eca67498 100644 --- a/tests/test_core/test_maps.py +++ b/tests/test_core/test_maps.py @@ -9,7 +9,7 @@ class MapTest(unittest.TestCase): def test_spans(self): # a simple two part map of length 10 - map = FeatureMap(locations=[(0, 5), (5, 10)], parent_length=10) + map = FeatureMap.from_locations(locations=[(0, 5), (5, 10)], parent_length=10) # try different spans on the above map for (start, end), expected in [ ((0, 4), "[0:4]"), From f158d0a07f2ebab7feede1f6ae1a9e72747c04db Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Mon, 22 Apr 2024 08:35:27 +1000 Subject: [PATCH 16/62] API: IndelMap now uses numpy array for gap positions and lengths [NEW] .gap_pos and .cum_gap_lengths store the essential gap data. The constructor takes an optional gap_lengths argument, which is converted into cum_gap_lengths. 
[NEW] .from_aligned_segments() for constructing the indel map from the coordinates of ungapped segments in an aligned sequence [CHANGED] IndelMap.merge_maps() provides a more direct solution instead of the convoluted FeatureMap[FeatureMap.invert()].invert() --- src/cogent3/align/pairwise.py | 11 +- src/cogent3/align/traceback.py | 10 +- src/cogent3/app/align.py | 8 +- src/cogent3/core/alignment.py | 92 +-- src/cogent3/core/location.py | 921 ++++++++++++++++++++---------- src/cogent3/core/sequence.py | 38 +- src/cogent3/draw/dotplot.py | 20 +- src/cogent3/parse/cigar.py | 21 +- tests/test_app/test_align.py | 18 +- tests/test_core/test_alignment.py | 60 +- tests/test_core/test_features.py | 6 +- tests/test_core/test_location.py | 621 +++++++++++++++++--- tests/test_draw/test_dotplot.py | 3 +- tests/test_parse/test_cigar.py | 4 +- 14 files changed, 1319 insertions(+), 514 deletions(-) diff --git a/src/cogent3/align/pairwise.py b/src/cogent3/align/pairwise.py index c517092e1..14aa9a720 100644 --- a/src/cogent3/align/pairwise.py +++ b/src/cogent3/align/pairwise.py @@ -13,6 +13,7 @@ from cogent3.align.traceback import alignment_traceback, map_traceback from cogent3.core.alignment import Aligned +from cogent3.core.location import IndelMap from cogent3.evolve.likelihood_tree import LikelihoodTreeEdge from cogent3.util.misc import ascontiguousarray @@ -477,8 +478,8 @@ def _calcAligneds(self, children): aligneds = [] for dim, child in enumerate(children): for seq_name, aligned in child.aligneds: - aligned = aligned.remapped_to((maps[dim] * word_length).inverse()) - aligneds.append((seq_name, aligned)) + new_map = aligned.map.merge_maps(maps[dim] * word_length) + aligneds.append((seq_name, Aligned(new_map, aligned.data))) return aligneds def backward(self): @@ -526,7 +527,11 @@ def __init__(self, leaf): _Alignable.__init__(self, leaf) if hasattr(leaf, "sequence"): self.seq = leaf.sequence - aligned = Aligned([(0, len(self.seq))], self.seq, len(self.seq)) + seqlen = 
len(self.seq) + imap = IndelMap.from_aligned_segments( + locations=[(0, seqlen)], aligned_length=seqlen + ) + aligned = Aligned(imap, self.seq) self.aligneds = [(self.leaf.edge_name, aligned)] self.max_preds = 1 self._pog = None diff --git a/src/cogent3/align/traceback.py b/src/cogent3/align/traceback.py index 032750ad9..4a0ca3dba 100644 --- a/src/cogent3/align/traceback.py +++ b/src/cogent3/align/traceback.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Conversion of dynamic program results ("arrays of arrows") into gap vectors, gapped sequences or Cogent Alignment objects""" @@ -52,18 +51,17 @@ def gap_traceback(aligned_positions): if consuming[dimension]: gv.append(a) gap_vectors[dimension] = [(gv[i], gv[i + 1]) for i in range(0, len(gv), 2)] - return (starts, ends, gap_vectors, a) + return starts, ends, gap_vectors, a def map_traceback(aligned_positions): # using IndelMap's to keep track of gaps for indel alignment - (starts, ends, gap_vectors, alignment_len) = gap_traceback(aligned_positions) - # print 'gv', gap_vectors + starts, ends, gap_vectors, alignment_len = gap_traceback(aligned_positions) maps = [ - IndelMap.from_locations(locations=gv, parent_length=alignment_len).inverse() + IndelMap.from_aligned_segments(locations=gv, aligned_length=alignment_len) for gv in gap_vectors ] - return (starts, ends, maps) + return starts, ends, maps def alignment_traceback(seqs, aligned_positions, word_length): diff --git a/src/cogent3/app/align.py b/src/cogent3/app/align.py index 94a2200be..04a2abdd4 100644 --- a/src/cogent3/app/align.py +++ b/src/cogent3/app/align.py @@ -171,7 +171,7 @@ def _merged_gaps(a_gaps: dict, b_gaps: dict) -> dict: function to 'max'. Use 'sum' when the gaps derive from different sequences. 
""" - + # todo convert to using IndelMap functions if not a_gaps: return b_gaps @@ -251,7 +251,8 @@ def _gaps_for_injection(other_seq_gaps: dict, refseq_gaps: dict, seqlen: int) -> # sequence coordinates # we probably need to include the refseq gap union because we need to # establish whether a refseq gap overlaps with a gap in other seq - # and + # todo convert these functions to using IndelMap and the numpy set + # operation functions all_gaps = {} all_gaps.update(other_seq_gaps) for gap_pos, gap_length in sorted(refseq_gaps.items()): @@ -304,8 +305,7 @@ def pairwise_to_multiple(pwise, ref_seq, moltype, info=None): other_seq = aln.named_seqs[other_name] other_gaps = dict(other_seq.map.get_gap_coordinates()) diff_gaps = _combined_refseq_gaps(curr_ref_gaps, ref_gaps) - inject = _gaps_for_injection(other_gaps, diff_gaps, len(other_seq.data)) - if inject: + if inject := _gaps_for_injection(other_gaps, diff_gaps, len(other_seq.data)): m = gap_coords_to_map(inject, len(other_seq.data)) other_seq = Aligned(m, other_seq.data) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index 7178ad048..e34cf8b79 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -30,7 +30,7 @@ from collections import Counter, defaultdict from copy import deepcopy -from functools import total_ordering +from functools import singledispatchmethod, total_ordering from itertools import combinations from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union @@ -2136,11 +2136,10 @@ class Aligned: """One sequence in an alignment, a map between alignment coordinates and sequence coordinates""" - def __init__(self, map, data, length=None): - # Unlike the normal map constructor, here we take a list of pairs of - # alignment coordinates, NOT a list of pairs of sequence coordinates - if isinstance(map, list): - map = IndelMap.from_locations(locations=map, parent_length=length).inverse() + @c3warn.deprecated_args( + "2024.9", reason="now 
requires an IndelMap", discontinued=["length"] + ) + def __init__(self, map, data): self.map = map self.data = data if hasattr(data, "info"): @@ -2188,12 +2187,8 @@ def deepcopy(self, sliced=True, exclude_annotations=False): if strand == "-" or exclude_annotations: new_seq.annotation_db = None else: - new_seq.annotation_offset = self.map.start new_seq.annotation_db = db - new_map = self.map.zeroed() - else: - new_map = self.map - + new_map = self.map return self.__class__(new_map, new_seq) def __repr__(self): @@ -2243,19 +2238,52 @@ def __add__(self, other): map, seq = seq.parse_out_gaps() return Aligned(map, seq) - def __getitem__(self, slice): + @singledispatchmethod + def __getitem__(self, span: int): + raise NotImplementedError(f"unexpected {type(span)}") + + @__getitem__.register + def _(self, span: int): + return self[span : span + 1] + + @__getitem__.register + def _(self, span: FeatureMap): + # we assume the feature map is in align coordinates + start, end = span.start, span.end + if span.useful and start > end: + empty = numpy.array([], dtype=self.map.gap_pos.dtype) + im = IndelMap(gap_pos=empty, cum_gap_lengths=empty, parent_length=0) + data = self.data[:0] + elif span.useful and len(list(span.spans)) == 1: + im = self.map[start:end] + seq_start = self.map.get_seq_index(start) + seq_end = self.map.get_seq_index(end) + data = self.data[seq_start:seq_end] + elif not span.useful: + im = self.map[start:end] + data = self.data[:0] + else: + # multiple spans + align_coords = span.get_coordinates() + im = self.map.joined_segments(align_coords) + seq_map = self.map.make_seq_feature_map(span, include_gaps=False) + data = self.data.gapped_by_map(seq_map) + + return Aligned(im, data) + + @__getitem__.register + def _(self, span: slice): # todo we need to get the sequence coordinates that slice corresponds to - # so we can update the self.data, plus we will need to zero new_map - new_map = self.map[slice] - data = ( - self.data[new_map.start : new_map.end] if 
new_map.useful else self.data[:0] - ) - if new_map.useful and new_map.start > new_map.end: + # so we can update the self.data + new_map = self.map[span] + seq_start = self.map.get_seq_index(span.start or 0) + seq_end = self.map.get_seq_index(span.stop or len(self)) + data = self.data[seq_start:seq_end] if new_map.useful else self.data[:0] + if new_map.useful and seq_start > seq_end: # For now, a reverse slice means we should have an empty sequence # todo modify this clause if a negative step is ever allowed new_map = new_map.__class__(locations=(), parent_length=len(self.data)) - elif new_map.useful: - new_map = new_map.zeroed() + return Aligned(new_map, data) def rc(self): @@ -2274,12 +2302,9 @@ def to_dna(self): return Aligned(self.map, self.data.to_dna()) def to_rich_dict(self): - coords = self.map.get_covering_span().get_coordinates() - if len(coords) != 1: - raise NotImplementedError data = dict(version=__version__, type=get_object_provenance(self)) # we are resetting the stored data to begin at 0 - data["map_init"] = self.map.zeroed().to_rich_dict() + data["map_init"] = self.map.to_rich_dict() data["seq_init"] = self.data.to_rich_dict(exclude_annotations=True) return data @@ -2318,7 +2343,7 @@ def remapped_to(self, map): def make_feature(self, feature: FeatureDataType, alignment: "Alignment") -> Feature: """returns a feature, not written into annotation_db""" annot = self.data.make_feature(feature) - inverted = self.map.inverse() + inverted = self.map.to_feature_map().inverse() # todo should indicate whether tidy or not return annot.remapped_to(alignment, inverted) @@ -4721,18 +4746,18 @@ def annotation_db(self, value): def _mapped(self, slicemap): align = [] - for name in self.names: - align.append((name, self.named_seqs[name][slicemap])) + for seq in self.seqs: + sliced = seq[slicemap] + align.append((sliced.name, sliced)) return self.__class__(moltype=self.moltype, data=align, info=self.info) def gapped_by_map(self, keep, **kwargs): # keep is a Map + # 
seqs = [seq[keep] for seq in self.seqs] seqs = [] - for seq_name in self.names: - aligned = self.named_seqs[seq_name] - seqmap = aligned.map[keep] - seq = aligned.data.gapped_by_map(seqmap) - seqs.append((seq_name, seq)) + for seq in self.seqs: + selected = seq[keep] + seqs.append(selected) return self.__class__(moltype=self.moltype, data=seqs, **kwargs) def get_projected_feature(self, *, seqid: str, feature: Feature) -> Feature: @@ -4834,8 +4859,9 @@ def filtered(self, predicate, motif_length=1, drop_remainder=True, **kwargs): return None locations = [(gv[i], gv[i + 1]) for i in range(0, len(gv), 2)] + # these are alignment coordinate locations + keep = FeatureMap.from_locations(locations=locations, parent_length=len(self)) - keep = IndelMap.from_locations(locations=locations, parent_length=len(self)) return self.gapped_by_map(keep, info=self.info) def get_seq(self, seqname): diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index eee588243..5229f29a2 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -21,7 +21,9 @@ """ import copy import dataclasses +import functools import inspect +import json from abc import ABC, abstractmethod from bisect import bisect_left, bisect_right @@ -31,8 +33,11 @@ import numpy +from numpy.typing import NDArray + from cogent3._version import __version__ from cogent3.util import warning as c3warn +from cogent3.util.deserialise import register_deserialiser from cogent3.util.misc import ( ClassChecker, ConstrainedList, @@ -54,21 +59,6 @@ def _norm_index(i, length, default): return min(max(i, 0), length) -def _norm_slice(index, length): - """_norm_slice(slice(1, -2, 3), 10) -> (1,8,3)""" - if isinstance(index, slice): - start = _norm_index(index.start, length, 0) - end = _norm_index(index.stop, length, length) - return (start, end, index.step) - else: - start = index - if start < 0: - start += length - if start >= length: - raise IndexError(index) - return (start, start + 1, 1) - - def 
as_map(slice, length, cls): """Take anything that might be used as a subscript: Integer, Slice, or MapABC, and return cls.""" @@ -78,7 +68,7 @@ def as_map(slice, length, cls): for i in slice: spans.extend(as_map(i, length, cls).spans) return cls(spans=spans, parent_length=length) - elif isinstance(slice, (FeatureMap, IndelMap)): + elif isinstance(slice, (FeatureMap, IndelMap, Map)): return slice else: lo, hi, step = _norm_slice(slice, length) @@ -1129,10 +1119,6 @@ def __len__(self): def __add__(self, other): ... - @abstractmethod - def gaps(self): - ... - @abstractmethod def nongap(self): ... @@ -1149,10 +1135,6 @@ def nucleic_reversed(self): def to_rich_dict(self): ... - @abstractmethod - def inverse(self): - ... - @classmethod def from_locations(cls, locations, parent_length, **kwargs): if len(locations): @@ -1218,9 +1200,9 @@ def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSp O = tuple[numpy.ndarray, Sequence] -def map_to_gap_coords( - indel_map: Map, dtype: Union[type, str] = numpy.int32 -) -> numpy.ndarray: +def spans_to_gap_coords( + indel_spans: list[Union[Span, LostSpan]], dtype: Union[type, str] = numpy.int32 +) -> tuple[numpy.ndarray, numpy.ndarray]: """returns coordinates of sequence gaps Parameters @@ -1232,10 +1214,51 @@ def map_to_gap_coords( Returns ------- - numpy.array[[gap index, gap length],...]) + numpy.array([gap pos,...]), numpy.array([cum gap length,...]), """ - # Assuming the maximum integer is < 2^31 - return numpy.array(indel_map.get_gap_coordinates(), dtype=dtype) + gap_pos = [] + cum_lengths = [] + cum_length = 0 + for i, span in enumerate(indel_spans): + if not span.lost: + continue + + pos = indel_spans[i - 1].end if i else 0 + cum_length += len(span) + gap_pos.append(pos) + cum_lengths.append(cum_length) + + return numpy.array(gap_pos, dtype=dtype), numpy.array(cum_lengths, dtype=dtype) + + +def _gap_spans( + gap_pos: NDArray[int], cum_gap_lengths: NDArray[int] +) -> tuple[NDArray[int], NDArray[int]]: + 
"""returns 1D arrays in alignment coordinates of + gap start, gap stop""" + if not len(gap_pos): + r = numpy.array([], dtype=gap_pos.dtype) + return r, r + + ends = gap_pos + cum_gap_lengths + starts = gap_pos.copy() + starts[1:] += cum_gap_lengths[:-1] + + return starts, ends + + +def _update_lengths(result_pos, result_lengths, gap_pos, gap_lengths): + """modifies result_lengths in place with gap_lengths + where elements in gap_pos occur in result_pos + + Notes + ----- + result_pos is a superset of gap_pos + """ + _, result_indices, other_indices = numpy.intersect1d( + result_pos, gap_pos, assume_unique=True, return_indices=True + ) + result_lengths[result_indices] += gap_lengths[other_indices] T = Union[List[int], Tuple[int]] @@ -1245,153 +1268,493 @@ def map_to_gap_coords( class IndelMap(MapABC): """store locations of deletions in a Aligned sequence""" - spans: dataclasses.InitVar[Optional[tuple]] = () - termini_unknown: dataclasses.InitVar[bool] = dataclasses.field(default=False) + # gap data is gap positions, gap lengths on input, stored + gap_pos: numpy.ndarray + cum_gap_lengths: Optional[numpy.ndarray] = None + gap_lengths: dataclasses.InitVar[Optional[numpy.ndarray]] = None + termini_unknown: bool = False parent_length: int = 0 - start: Optional[int] = dataclasses.field(init=False, default=0) - end: Optional[int] = dataclasses.field(init=False, default=0) - length: int = dataclasses.field(init=False, default=0) _serialisable: dict = dataclasses.field(init=False, repr=False) - _spans: Tuple[Union[Span, _LostSpan, TerminalPadding]] = dataclasses.field( - default=(), init=False - ) + num_gaps: int = dataclasses.field(init=False, repr=False, default=0) - def __post_init__(self, spans, termini_unknown): - if isinstance(spans, property): - # This clause is due a known issue with dataclasses. 
- # As we have a spans property, the default spans value is - # ignored, so we have to check for its value being a property - # and then set the default value here - spans = () + def __post_init__(self, gap_lengths): + assert gap_lengths is None or self.cum_gap_lengths is None + if gap_lengths is not None: + self.cum_gap_lengths = gap_lengths.cumsum() - spans = tuple(spans) - last_not_lost = None - start, end = None, None - for i, span in enumerate(spans): - self.length += len(span) - if span.lost: - continue - elif start is None: - # this ugly logic because we're using spans! - start = span.end if span.reverse else span.start + if len(self.gap_pos) != len(self.cum_gap_lengths): + raise ValueError( + f"length of gap_ pos {len(self.gap_pos)} != " + f"length of gap lengths {len(self.cum_gap_lengths)}" + ) - last_not_lost = i + self.num_gaps = self.gap_pos.shape[0] + if self.num_gaps and self.gap_pos[-1] > self.parent_length: + raise ValueError( + f"gap position {self.gap_pos[-1]} outside parent_length {self.parent_length}" + ) - if last_not_lost is not None: - span = spans[last_not_lost] - end = span.start if span.reverse else span.end + # make gap array immutable + self.gap_pos.flags.writeable = False + self.cum_gap_lengths.flags.writeable = False + self._serialisable.pop("gap_lengths", None) - if start is None: - start = 0 + @classmethod + def from_spans(cls, spans, parent_length, termini_unknown=False): + gap_pos, cum_lengths = spans_to_gap_coords(spans) + return cls( + gap_pos=gap_pos, + cum_gap_lengths=cum_lengths, + parent_length=parent_length, + termini_unknown=termini_unknown, + ) - if end is None: - end = self.parent_length + @classmethod + def from_aligned_segments( + cls, locations: list[tuple[int, int]], aligned_length: int + ): + """ + converts coordinates from aligned segments into IndelMap for ungapped sequence + + Parameters + ---------- + locations + list of ungapped segment in alignment coordinates + aligned_length + length of the alignment + """ + 
if not locations or ( + len(locations) == 1 + and locations[0][0] == 0 + and locations[0][-1] == aligned_length + ): + empty = numpy.array([], dtype=int) + return cls( + gap_pos=empty, + cum_gap_lengths=empty.copy(), + parent_length=aligned_length, + ) - self.start, self.end = start, end + if locations[0][0] != 0: + # starts with a gap + locations = [(0, 0)] + locations + if locations[-1][1] < aligned_length: + # ends with a gap + locations += [(aligned_length, aligned_length)] + + locations = numpy.array(locations, dtype=numpy.int32).flatten()[1:-1] + gap_coords = locations.reshape((locations.shape[0] // 2, 2)) + gap_ends, gap_starts = gap_coords[:, ::-1].T + gap_lengths = gap_ends - gap_starts + cum_lens = gap_lengths.cumsum() + # convert to sequence coords + gap_pos = gap_starts.copy() + gap_pos[1:] -= cum_lens[:-1] + seq_length = aligned_length - cum_lens[-1] - if termini_unknown: - spans = list(spans) - if spans[0].lost: - spans[0] = TerminalPadding(spans[0].length) - if spans[-1].lost: - spans[-1] = TerminalPadding(spans[-1].length) + return cls( + gap_pos=gap_pos, + cum_gap_lengths=cum_lens, + parent_length=seq_length, + ) - self._spans = tuple(spans) + @functools.singledispatchmethod + def __getitem__(self, item): + raise NotImplementedError(f"cannot slice using {type(item)}") + + @__getitem__.register + def _(self, item: int): + return self[item : item + 1] + + @__getitem__.register + def _(self, item: slice): + # we're assuming that this gap object is associated with a sequence + # that will also be sliced. Hence, we need to shift the gap insertion + # positions relative to this newly sliced sequence. 
+ if item.step is not None: + raise NotImplementedError( + f"{type(self).__name__!r} does not yet support strides" + ) + zero_array = numpy.array([], dtype=self.gap_pos.dtype) + start = item.start or 0 + stop = item.stop or len(self) + + # convert negative indices + start = start if start >= 0 else len(self) + start + stop = stop if stop >= 0 else len(self) + stop + if min((start, stop)) < 0: + raise IndexError(f"one of adjusted {start, stop} is < 0") + + if start >= stop: + # standard slice behaviour without negative step + return self.__class__( + gap_pos=zero_array.copy(), + cum_gap_lengths=zero_array.copy(), + parent_length=0, + ) - @classmethod - def from_spans(cls, spans, parent_length, termini_unknown=False): - return cls( - spans=spans, parent_length=parent_length, termini_unknown=termini_unknown + # we address three easy cases: + # 1 - no gaps; 2 - slice before first gap; 3 - after last gap + no_gaps = self.__class__( + gap_pos=zero_array.copy(), + cum_gap_lengths=zero_array.copy(), + parent_length=stop - start, ) + if not self.num_gaps: + return no_gaps + + first_gap = self.gap_pos[0] + last_gap = self.gap_pos[-1] + self.cum_gap_lengths[-1] + if stop < first_gap or start >= last_gap: + return no_gaps + + gap_starts, gap_ends = _gap_spans(self.gap_pos, self.cum_gap_lengths) + gap_pos = self.gap_pos.copy() + cum_lengths = self.cum_gap_lengths.copy() + # we find where the slice starts + l = numpy.searchsorted(gap_ends, start, side="left") + if gap_starts[l] <= start < gap_ends[l] and stop <= gap_ends[l]: + # entire span is within a single gap + # pos now 0 + gap_pos = numpy.array([0], dtype=self.gap_pos.dtype) + cum_lengths = cum_lengths[l : l + 1] + cum_lengths[0] = stop - start + return self.__class__( + gap_pos=gap_pos, cum_gap_lengths=cum_lengths, parent_length=0 + ) - def __getitem__(self, slice): - # A possible shorter map at the same level - new_map = as_map(slice, len(self), self.__class__) - new_parts = [] - for span in new_map.spans: - # we reset 
tidy start / end to false to avoid changes to - # default behaviour of tidy on IndelMap (which ignores these - # anyway) - old_tidy = span.tidy_start, span.tidy_end - span.tidy_start, span.tidy_end = False, False - new_parts.extend(span.remap_with(self)) - span.tidy_start, span.tidy_end = old_tidy - return self.__class__(spans=new_parts, parent_length=self.parent_length) + lengths = self.get_gap_lengths() + if start < first_gap: + # start is before the first gap, we don't slice or shift + shift = start + begin = 0 + elif gap_starts[l] <= start < gap_ends[l]: + # start is within a gap + # so the absolute gap_pos value remains unchanged, but we shorten + # the gap length + begin = l + begin_diff = start - gap_starts[l] # if l else self.gap_pos[l] + lengths[l] -= begin_diff + shift = (start - cum_lengths[l - 1] - begin_diff) if l else gap_pos[0] + elif start == gap_ends[l]: + # at gap boundary, so beginning of non-gapped segment + # no adjustment to gap lengths + begin = l + 1 + shift = start - cum_lengths[l] + else: + # not within a gap + begin = l + shift = start - cum_lengths[l - 1] if l else start + + # start search for stop from l index + r = numpy.searchsorted(gap_ends[l:], stop, side="right") + l + if r == self.num_gaps: + # stop is after last gap + end = r + elif gap_starts[r] < stop <= gap_ends[r]: + # within gap + end = r + 1 + end_diff = gap_ends[r] - stop + lengths[r] -= end_diff + else: + end = r + + pos_result = gap_pos[begin:end] + pos_result -= shift + lengths = lengths[begin:end] + parent_length = self.get_seq_index(stop) - self.get_seq_index(start) + + return self.__class__( + gap_pos=pos_result, + gap_lengths=lengths, + parent_length=parent_length, + ) + + def get_align_index(self, seq_index: int, slice_stop: bool = False) -> int: + """convert a sequence index into an alignment index + + Parameters + ---------- + seq_index + coordinate on the sequence, must be < parent_length + slice_stop + set to True if the index is to be the end of an alignment 
slice. + In that case, and if seq_index is in gap_pos then it returns + the first alignment index of the gap run. + """ + # NOTE I explicitly cast all returned values to python int's due to + # need for json serialisation, which does not support numpy int classes + if seq_index < 0: + seq_index += self.parent_length + + if seq_index < 0: + raise NotImplementedError(f"{seq_index} negative seq_index beyond limit ") + + if not self.num_gaps or seq_index < self.gap_pos[0]: + return int(seq_index) + + # if stop_index, check if the seq_index corresponds to a gap position + if slice_stop and (match := seq_index == self.gap_pos).any(): + # if so, we return the alignment coord for the first gap position + (idx,) = numpy.where(match)[0] + if idx: + gap_len = self.cum_gap_lengths[idx] - self.cum_gap_lengths[idx - 1] + else: + gap_len = self.cum_gap_lengths[idx] + gap_end = self.gap_pos[idx] + self.cum_gap_lengths[idx] + return int(gap_end - gap_len) + + cum_gap_lengths = self.cum_gap_lengths + gap_pos = self.gap_pos + + if seq_index >= gap_pos[-1]: + return int(seq_index + cum_gap_lengths[-1]) + + # find gap position before seq_index + index = numpy.searchsorted(gap_pos, seq_index, side="left") + if seq_index < gap_pos[index]: + gap_lengths = cum_gap_lengths[index - 1] if index else 0 + else: + gap_lengths = cum_gap_lengths[index] + + return int(seq_index + gap_lengths) + + def get_seq_index(self, align_index: int) -> int: + """converts alignment index to sequence index""" + # NOTE I explicitly cast all returned values to python int's due to + # need for json serialisation, which does not support numpy int classes + if align_index < 0: + align_index = len(self) + align_index + if align_index < 0: + raise NotImplementedError(f"{align_index} align_index beyond limit") + + if not self.num_gaps or align_index < self.gap_pos[0]: + return align_index + + # these are alignment indices for gaps + cum_lengths = self.cum_gap_lengths + gap_starts, gap_ends = _gap_spans(self.gap_pos, 
cum_lengths) + if align_index >= gap_ends[-1]: + return int(align_index - cum_lengths[-1]) + + index = numpy.searchsorted(gap_ends, align_index, side="left") + if align_index < gap_starts[index]: + # before the gap at index + return int(align_index - cum_lengths[index - 1]) + + if align_index == gap_ends[index]: + # after the gap at index + return int(align_index - cum_lengths[index]) + + if gap_starts[index] <= align_index < gap_ends[index]: + # within the gap at index + # so the gap insertion position is the sequence position + return int(self.gap_pos[index]) def __len__(self): - return self.length + length_gaps = self.cum_gap_lengths[-1] if self.num_gaps else 0 + return int(self.parent_length + length_gaps) def __add__(self, other): - if other.parent_length != self.parent_length: - raise ValueError("Those maps belong to different sequences") + # what was the purpose of this method? The code seems designed for + # combining Maps from the same parent sequence, which is a union rather + # than addition + # I'm revising this to be suitable for the concatenation of two aligned + # sequences + gap_pos = self.gap_pos.tolist() + (self.parent_length + other.gap_pos).tolist() + gap_pos = numpy.array(gap_pos, dtype=self.gap_pos.dtype) + + cum_length = self.cum_gap_lengths[-1] if self.num_gaps else 0 + cum_gap_lengths = ( + self.cum_gap_lengths.tolist() + + (cum_length + other.cum_gap_lengths).tolist() + ) + cum_gap_lengths = numpy.array(cum_gap_lengths, dtype=self.cum_gap_lengths.dtype) + return self.__class__( - spans=self._spans + tuple(other.spans), parent_length=self.parent_length + gap_pos=gap_pos, + cum_gap_lengths=cum_gap_lengths, + parent_length=self.parent_length + other.parent_length, ) def __mul__(self, scale): - new_parts = [span * scale for span in self._spans] - return self.__class__(spans=new_parts, parent_length=self.parent_length * scale) + # could be used for going from amino-acid alignment to codon alignment + gap_pos = self.gap_pos * scale + 
cum_gap_lengths = self.cum_gap_lengths * scale + return self.__class__( + gap_pos=gap_pos, + cum_gap_lengths=cum_gap_lengths, + parent_length=self.parent_length * scale, + ) def __repr__(self): - return f"{self._spans!r}/{self.parent_length}" + gap_data = numpy.array([self.gap_pos, self.cum_gap_lengths]).T + return f"{gap_data.tolist()!r}/{self.parent_length}" + + def get_gap_lengths(self) -> NDArray[int]: + lengths = self.cum_gap_lengths.copy() + lengths[1:] = numpy.diff(lengths) + return lengths @property def offsets(self): - return [0] + numpy.array([s.length for s in self._spans[:-1]]).cumsum().tolist() + # offsets are the aligned indices for every starting point of a segment + # when we encounter a gap, we include that position and the end of that gap + + starts, ends = _gap_spans(self.gap_pos, self.cum_gap_lengths) + return numpy.array([starts, ends]).T.flatten()[:-1].tolist() + + def nongap(self) -> Iterator[Union[Span, LostSpan]]: + """ungappeed segments in this map in aligned coordinates""" + # we want to know the coordinates of the ungapped segments on + # the aligned sequence. 
The gap_pos attribute is in sequence + # coordinates + prev_pos = 0 + for i, pos in enumerate(self.gap_pos): + if pos == 0: + # we start with a gap + prev_pos = pos + continue - def gaps(self): - """The gaps (lost spans) in this map""" - locations = [] - offset = 0 - for s in self._spans: - if s.lost: - locations.append((offset, offset + s.length)) - offset += s.length - return self.__class__.from_locations( - locations=locations, parent_length=len(self) - ) + cum_length = 0 if i == 0 else self.cum_gap_lengths[i - 1] + start = 0 if i == 0 else prev_pos + start += cum_length + end = self.gap_pos[i] + cum_length + yield Span(start, end) + prev_pos = pos - def nongap(self): - """ungappeed segments in this map""" - locations = [] - offset = 0 - for s in self._spans: - if not s.lost: - locations.append((offset, offset + s.length)) - offset += s.length - return self.__class__.from_locations( - locations=locations, parent_length=len(self) - ) + if self.num_gaps and self.gap_pos[-1] + self.cum_gap_lengths[-1] < len(self): + yield Span(prev_pos, len(self)) @property - def spans(self) -> Iterator[Span]: + def spans(self) -> Iterator[Union[Span, LostSpan]]: """generator of spans""" - yield from self._spans + if not self.num_gaps: + yield Span(0, self.parent_length) + return + + for i, pos in enumerate(self.gap_pos): + cum_length = self.cum_gap_lengths[i] + if pos == 0: + cls = TerminalPadding if self.termini_unknown else LostSpan + yield cls(cum_length) + continue + + if i == 0: + start = 0 + prev_length = 0 + else: + start = self.gap_pos[i - 1] + prev_length = self.cum_gap_lengths[i - 1] + yield Span(start, pos) + cls = ( + TerminalPadding + if self.termini_unknown and i == self.num_gaps - 1 + else LostSpan + ) + yield cls(cum_length - prev_length) + + if self.num_gaps and self.gap_pos[-1] < self.parent_length: + yield Span(self.gap_pos[-1], self.parent_length) @property def complete(self): """whether any span represents a gap""" - return not any(span.lost for span in 
self._spans) + return self.num_gaps != 0 and self.useful @property def useful(self): - return not all(span.lost for span in self._spans) + return self.parent_length != 0 def get_coordinates(self): - """returns span coordinates as [(start, end), ...]""" - return [(s.start, s.end) for s in self._spans if not s.lost] + """returns sequence coordinates of ungapped segments + + Returns + ------- + [(start, end), ...] + """ + coords = [] + last = 0 + for pos, cum_length in zip(self.gap_pos, self.cum_gap_lengths): + pos, cum_length = int(pos), int(cum_length) + + if not pos: + continue + coords.append((last, pos)) + last = pos + + if ( + self.num_gaps + and self.gap_pos[-1] + self.cum_gap_lengths[-1] < self.parent_length + ): + coords.append((last, self.parent_length)) + return coords def get_gap_coordinates(self): """returns [(gap pos, gap length), ...]""" - gap_pos = [] - for i, span in enumerate(self._spans): - if not span.lost: - continue + cum_lengths = self.cum_gap_lengths.copy() + diffs = numpy.diff(cum_lengths) + cum_lengths[1:] = diffs + return numpy.array([self.gap_pos, cum_lengths]).T.tolist() + + def merge_maps(self, other): + """merge gaps of other with self + + Parameters + ---------- + indel_map + instance for same sequence + """ + unique_pos = numpy.union1d(self.gap_pos, other.gap_pos) + gap_lengths = numpy.zeros(unique_pos.shape, dtype=self.cum_gap_lengths.dtype) + self_lengths = self.get_gap_lengths() + other_lengths = other.get_gap_lengths() + _update_lengths(unique_pos, gap_lengths, self.gap_pos, self_lengths) + _update_lengths(unique_pos, gap_lengths, other.gap_pos, other_lengths) + return self.__class__( + gap_pos=unique_pos, + gap_lengths=gap_lengths, + parent_length=self.parent_length, + ) - pos = self._spans[i - 1].end if i else 0 - gap_pos.append((pos, len(span))) + def joined_segments(self, coords: list[tuple[int, int]]): + """returns new map with disjoint gapped segments joined - return gap_pos + Parameters + ---------- + coords + sequence 
insert gap coordinates [(gap start, gap end), ...] + Returns + ------- + + """ + coords = list(sorted(coords)) + # using a dict here because joining can produce a gap merge + gaps = {} + cum_length = 0 + cum_parent_length = 0 + dtype = self.gap_pos.dtype + for start, end in coords: + im = self[start:end] + for i in range(im.num_gaps): + pos = im.gap_pos[i] + cum_parent_length + if pos in gaps: + gaps[pos] += im.cum_gap_lengths[i] + else: + gaps[pos] = im.cum_gap_lengths[i] + cum_length + cum_parent_length += im.parent_length + if im.num_gaps: + cum_length += im.cum_gap_lengths[-1] + gap_pos = numpy.empty(len(gaps), dtype=dtype) + cum_lengths = numpy.empty(len(gaps), dtype=dtype) + for i, (pos, length) in enumerate(sorted(gaps.items())): + gap_pos[i] = pos + cum_lengths[i] = length + return self.__class__( + gap_pos=gap_pos, + cum_gap_lengths=cum_lengths, + parent_length=cum_parent_length, + ) def nucleic_reversed(self): """map for a sequence that has itself been reversed and complemented @@ -1400,32 +1763,34 @@ def nucleic_reversed(self): ----- discards reverse attribute on both spans and self """ - spans = [] - parent_length = self.parent_length - for s in self._spans: - if not s.lost: - start = parent_length - s.end - assert start >= 0 - end = start + s.length - s = Span(start=start, end=end) - spans.append(s) + new_pos = self.gap_pos.copy() + lengths = self.get_gap_lengths() - spans.reverse() - return self.__class__(spans=spans, parent_length=self.parent_length) + if len(new_pos): + new_pos = self.parent_length - new_pos + new_pos = new_pos[::-1] + lengths = lengths[::-1] + + return self.__class__( + gap_pos=new_pos, + gap_lengths=lengths, + parent_length=self.parent_length, + ) def to_rich_dict(self): """returns dicts for contained spans [dict(), ..]""" - spans = [s.to_rich_dict() for s in self._spans] # exclude spans from deep copy since being overwritten - data = copy.deepcopy( - {k: v for k, v in self._serialisable.items() if k != "spans"} - ) - 
data.pop("locations", None) - data["spans"] = spans + data = copy.deepcopy(dict(self._serialisable.items())) data["type"] = get_object_provenance(self) data["version"] = __version__ + data["gap_pos"] = self.gap_pos.tolist() + data["cum_gap_lengths"] = self.cum_gap_lengths.tolist() + data["parent_length"] = int(self.parent_length) return data + def to_json(self): + return json.dumps(self.to_rich_dict()) + @classmethod def from_rich_dict(cls, map_element): from cogent3.util.deserialise import _get_class @@ -1433,129 +1798,53 @@ def from_rich_dict(cls, map_element): map_element.pop("version", None) type_ = map_element.pop("type") assert _get_class(type_) == cls - spans = [] - for element in map_element["spans"]: - element.pop("version", None) - klass = _get_class(element.pop("type")) - instance = klass(**element) - spans.append(instance) + map_element["gap_pos"] = numpy.array(map_element["gap_pos"]) + map_element["cum_gap_lengths"] = numpy.array(map_element["cum_gap_lengths"]) - map_element["spans"] = spans return cls(**map_element) def with_termini_unknown(self): """returns new instance with terminal gaps indicated as unknown""" return self.__class__( - spans=self._spans[:], + gap_pos=self.gap_pos.copy(), + cum_gap_lengths=self.cum_gap_lengths.copy(), parent_length=self.parent_length, termini_unknown=True, ) - def inverse(self): - """returns instance with coordinates updated for aligned, unaligned""" - # is this only required for parse_out_gaps? - # NO also used in cogent3.align code - - # can't work if there are overlaps in the map - # tidy ends don't survive inversion - if self.parent_length is None: - raise ValueError("Uninvertable. 
parent length not known") - - cum_posn = 0 - temp = [] - for span in self._spans: - if not span.lost: - if span.reverse: - temp.append( - (span.start, span.end, cum_posn + span.length, cum_posn) - ) - else: - temp.append( - (span.start, span.end, cum_posn, cum_posn + span.length) - ) - cum_posn += span.length - - temp.sort() - new_spans = [] - last_start = 0 - for start, end, cum_start, cum_end in temp: - if start > last_start: - new_spans.append(LostSpan(start - last_start)) - elif start < last_start: - raise ValueError(f"Uninvertable. Overlap: {start} < {last_start}") - - # we force tidy_ to be same as self, attribute has no meaning - # for IndelMap, but retained for compatability for now - new_spans.append( - Span( - cum_start, - cum_end, - reverse=cum_start > cum_end, - ) - ) - last_start = end - - if self.parent_length > last_start: - new_spans.append(LostSpan(self.parent_length - last_start)) - - return self.__class__(spans=new_spans, parent_length=len(self)) - - def get_covering_span(self): - span = self.start, self.end - return self.__class__.from_locations( - locations=[span], parent_length=self.parent_length - ) - - def zeroed(self): - """returns a new instance with the first span starting at 0 + def to_feature_map(self): + """returns a Map type, suited to Features""" + return FeatureMap(spans=list(self.spans), parent_length=self.parent_length) - Note - ---- + def make_seq_feature_map(self, align_feature_map, include_gaps=True): + """converts align_feature_map to a FeatureMap with sequence coordinates - Useful when an annotated sequence is sliced, but the connection to - the original parent is being deliberately broken as in the - Sequence.deepcopy(sliced=True) case. 
+ Parameters + ---------- + align_feature_map + with alignment coordinates + include_gaps + whether to include gaps from self as LostSpan's """ - # todo is this really required, best ifn we can rely on SeqView to - # store all relationship to underlying sequence - min_val = min(self.start, self.end) + gap_spans = {} + if include_gaps and self.num_gaps: + last = 0 + for pos, cum_length in zip(self.gap_pos, self.cum_gap_lengths): + gap_spans[pos] = LostSpan(cum_length - last) + last = cum_length spans = [] - for span in self._spans: + for span in align_feature_map.spans: if span.lost: spans.append(span) continue - kwargs = span.to_rich_dict() - del kwargs["version"] - del kwargs["type"] - kwargs["start"] = kwargs["start"] - min_val - kwargs["end"] = kwargs["end"] - min_val - spans.append(Span(**kwargs)) - - kwargs = self.to_rich_dict() - del kwargs["version"] - del kwargs["type"] - kwargs["spans"] = spans - kwargs["parent_length"] = abs(self.start - self.end) - return self.__class__(**kwargs) - def to_feature_map(self): - """returns a Map type, suited to Features""" - spans = [] - for span in self._spans: - if span.lost: - spans.append(span) - continue - kwargs = span.to_rich_dict() - del kwargs["version"] - del kwargs["type"] - spans.append(Span(**kwargs)) + start = self.get_seq_index(span.start) + end = self.get_seq_index(span.end) + if lost := gap_spans.pop(span.start, None): + spans.append(lost) + spans.append(Span(start, end)) - kwargs = self.to_rich_dict() - del kwargs["version"] - del kwargs["type"] - del kwargs["termini_unknown"] # not used for FeatureMap - kwargs["spans"] = spans - return FeatureMap(**kwargs) + return FeatureMap(spans=spans, parent_length=align_feature_map.parent_length) @dataclasses.dataclass @@ -1574,7 +1863,7 @@ class FeatureMap(MapABC): def __post_init__(self, spans): assert self.parent_length is not None - + self.parent_length = int(self.parent_length) if isinstance(spans, property): # This clause is due a known issue with dataclasses. 
# As we have a spans property, the default spans value is @@ -1642,14 +1931,6 @@ def __add__(self, other): def spans(self): yield from self._spans - def _with_termini_unknown(self): - return self.__class__( - self, - spans=self.spans[:], - parent_length=self.parent_length, - termini_unknown=True, - ) - def get_covering_span(self): span = (self.start, self.end) return self.__class__.from_locations( @@ -1743,9 +2024,7 @@ def nongap(self): if not s.lost: locations.append((offset, offset + s.length)) offset += s.length - return self.__class__.from_locations( - locations=locations, parent_length=len(self) - ) + return _spans_from_locations(locations=locations, parent_length=len(self)) def without_gaps(self): return self.__class__( @@ -1802,35 +2081,6 @@ def inverse(self): return self.__class__(spans=new_spans, parent_length=len(self)) - def _inverse(self): - # can't work if there are overlaps in the map - # tidy ends don't survive inversion - if self.parent_length is None: - raise ValueError("Uninvertable. parent length not known") - posn = 0 - temp = [] - for span in self.spans: - if not span.lost: - if span.reverse: - temp.append((span.start, span.end, posn + span.length, posn)) - else: - temp.append((span.start, span.end, posn, posn + span.length)) - posn += span.length - - temp.sort() - new_spans = [] - last_hi = 0 - for lo, hi, start, end in temp: - if lo > last_hi: - new_spans.append(LostSpan(lo - last_hi)) - elif lo < last_hi: - raise ValueError(f"Uninvertable. Overlap: {lo} < {last_hi}") - new_spans.append(Span(start, end, reverse=start > end)) - last_hi = hi - if self.parent_length > last_hi: - new_spans.append(LostSpan(self.parent_length - last_hi)) - return self.__class__(spans=new_spans, parent_length=len(self)) - def get_coordinates(self): """returns span coordinates as [(v1, v2), ...] 
@@ -1847,8 +2097,29 @@ def to_rich_dict(self): data["spans"] = spans data["type"] = get_object_provenance(self) data["version"] = __version__ + data["parent_length"] = int(self.parent_length) return data + def to_json(self): + return json.dumps(self.to_rich_dict()) + + @classmethod + def from_rich_dict(cls, map_element): + from cogent3.util.deserialise import _get_class + + map_element.pop("version", None) + type_ = map_element.pop("type") + assert _get_class(type_) == cls + spans = [] + for element in map_element.pop("spans"): + element.pop("version", None) + klass = _get_class(element.pop("type")) + instance = klass(**element) + spans.append(instance) + + map_element["spans"] = spans + return cls(**map_element) + def zeroed(self): """returns a new instance with the first span starting at 0 @@ -1916,7 +2187,7 @@ def relative_position(self, abs_pos: T) -> T: return abs_pos - self.start -def gap_coords_to_map(gaps_lengths: dict, seq_length: int) -> FeatureMap: +def gap_coords_to_map(gaps_lengths: dict, seq_length: int) -> IndelMap: """ Parameters ---------- @@ -1931,23 +2202,53 @@ def gap_coords_to_map(gaps_lengths: dict, seq_length: int) -> FeatureMap: """ if not gaps_lengths: - return FeatureMap.from_locations( - locations=[(0, seq_length)], parent_length=seq_length - ) + gap_pos = numpy.array([], dtype=int) + lengths = gap_pos.copy() + else: + gap_pos, lengths = list(zip(*sorted(gaps_lengths.items()))) + gap_pos = numpy.array(gap_pos, dtype=int) + lengths = numpy.array(lengths, dtype=int) + + return IndelMap(gap_pos=gap_pos, gap_lengths=lengths, parent_length=seq_length) + + +@register_deserialiser(get_object_provenance(IndelMap)) +def deserialise_indelmap(data: dict) -> IndelMap: + return IndelMap.from_rich_dict(data) + + +@register_deserialiser(get_object_provenance(FeatureMap)) +def deserialise_featuremap(data: dict) -> FeatureMap: + return FeatureMap.from_rich_dict(data) + + +@functools.singledispatch +def _norm_slice(index, length): + """_norm_slice(slice(1, 
-2, 3), 10) -> (1,8,3)""" + start = index + if start < 0: + start += length + if start >= length: + raise IndexError(index) + return start, start + 1, 1 + + +@_norm_slice.register +def _(index: slice, length): + start = _norm_index(index.start, length, 0) + end = _norm_index(index.stop, length, length) + return start, end, index.step - spans = [] - last = pos = 0 - for pos in sorted(gaps_lengths): - if pos > seq_length: - raise ValueError( - f"cannot have gap at position {pos} beyond seq_length= {seq_length}" - ) - gap = LostSpan(length=gaps_lengths[pos]) - spans.extend([gap] if pos == 0 else [Span(last, pos), gap]) - last = pos +@_norm_slice.register +def _(index: Span, length): + start = _norm_index(index.start, length, 0) + end = _norm_index(index.end, length, length) + return start, end, None - if pos < seq_length: - spans.append(Span(last, seq_length)) - return FeatureMap(spans=spans, parent_length=seq_length) +@_norm_slice.register +def _(index: FeatureMap, length): + start = _norm_index(index.start, length, 0) + end = _norm_index(index.end, length, length) + return start, end, None diff --git a/src/cogent3/core/sequence.py b/src/cogent3/core/sequence.py index f601d876e..2b28e59e7 100644 --- a/src/cogent3/core/sequence.py +++ b/src/cogent3/core/sequence.py @@ -143,7 +143,7 @@ def to_rich_dict(self, exclude_annotations=False): version=__version__, ) if hasattr(self, "annotation_offset"): - offset = self._seq.parent_start + offset = int(self._seq.parent_start) data.update(dict(annotation_offset=offset)) if ( @@ -1459,20 +1459,27 @@ def get_in_motif_size(self, motif_length=1, warn=False): def parse_out_gaps(self): """returns Map corresponding to gap locations and ungapped Sequence""" - gapless = [] - segments = [] - nongap = re.compile(f"([^{re.escape('-')}]+)") - for match in nongap.finditer(str(self)): - segments.append(match.span()) - gapless.append(match.group()) - map = IndelMap.from_locations( - locations=segments, parent_length=len(self) - ).inverse() + 
gap = re.compile(f"[{re.escape(self.moltype.gap)}]+") + seq = str(self) + gap_pos = [] + cum_lengths = [] + for match in gap.finditer(seq): + pos = match.start() + gap_pos.append(pos) + cum_lengths.append(match.end() - pos) + + gap_pos = array(gap_pos) + cum_lengths = array(cum_lengths).cumsum() + gap_pos[1:] = gap_pos[1:] - cum_lengths[:-1] + seq = self.__class__( - "".join(gapless), name=self.get_name(), info=self.info, preserve_case=True + gap.sub("", seq), name=self.get_name(), info=self.info, preserve_case=True + ) + indel_map = IndelMap( + gap_pos=gap_pos, cum_gap_lengths=cum_lengths, parent_length=len(seq) ) seq.annotation_db = self.annotation_db - return map, seq + return indel_map, seq def replace(self, oldchar, newchar): """return new instance with oldchar replaced by newchar""" @@ -1730,7 +1737,7 @@ def trim_stop_codon( if not gc.is_stop(end): return self - if not len(m.gaps()): + if not m.num_gaps: # has zero length if no gaps return self[:-3] @@ -1935,7 +1942,8 @@ def offset(self) -> int: @offset.setter def offset(self, value: int): - self._offset = value or 0 + value = value or 0 + self._offset = int(value) @property def seqid(self) -> str: @@ -2331,7 +2339,7 @@ def to_rich_dict(self): start, stop = self.start, self.stop data["init_args"]["seq"] = self.seq[start:stop] - data["init_args"]["offset"] = self.parent_start + data["init_args"]["offset"] = int(self.parent_start) data["init_args"]["seqid"] = self.seqid return data diff --git a/src/cogent3/draw/dotplot.py b/src/cogent3/draw/dotplot.py index 00c67c48e..c08795777 100644 --- a/src/cogent3/draw/dotplot.py +++ b/src/cogent3/draw/dotplot.py @@ -27,14 +27,9 @@ def suitable_threshold(window, desired_probability): return matches -def len_seq(span): - """length of a Annotatable map object""" - return len(span.nongap()) - - def not_gap(span): """whether a span corresponds to a non-gap""" - return len(span.gaps()) == 0 + return span.num_gaps == 0 def _convert_input(seq, moltype): @@ -77,20 +72,23 @@ def 
get_align_coords(map1, map2, aligned=False) -> MatchedSeqPaths: x_not_gap = not_gap(map1[i]) y_not_gap = not_gap(map2[i]) if x_not_gap and y_not_gap and start_x is None: - start_x = len_seq(map1[:i]) - start_y = len_seq(map2[:i]) + start_x = map1[:i].parent_length + start_y = map2[:i].parent_length elif (not x_not_gap or not y_not_gap) and start_x is not None: paths[start_y - start_x].append( ( - segment(start_x, len_seq(map1[:i]) - 1), - segment(start_y, len_seq(map2[:i]) - 1), + segment(start_x, map1[:i].parent_length - 1), + segment(start_y, map2[:i].parent_length - 1), ) ) start_x = start_y = None if start_x is not None: paths[start_y - start_x].append( - (segment(start_x, len_seq(map1) - 1), segment(start_y, len_seq(map2) - 1)) + ( + segment(start_x, map1.parent_length - 1), + segment(start_y, map2.parent_length - 1), + ) ) return paths diff --git a/src/cogent3/parse/cigar.py b/src/cogent3/parse/cigar.py index 6c6dc28f6..0abfabc2d 100644 --- a/src/cogent3/parse/cigar.py +++ b/src/cogent3/parse/cigar.py @@ -49,7 +49,7 @@ def cigar_to_map(cigar_text): posn += n else: spans.append(LostSpan(n)) - return IndelMap(spans=spans, parent_length=posn) + return IndelMap.from_spans(spans=spans, parent_length=posn) def aligned_from_cigar(cigar_text, seq, moltype=DNA): @@ -62,16 +62,18 @@ def aligned_from_cigar(cigar_text, seq, moltype=DNA): def _slice_by_aln(map, left, right): slicemap = map[left:right] - location = [slicemap.start, slicemap.end] if hasattr(slicemap, "start") else [] + location = [ + map.get_seq_index(left), + map.get_seq_index(right), + ] return slicemap, location def _slice_by_seq(map, start, end): - re_map = map.inverse() - slicemap = re_map[start:end] - aln_start, aln_end = slicemap.start, slicemap.end - new_map = map[aln_start:aln_end] - return new_map, [aln_start, aln_end] + # start, end are in sequence coords + aln_start = map.get_align_index(start) + aln_end = map.get_align_index(end, slice_stop=True) + return map[aln_start:aln_end], [aln_start, 
aln_end] def _remap(map): @@ -136,10 +138,11 @@ def CigarParser( ]: m, seq_loc = slice_cigar(cigars[seqname], aln_loc[0], aln_loc[1]) if seq_loc: - seq = seqs[seqname] + start, stop = seq_loc + seq = seqs[seqname][start:stop] if isinstance(seq, str): seq = moltype.make_seq(seq) - data[seqname] = seq[seq_loc[0] : seq_loc[1]].gapped_by_map(m) + data[seqname] = seq.gapped_by_map(m) else: data[seqname] = DNA.make_seq("-" * (aln_loc[1] - aln_loc[0])) return make_aligned_seqs(data) diff --git a/tests/test_app/test_align.py b/tests/test_app/test_align.py index 9e53d9b4e..a6f12f563 100644 --- a/tests/test_app/test_align.py +++ b/tests/test_app/test_align.py @@ -148,7 +148,7 @@ def test_aln_to_ref_known(self): ) expect = orig.to_dict() aligner = align_app.align_to_ref(ref_seq="Ref") - aln = aligner(orig.degap()) + aln = aligner.main(orig.degap()) self.assertEqual(aln.to_dict(), expect) def test_gap_union(self): @@ -726,3 +726,19 @@ def test_smith_waterman_raises(seqs): coll = make_unaligned_seqs(data=[seqs.get_seq("Human")], moltype="dna") aln = aligner(coll) assert isinstance(aln, NotCompleted) + + +def test_aln_two(): + """correctly recapitulates known case""" + orig = make_aligned_seqs( + { + "Ref": "CAGGAGAACAGAAACCCATTACTCACT", + "Qu7": "CAGGA--ACAGA--CCCGTTA---ACT", + }, + moltype="dna", + ) + expect = orig.to_dict() + aligner = align_app.align_to_ref(ref_seq="Ref") + seqs = orig.degap() + aln = aligner.main(seqs) + assert aln.to_dict() == expect diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index 1f6c394d3..06fe5fd72 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -775,23 +775,6 @@ def test_set_repr_policy_invalid_input(self): dict(num_seqs=10, num_pos=60, ref_name="longest", wrap=60), ) - def test_set_repr_policy_valid_input(self): - """repr_policy should be set to new values""" - seqs = self.Class({"a": "AAAAA", "b": "AAA--"}) - seqs.set_repr_policy(num_seqs=5, num_pos=40, ref_name="a", 
wrap=10) - self.assertEqual( - seqs._repr_policy, dict(num_seqs=5, num_pos=40, ref_name="a", wrap=10) - ) - - if self.Class == SequenceCollection: - # this class cannot slice - return - - # should persist in slicing - self.assertEqual( - seqs[:2]._repr_policy, dict(num_seqs=5, num_pos=40, ref_name="a", wrap=10) - ) - def test_set_wrap_affects_repr_html(self): """the wrap argument affects the number of columns""" if self.Class == SequenceCollection: @@ -1996,17 +1979,21 @@ def test_model_aln_to_model_aln(self): self.assertEqual(self.r1.name, "x") -def test_featuremap_slice_aligned(): +@pytest.mark.parametrize( + "raw_seq,coords", + (("ACGGTAAAG", ((2, 4), (5, 8))), ("CCC---CCC", ((0, 3), (6, 9)))), +) +def test_featuremap_slice_aligned(raw_seq, coords): from cogent3.core.alignment import Aligned from cogent3.core.location import FeatureMap, Span - raw_seq = "ACGGTAAAG" im, seq = DNA.make_seq(raw_seq).parse_out_gaps() ia = Aligned(im, seq) length = len(raw_seq) - fmap = FeatureMap(spans=[Span(2, 4), Span(5, 8)], parent_length=length) + fmap = FeatureMap(spans=[Span(s, e) for s, e in coords], parent_length=length) + expect = "".join(raw_seq[s:e] for s, e in fmap.get_coordinates()) got = ia[fmap] - assert str(got) == "GGAAA" + assert str(got) == expect @pytest.mark.parametrize("cls", (ArrayAlignment, Alignment)) @@ -2829,7 +2816,8 @@ def test_get_seq_with_sliced_rced_aln_multiple_spans(name): } aln = make_aligned_seqs(data=seqs, moltype="dna", array_align=False) start, stop = 1, 10 - a1 = aln[start:stop].rc() + a1 = aln[start:stop] + a1 = a1.rc() got = str(a1.get_seq(name)) dna = get_moltype("dna") expect = dna.complement(seqs[name][start:stop].replace("-", ""))[::-1] @@ -3481,6 +3469,24 @@ def test_positions(cls): assert list(r.positions) == expect +@pytest.mark.parametrize("cls", (Alignment, ArrayAlignment, SequenceCollection)) +def test_set_repr_policy_valid_input(cls): + """repr_policy should be set to new values""" + seqs = cls({"a": "AAAAA", "b": "AAA--"}) + 
seqs.set_repr_policy(num_seqs=5, num_pos=40, ref_name="a", wrap=10) + assert seqs._repr_policy == dict(num_seqs=5, num_pos=40, ref_name="a", wrap=10) + + +@pytest.mark.parametrize("cls", (Alignment, ArrayAlignment)) +def test_set_repr_policy_valid_input_slices(cls): + """repr_policy should be set to new values""" + seqs = cls({"a": "AAAAA", "b": "AAA--"}) + seqs.set_repr_policy(num_seqs=5, num_pos=40, ref_name="a", wrap=10) + # should persist in slicing + sliced = seqs[:2] + assert sliced._repr_policy == dict(num_seqs=5, num_pos=40, ref_name="a", wrap=10) + + def test_array_align_error_with_mixed_length(): data = dict(s1="ACGG", s2="A-G") with pytest.raises(ValueError, match=".* not all the same length.*"): @@ -3601,7 +3607,7 @@ def test_no_degenerates(cls): assert result == expect # motif length of 3, defaults - no gaps allowed - result = aln.no_degenerates(motif_length=3).to_dict() + result = aln.no_degenerates(motif_length=3, allow_gap=False).to_dict() expect = { "s1": "TTT".replace(" ", ""), "s2": "AAA".replace(" ", ""), @@ -3654,3 +3660,11 @@ def test_quick_tree(cls, calc, brca1_data): if not edge.is_root() } assert types == {float} + + +@pytest.mark.parametrize("raw", ("-AAAGGGGGAACCCT", "AAAGGGGGAACCCT")) +def test_slice_aligned(raw): + imap, seq = DNA.make_seq(raw, name="x").parse_out_gaps() + al = Aligned(imap, seq) + sliced = al[:-3] + assert str(sliced) == raw[:-3] diff --git a/tests/test_core/test_features.py b/tests/test_core/test_features.py index bb597276d..40290d0e6 100755 --- a/tests/test_core/test_features.py +++ b/tests/test_core/test_features.py @@ -650,11 +650,15 @@ def test_roundtripped_alignment_with_slices(): db.add_feature(seqid="x", biotype="exon", name="E2", spans=[(10, 13)]) aln.annotation_db = db # at the alignment level + sl = aln.seqs[0][:-3] + assert str(sl) == "-AAAGGGGGAACCCT"[:-3] sub_aln = aln[:-3] feats = list(sub_aln.get_features(biotype="exon", allow_partial=True)) assert len(feats) == 2 new = 
deserialise_object(sub_aln.to_json()) - gf1, gf2 = list(new.get_features(biotype="exon", allow_partial=True)) + feats = list(new.get_features(biotype="exon", allow_partial=True)) + assert len(feats) == 2 + gf1, gf2 = feats assert gf1.get_slice().to_dict() == {"x": "GGGGG", "y": "--TTT"} assert gf2.get_slice().to_dict() == {"x": "C", "y": "G"} diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 83ad23e75..512da7a39 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -1,10 +1,12 @@ """Unit tests for Span classes. """ +from itertools import combinations from unittest import TestCase +import numpy import pytest -from cogent3 import DNA +from cogent3 import DNA, make_seq from cogent3.core.location import ( FeatureMap, IndelMap, @@ -302,22 +304,19 @@ def test_map_plus_position(): def test_indel_map_useful_complete(): - im = IndelMap(spans=[LostSpan(3)], parent_length=0) + im = IndelMap.from_spans(spans=[LostSpan(3)], parent_length=0) assert not im.useful assert not im.complete - assert len(im) == im.length == 3 + assert len(im) == 3 -@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) -def test_map_nucleic_reversed(cls): +def test_map_nucleic_reversed(): expect = [(0, 9)] # seq is 9 long # plus coords 012345678 - # +slice ** # plus seq AAACCCTGG - # orig = Aligned(*DNA.make_seq("AAACCCTGG", name="a").parse_out_gaps()) - orig = cls.from_locations(locations=[(0, 9)], parent_length=9) + orig = FeatureMap.from_locations(locations=[(0, 9)], parent_length=9) # minus coords 012345678 # rel coord 01234 # -slice ***** @@ -334,7 +333,7 @@ def test_coordinate(cls): # 01 2 345 seq = DNA.make_seq("AC---G-TAA--") m, _ = seq.parse_out_gaps() - m = cls(spans=tuple(m.spans), parent_length=m.parent_length) + m = cls.from_spans(spans=tuple(m.spans), parent_length=m.parent_length) got = m.get_coordinates() assert got == [(0, 2), (2, 3), (3, 6)] @@ -343,19 +342,18 @@ def test_coordinate(cls): def 
test_gap_coordinate(cls): seq = DNA.make_seq("AC---G-TAA--") m, _ = seq.parse_out_gaps() - m = cls(spans=tuple(m.spans), parent_length=m.parent_length) + m = cls.from_spans(spans=tuple(m.spans), parent_length=m.parent_length) got = m.get_gap_coordinates() - assert got == [(2, 3), (3, 1), (6, 2)] + assert [tuple(c) for c in got] == [(2, 3), (3, 1), (6, 2)] -@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) -def test_gaps(cls): +def test_gaps(): # returns spans corresponding to position on "aligned" seq of gaps # 000000000011 # 012345678901 seq = DNA.make_seq("AC---G-TAA--") m, s = seq.parse_out_gaps() - m = cls(spans=tuple(m.spans), parent_length=m.parent_length) + m = FeatureMap.from_spans(spans=tuple(m.spans), parent_length=m.parent_length) got = [(g.start, g.end) for g in tuple(m.gaps().spans)] assert got == [(2, 5), (6, 7), (10, 12)] @@ -367,41 +365,76 @@ def test_nongap(cls): # 012345678901 seq = DNA.make_seq("AC---G-TAA--") m, _ = seq.parse_out_gaps() - m = cls(spans=tuple(m.spans), parent_length=m.parent_length) + m = cls.from_spans(spans=tuple(m.spans), parent_length=m.parent_length) - got = [(g.start, g.end) for g in m.nongap().spans] + got = [(s.start, s.end) for s in m.nongap()] assert got == [(0, 2), (5, 6), (7, 10)] +def test_spans_gen(): + # returns spans corresponding to position on "aligned" seq of nongaps + # 000000000011 + # 012345678901 + seq = DNA.make_seq("AC---G-TAA--") + expect = [Span(0, 2), LostSpan(3), Span(2, 3), LostSpan(1), Span(3, 6), LostSpan(2)] + m, _ = seq.parse_out_gaps() + gap_data = numpy.array([(2, 3), (3, 1), (6, 2)]) + pos, lengths = gap_data.T + im = IndelMap(gap_pos=pos, gap_lengths=lengths, parent_length=m.parent_length) + got = list(im.spans) + assert got == expect + + +def test_spans_gap_start(): + # returns spans corresponding to position on "aligned" seq of nongaps + # 000000000011 + # 012345678901 + seq = DNA.make_seq("---TAA") + expect = [LostSpan(3), Span(0, 3)] + im, _ = seq.parse_out_gaps() + got = 
list(im.spans) + assert got == expect + + +def test_spans_gen_nogaps(): + # returns spans corresponding to position on "aligned" seq of nongaps + # 000000000011 + # 012345678901 + seq = DNA.make_seq("ACGTAA") + m, _ = seq.parse_out_gaps() + spans = list(m.spans) + assert len(spans) == 1 + assert len(spans[0]) == len(seq) + + def test_round_trip_rich_dict(): seq = DNA.make_seq("AC---G-TAA--") - m, s = seq.parse_out_gaps() - # reversed() reverses the order of spans, but keeps their coordinates - # differs from nucleic reversed, which computes a new relative position - im = IndelMap(spans=m.spans, parent_length=m.parent_length) + im, s = seq.parse_out_gaps() rd = im.to_rich_dict() got = IndelMap.from_rich_dict(rd) assert im is not got assert got.to_rich_dict() == im.to_rich_dict() -def test_serialisable_attr(): - set_vals = {"spans": [Span(0, 2)], "parent_length": 20} - im = IndelMap.from_spans(**set_vals) - got = {k: im._serialisable[k] for k in set_vals} - assert got == set_vals - - def test_terminal_unknown(): - # span idx 01 2 345 6 - seq = DNA.make_seq("-AC---G-TAA--") - m, _ = seq.parse_out_gaps() + # seq idx 01 2 345 6 + # -AC---G-TAA-- + # aligned seq length is 13 + gap_data = numpy.array([[0, 1], [2, 3], [3, 1], [6, 2]]) + gap_pos, gap_lengths = gap_data.T + + m = IndelMap( + gap_pos=gap_pos.copy(), gap_lengths=gap_lengths.copy(), parent_length=6 + ) # not unknown, by default m_spans = tuple(m.spans) assert m_spans[0].lost and not isinstance(m_spans[0], TerminalPadding) # use the constructor arg m = IndelMap( - spans=tuple(m.spans), parent_length=m.parent_length, termini_unknown=True + gap_pos=gap_pos.copy(), + gap_lengths=gap_lengths.copy(), + parent_length=6, + termini_unknown=True, ) m_spans = tuple(m.spans) assert isinstance(m_spans[0], TerminalPadding) @@ -410,9 +443,8 @@ def test_terminal_unknown(): assert m_spans[4].lost and not isinstance(m_spans[2], TerminalPadding) # use the method - m, _ = seq.parse_out_gaps() m = IndelMap( - spans=tuple(m.spans), 
parent_length=m.parent_length + gap_pos=gap_pos.copy(), gap_lengths=gap_lengths.copy(), parent_length=6 ).with_termini_unknown() m_spans = tuple(m.spans) assert isinstance(m_spans[0], TerminalPadding) @@ -421,25 +453,26 @@ def test_terminal_unknown(): assert not isinstance(m_spans[2], TerminalPadding) # no gaps, no effect - seq = DNA.make_seq("ACGTAA") - m, _ = seq.parse_out_gaps() # use the constructor arg + empty = numpy.array([], dtype=int) m = IndelMap( - spans=tuple(m.spans), parent_length=m.parent_length, termini_unknown=True + gap_pos=empty.copy(), + gap_lengths=empty.copy(), + parent_length=6, + termini_unknown=True, ) m_spans = tuple(m.spans) - assert not isinstance(m_spans[0], TerminalPadding) + assert len(m_spans) == 1 and not m_spans[0].lost # use the method m = IndelMap( - spans=tuple(m.spans), parent_length=m.parent_length + gap_pos=empty.copy(), gap_lengths=empty.copy(), parent_length=6 ).with_termini_unknown() m_spans = tuple(m.spans) - assert not isinstance(m_spans[0], TerminalPadding) + assert len(m_spans) == 1 and not m_spans[0].lost -@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) -def test_map_inverse(cls): - m = cls.from_locations(locations=[(0, 2), (4, 6)], parent_length=6) +def test_featuremap_inverse(): + m = FeatureMap.from_locations(locations=[(0, 2), (4, 6)], parent_length=6) assert len(m) == 4 mi = m.inverse() assert len(mi) == 6 @@ -452,6 +485,49 @@ def test_map_inverse(cls): assert got == expect +def test_indelmap_from_aligned_segments(): + locations = [(0, 2), (4, 6)] + im = IndelMap.from_aligned_segments(locations=locations, aligned_length=6) + assert len(im) == 6 + expected_length = sum(e - s for s, e in locations) + assert im.parent_length == expected_length + im_spans = tuple(im.spans) + assert im_spans[1].lost and len(im_spans[1]) == 2 + + +def test_indelmap_from_aligned_segments2(): + locations = [(0, 5), (7, 12), (14, 21), (24, 27)] + im = IndelMap.from_aligned_segments(locations=locations, aligned_length=27) + 
expected_length = sum(e - s for s, e in locations) + assert im.parent_length == expected_length + + +def test_indelmap_inverse(): + im1 = IndelMap( + gap_pos=numpy.array([], dtype=int), + gap_lengths=numpy.array([], dtype=int), + parent_length=4, + ) + im2 = IndelMap( + gap_pos=numpy.array([0], dtype=int), + gap_lengths=numpy.array([2], dtype=int), + parent_length=4, + ) + + assert len(im1) == 4 + assert im1.parent_length == 4 + inv = im1.merge_maps(im2) + assert inv.parent_length == 4 + assert len(inv) == 6 + + fmap1 = im1.to_feature_map() + fmap2 = im2.to_feature_map() + got = fmap2.inverse()[fmap1.inverse()].inverse() + assert inv.get_gap_coordinates() == [ + list(coords) for coords in got.get_gap_coordinates() + ] + + @pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def test_map_offsets(cls): # offsets are absolute starts of spans @@ -463,9 +539,8 @@ def test_map_offsets(cls): assert got == [0, 1, 3, 6, 7, 8, 11] -@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) -def test_map_indexed(cls): - m = cls.from_locations(locations=[(0, 2), (4, 6)], parent_length=6).inverse() +def test_map_indexed(): + m = FeatureMap.from_locations(locations=[(0, 2), (4, 6)], parent_length=6).inverse() indexed = m[2] assert len(indexed) == 1 @@ -473,76 +548,50 @@ def test_map_indexed(cls): def test_compare_map_indexed(): from cogent3.core.alignment import Aligned - seq = DNA.make_seq("--AC-GTAA--".replace("-", "")) - spans = [LostSpan(2), Span(0, 2), LostSpan(2), Span(2, 6), LostSpan(2)] - kwargs = dict(spans=spans, parent_length=len(seq)) - mm = FeatureMap(**kwargs) - im = IndelMap(**kwargs) - ma = Aligned(mm, seq) + raw_seq = "--AC-GTAA--" + im, seq = DNA.make_seq(raw_seq).parse_out_gaps() ia = Aligned(im, seq) - first = ia[0] - assert first == "-" - length = len(ma) + length = len(raw_seq) got = [str(ia[i]) for i in range(length)] - expect = [str(ma[i]) for i in range(length)] + expect = list("--AC-GTAA--") assert got == expect @pytest.mark.parametrize("slice_it", 
(True, False)) -def test_indel_map_zeroed(slice_it): +def test_feature_map_zeroed(slice_it): spans = [LostSpan(2), Span(2, 4), LostSpan(2), Span(4, 8), LostSpan(2)] kwargs = dict(spans=spans, parent_length=6) - mm = FeatureMap(**kwargs) - if slice_it: - mm = mm[:6] - mm_zeroed = mm.zeroed() - - im = IndelMap(**kwargs) + fm = FeatureMap(**kwargs) if slice_it: - im = im[:6] + fm = fm[3:6] - im_zeroed = im.zeroed() - assert im_zeroed.get_coordinates() == mm_zeroed.get_coordinates() - assert im_zeroed.parent_length == mm_zeroed.parent_length + fm_zeroed = fm.zeroed() + assert fm.start > 0 + assert fm_zeroed.start == 0 def test_indelmap_to_feature_map(): spans = [LostSpan(2), Span(2, 4), LostSpan(2), Span(4, 8), LostSpan(2)] - kwargs = dict(spans=spans, parent_length=6) - im = IndelMap(**kwargs) + kwargs = dict(spans=spans, parent_length=8) + im = IndelMap.from_spans(**kwargs) mm = im.to_feature_map() assert mm.get_coordinates() == im.get_coordinates() def test_indelmap_nucleic_reversed(): - spans = [LostSpan(2), Span(2, 4), LostSpan(2), Span(4, 8), LostSpan(2)] - kwargs = dict(spans=spans, parent_length=12) - orig = IndelMap(**kwargs) + # 01 2345 + # --AC--GGGG-- + orig, s = DNA.make_seq("--AC--GGGG--").parse_out_gaps() + plus_coords = [(0, 2), (2, 6)] + assert orig.get_coordinates() == plus_coords rev = orig.nucleic_reversed() - rev_spans = tuple(rev.spans) - assert rev_spans[1].reverse == rev_spans[3].reverse == False - old = orig.nucleic_reversed() - old_spans = tuple(old.spans) - assert rev.get_coordinates() == old.get_coordinates() - - -def test_indelmap_with_reverse_span(): - spans = [ - LostSpan(2), - Span(8, 4, reverse=True), - LostSpan(2), - Span(4, 2, reverse=True), - LostSpan(2), - ] - imap = IndelMap(spans=spans, parent_length=12) - assert (imap.start, imap.end) == (8, 2) - - -def test_indelmap_no_gaps(): - imap = IndelMap.from_locations(locations=(), parent_length=6) - gaps = imap.gaps() - assert not gaps + assert rev.num_gaps == orig.num_gaps + 
minus_coords = [(0, 4), (4, 6)] + assert rev.get_coordinates() == minus_coords + # reversing again returns original + back = rev.nucleic_reversed() + assert orig.get_coordinates() == back.get_coordinates() def test_get_coords(): @@ -553,6 +602,65 @@ def test_get_coords(): assert coords == spans +def test_indel_map_get_coords(): + """get_coordinates should return raw coordinates matching input""" + imap = IndelMap( + gap_pos=numpy.array([9]), gap_lengths=numpy.array([10]), parent_length=20 + ) + locations = [(0, 9), (9, 20)] + coords = imap.get_coordinates() + assert coords == locations + + +def test_indel_map_get_gap_coords(): + gap_data = numpy.array([(2, 3), (3, 1), (6, 2)]) + gap_pos, lengths = gap_data[:, 0], gap_data[:, 1] + m = IndelMap(gap_pos=gap_pos, gap_lengths=lengths, parent_length=10) + got = m.get_gap_coordinates() + assert got == gap_data.tolist() + + +@pytest.mark.parametrize( + "coords", ([(0, 3), (7, 11)], [(0, 3)], [(2, 4), (6, 10)], [(4, 6)]) +) +def test_indelmap_joined_segments(coords): + raw = "--AC--GGGG--" + expect, _ = DNA.make_seq("".join(raw[s:e] for s, e in coords)).parse_out_gaps() + imap, s = DNA.make_seq(raw).parse_out_gaps() + got = imap.joined_segments(coords) + assert got.gap_pos.tolist() == expect.gap_pos.tolist() + assert got.cum_gap_lengths.tolist() == expect.cum_gap_lengths.tolist() + + +def test_indelmap_slice_terminating(): + raw = "-CB-A--" + start, end = 4, 6 + expect, _ = DNA.make_seq(raw[start:end]).parse_out_gaps() + imap, s = DNA.make_seq(raw).parse_out_gaps() + got = imap[start:end] + assert got.gap_pos.tolist() == expect.gap_pos.tolist() + assert got.cum_gap_lengths.tolist() == expect.cum_gap_lengths.tolist() + + +def test_indelmap_slice_innner_gap(): + start, end = 4, 6 + raw = "--AC--GGGG--" + expect, _ = DNA.make_seq(raw[start:end]).parse_out_gaps() + imap, _ = DNA.make_seq(raw).parse_out_gaps() + imap = imap[start:end] + assert imap.gap_pos.tolist() == expect.gap_pos.tolist() + + +def 
test_indelmap_slice_cum_length(): + start, end = 7, 11 + raw = "--AC--GGGG--" + expect, _ = DNA.make_seq(raw[start:end]).parse_out_gaps() + imap, _ = DNA.make_seq(raw).parse_out_gaps() + imap = imap[start:end] + assert imap.gap_pos.tolist() == expect.gap_pos.tolist() + assert imap.cum_gap_lengths.tolist() == expect.cum_gap_lengths.tolist() + + def test_get_coords_invalid_order(): """get_coordinates should return raw coordinates matching input""" @@ -562,6 +670,331 @@ def test_get_coords_invalid_order(): FeatureMap.from_locations(locations=spans, parent_length=100) +def test_indelmap_slice_gap_into_seq(): + pos, lengths = numpy.array([[3, 1], [7, 1]]).T + gaps = IndelMap(gap_pos=pos, gap_lengths=lengths, parent_length=7) + # example seq + # 012345678 + # 012 3456 + # ACA-CGAC- + # slicing from aligned coords 3-5, gives '-C' + got = gaps[3:5] + assert got.gap_pos.tolist() == [0] + assert got.cum_gap_lengths.tolist() == [1] + + +def test_indelmap_slice_one_gap(): + pos, lengths = numpy.array([[3, 6]]).T + gaps = IndelMap(gap_pos=pos, gap_lengths=lengths, parent_length=24) + # example seq + # 1 + # 01234567890 + # 012 3456789 + # GTG------GTAGAAGTTCCAAATAATGAA + # slicing from aligned coords 3-5, gives 'TG------G' + # in sliced seq, gap at 2 + got = gaps[1:10] + assert got.gap_pos.tolist() == [2] + assert got.cum_gap_lengths.tolist() == [6] + + +@pytest.mark.parametrize( + "data", + ( + "AB---CD--EF", + "---ABCD--EF", + "ABCD---EF--", + "-----ABCDEF", + "ABCDEF-----", + "-ABCDEF----", + "-A-B-C-D-EF", + "A-B-C-D-EF-", + ), +) +@pytest.mark.parametrize("index", range(4, 6)) # the ungapped sequence is 6 long +def test_gapped_convert_seq2aln(data, index): + # converting a sequence index to alignment index + ungapped = data.replace("-", "") + seq = make_seq(data, moltype="text") + gaps, s = seq.parse_out_gaps() + idx = gaps.get_align_index(index) + assert data[idx] == ungapped[index] + + +@pytest.mark.parametrize( + "data", + ( + "AB---CD--EF", + "---ABCD--EF", + 
"ABCD---EF--", + "-----ABCDEF", + "ABCDEF-----", + "-ABCDEF----", + "-A-B-C-D-EF", + "A-B-C-D-EF-", + ), +) +@pytest.mark.parametrize( + "start,end", list(combinations(range(6), 2)) +) # the ungapped sequence is 6 long +def test_indelmap_align_index_slice_stop(data, start, end): + # converting a sequence index to alignment index + ungapped = data.replace("-", "") + seq = make_seq(data, moltype="text") + gaps, s = seq.parse_out_gaps() + stop = gaps.get_align_index(end, slice_stop=True) + assert data[stop - 1] == ungapped[end - 1] + + +@pytest.mark.parametrize( + "data", + ( + "AB---CD--EF", + "---ABCD--EF", + "ABCD---EF--", + "-----ABCDEF", + "ABCDEF-----", + "-ABCDEF----", + "-A-B-C-D-EF", + "A-B-C-D-EF-", + ), +) +@pytest.mark.parametrize("index", range(6)) # the ungapped sequence is 6 long +def test_gapped_convert_seq2aln2seq(data, index): + # round tripping seq to alignment to seq indices + gaps, s = make_seq(data, moltype="text").parse_out_gaps() + align_index = gaps.get_align_index(index) + got = gaps.get_seq_index(align_index) + assert got == index + + +@pytest.mark.parametrize("expect", range(10)) +def test_indelmap_get_seq_index_negative(expect): + parent_length = 10 + gap_pos = [0] + gap_lengths = [3] + imap = IndelMap( + gap_pos=numpy.array(gap_pos), + gap_lengths=numpy.array(gap_lengths), + parent_length=parent_length, + ) + neg_index = expect - parent_length + got = imap.get_seq_index(neg_index) + assert got == expect + + +@pytest.mark.parametrize("expect", range(10)) +def test_indelmap_get_align_index_negative(expect): + parent_length = 10 + gap_pos = [0] + gap_lengths = [3] + gap_length = gap_lengths[0] + imap = IndelMap( + gap_pos=numpy.array(gap_pos), + gap_lengths=numpy.array(gap_lengths), + parent_length=parent_length, + ) + neg_index = expect + gap_length - len(imap) + got = imap.get_seq_index(neg_index) + assert got == expect + + +@pytest.mark.parametrize( + "data", + ( + "AB--CDE-FG", + "--ABC-DEFG", + "AB--CDE-FG--", + "ABCDE--FG---", + 
"-----ABCDEFG", + "-A-B-C-D-E-F-G-", + ), +) +@pytest.mark.parametrize("seq_index", range(7)) +def test_gapped_convert_aln2seq_nongap_char(data, seq_index): + # test alignment indexes when aligned position is NOT a gap + ungapped = "ABCDEFG" + align_index = data.find(ungapped[seq_index]) + gaps, seq = make_seq(data, moltype="text").parse_out_gaps() + idx = gaps.get_seq_index(align_index) + assert idx == seq_index + + +def _find_nth_gap_index(data: str, n: int) -> int: + num = -1 + for i, c in enumerate(data): + if c == "-": + num += 1 + if num == n: + return i + raise ValueError(f"{data=}, {n=}") + + +def _get_expected_seqindex(data: str, align_index: int) -> int: + # compute the expected seqindex + refseq = data.replace("-", "") + got = data[align_index:].lstrip("-") + return refseq.find(got[0]) if got else len(refseq) + + +@pytest.mark.parametrize( + "data", + ( + "AB-----CDE-F--G", + "----ABC-DEFG---", + "AB--CDE-FG-----", + "ABCDE--FG------", + "--------ABCDEFG", + "-A-B-C-D-E-F-G-", + ), +) +@pytest.mark.parametrize("gap_number", range(8)) +def test_gapped_convert_aln2seq_gapchar(data, gap_number): + # test alignment indexes when aligned position IS a gap + # in this case we expect the position of the next non-gap + # to be the result + # find the alignment index corresponding to the + align_index = _find_nth_gap_index(data, gap_number) + assert data[align_index] == "-", (data, gap_number) + # find nearest non-gap + seq_index = _get_expected_seqindex(data, align_index) + gaps, seq = make_seq(data, moltype="text").parse_out_gaps() + idx = gaps.get_seq_index(align_index) + assert idx == seq_index + + +def test_gapped_convert_aln2seq_invalid(): + gaps, seq = make_seq("AC--GTA-TG", moltype="dna").parse_out_gaps() + with pytest.raises(NotImplementedError): + # absolute value of negative indices must be < seq length + gaps.get_seq_index(-100) + + +@pytest.mark.parametrize( + "invalid_slice", + (slice(None, None, -1), slice(None, None, 2)), +) +def 
test_gap_pos_invalid_slice(invalid_slice): + pos, lengths = numpy.array([[1, 3]], dtype=numpy.int32).T + gp = IndelMap(gap_pos=pos, gap_lengths=lengths, parent_length=20) + with pytest.raises(NotImplementedError): + gp[invalid_slice] + + +@pytest.mark.parametrize( + "slice", + ( + slice(3, 7), + slice(20, None), + ), +) +def test_no_gaps_in_slice(slice): + # aligned length is 25 + seq_length = 20 + gap_length = 5 + pos, lengths = numpy.array([[10, gap_length]], dtype=numpy.int32).T + gp = IndelMap(gap_pos=pos, gap_lengths=lengths, parent_length=seq_length) + got = gp[slice] + assert not got.num_gaps + start = slice.start or 0 + stop = slice.stop or (seq_length + gap_length) + assert len(got) == stop - start + + +def test_len_gapped(): + seq_length = 20 + gap_length = 5 + pos, lengths = numpy.array([[10, gap_length]], dtype=numpy.int32).T + gp = IndelMap(gap_pos=pos, gap_lengths=lengths, parent_length=seq_length) + assert len(gp) == (seq_length + gap_length) + + +def test_all_gaps_in_slice(): + # slicing GapPositions + # sample seq 1 + data = "AC--GTA-TG" + gp, seq = make_seq(data, moltype="dna").parse_out_gaps() + sl = slice(1, 9) + + got = gp[sl] + expect_gaps, expect_seq = make_seq(data[sl], moltype="dna").parse_out_gaps() + assert got.get_gap_coordinates() == expect_gaps.get_gap_coordinates() + assert got.parent_length == 5 + + +@pytest.mark.parametrize( + "data", + ( + "----GTA-TG", + "AC--GTA---", + "AC--GTA-TG", + "A-C-G-T-A-", + "-A-C-G-T-A", + "ACGTAACGTA", + "----------", + ), +) +@pytest.mark.parametrize( + "slice", + [slice(i, j) for i, j in combinations(range(10), 2)], +) +def test_variant_slices(data, slice): + gaps, _ = make_seq(data, moltype="dna").parse_out_gaps() + got = gaps[slice] + + expect_gaps, expect_seq = make_seq(data[slice], moltype="dna").parse_out_gaps() + assert got.parent_length == len(expect_seq) + assert (got.gap_pos == expect_gaps.gap_pos).all() + assert (got.cum_gap_lengths == expect_gaps.cum_gap_lengths).all() + + +def 
test_indelmap_joined(): + pos = numpy.array([0, 1]) + cum_len = numpy.array([1, 5]) + imap = IndelMap(gap_pos=pos, cum_gap_lengths=cum_len, parent_length=5) + fmap = FeatureMap.from_locations(locations=[(0, 1), (2, 5)], parent_length=10) + coords = fmap.get_coordinates() + got = imap.joined_segments(coords) + assert got.num_gaps == 1 + assert got.gap_pos[0] == 0 + assert got.cum_gap_lengths[0] == (1 + 5 - 2) + + +def test_indel_map_sliced_with_negative(): + imap = IndelMap( + gap_pos=numpy.array([0]), cum_gap_lengths=numpy.array([1]), parent_length=14 + ) + got = imap[:-3] + assert got.parent_length == 14 - 3 + + +def test_indelmap_roundtrip_json(): + from cogent3.util.deserialise import deserialise_object + + imap = IndelMap( + gap_pos=numpy.array([0, 9]), + cum_gap_lengths=numpy.array([1, 3]), + parent_length=14, + ) + got = deserialise_object(imap.to_json()) + assert (got.gap_pos == imap.gap_pos).all() + assert (got.cum_gap_lengths == imap.cum_gap_lengths).all() + assert got.parent_length == imap.parent_length + + +def test_featuremap_roundtrip_json(): + from cogent3.util.deserialise import deserialise_object + + fmap = FeatureMap.from_locations( + locations=[[0, 9], [20, 23]], + parent_length=140, + ) + got = deserialise_object(fmap.to_json()) + coords = fmap.get_coordinates() + assert coords == [(0, 9), (20, 23)] + assert got.parent_length == fmap.parent_length == 140 + + def test_gap_coords_to_map(): """construct a Map from coordinates of gap alone""" gap_coords = {0: 1, 2: 2, 4: 1, 7: 2} diff --git a/tests/test_draw/test_dotplot.py b/tests/test_draw/test_dotplot.py index 38c8d6a63..19180a7c0 100644 --- a/tests/test_draw/test_dotplot.py +++ b/tests/test_draw/test_dotplot.py @@ -6,7 +6,6 @@ Dotplot, _convert_input, get_align_coords, - len_seq, not_gap, ) @@ -15,7 +14,7 @@ class TestUtilFunctions(TestCase): def test_len_seq(self): """returns length of sequence minus gaps""" m, seq = DNA.make_seq("ACGGT--A").parse_out_gaps() - self.assertEqual(len_seq(m), 6) + 
self.assertEqual(m.parent_length, 6) def test_not_gap(self): """distinguishes Map instances that include gap or not""" diff --git a/tests/test_parse/test_cigar.py b/tests/test_parse/test_cigar.py index 5f09808ff..e8547afb4 100644 --- a/tests/test_parse/test_cigar.py +++ b/tests/test_parse/test_cigar.py @@ -49,7 +49,7 @@ def test_slice_cigar(self): slicealn1 = self.seq[loc1[0] : loc1[1]].gapped_by_map(map1) assert ori1 == slicealn1 else: - assert map1.length == len(ori1) + assert len(map1) == len(ori1) # test by_align = False map2, loc2 = slice_cigar(self.cigar_text, start, end, by_align=False) @@ -84,7 +84,7 @@ def test_CigarParser(self): start=start, end=end, ) - assert cmp_aln.to_dict() == slice_aln.to_dict() + assert cmp_aln.to_dict() == slice_aln.to_dict(), (start, end) if __name__ == "__main__": From f8fe8fb2f6191696e10e1077ea0c1181bcdaef20 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 23 Apr 2024 15:26:08 +1000 Subject: [PATCH 17/62] API: IndelMap.merge_map(parent_length) sets a specific value [NEW] argument is optional, with the default being self.parent_length --- src/cogent3/core/location.py | 7 +++++-- tests/test_core/test_location.py | 20 +++++++++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 5229f29a2..4f8f274c6 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1697,13 +1697,15 @@ def get_gap_coordinates(self): cum_lengths[1:] = diffs return numpy.array([self.gap_pos, cum_lengths]).T.tolist() - def merge_maps(self, other): + def merge_maps(self, other, parent_length: Optional[int] = None): """merge gaps of other with self Parameters ---------- indel_map instance for same sequence + parent_length + overrides property """ unique_pos = numpy.union1d(self.gap_pos, other.gap_pos) gap_lengths = numpy.zeros(unique_pos.shape, dtype=self.cum_gap_lengths.dtype) @@ -1711,10 +1713,11 @@ def merge_maps(self, other): other_lengths = 
other.get_gap_lengths() _update_lengths(unique_pos, gap_lengths, self.gap_pos, self_lengths) _update_lengths(unique_pos, gap_lengths, other.gap_pos, other_lengths) + parent_length = parent_length or self.parent_length return self.__class__( gap_pos=unique_pos, gap_lengths=gap_lengths, - parent_length=self.parent_length, + parent_length=parent_length, ) def joined_segments(self, coords: list[tuple[int, int]]): diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 512da7a39..ece8c9a84 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -502,7 +502,7 @@ def test_indelmap_from_aligned_segments2(): assert im.parent_length == expected_length -def test_indelmap_inverse(): +def test_indelmap_merge(): im1 = IndelMap( gap_pos=numpy.array([], dtype=int), gap_lengths=numpy.array([], dtype=int), @@ -528,6 +528,24 @@ def test_indelmap_inverse(): ] +def test_indelmap_merge_parent_length(): + im1 = IndelMap( + gap_pos=numpy.array([], dtype=int), + gap_lengths=numpy.array([], dtype=int), + parent_length=4, + ) + im2 = IndelMap( + gap_pos=numpy.array([0], dtype=int), + gap_lengths=numpy.array([2], dtype=int), + parent_length=4, + ) + # providing a value for parent_length overrides standard + im3 = im1.merge_maps(im2) + ov = im1.merge_maps(im2, parent_length=20) + assert ov.parent_length != im2.parent_length + assert ov.parent_length == 20 + + @pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) def test_map_offsets(cls): # offsets are absolute starts of spans From 6d5a1acef93e61b8ef92de64c33b97a293048a15 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 23 Apr 2024 15:30:44 +1000 Subject: [PATCH 18/62] ENH: codon alignment now explicitly trims sequences during alignment [CHANGED] this was previously implicit during the string conversion of Aligned() instances, relying on the behaviour of the Sequence.gapped_by_map() method. 
The problem was it allowed for a mismatch in the length of IndelMap's and the associated sequences. We now enforce consistency between IndelMap.parent_length, and the length of the associated instance. This is done in AlignablePOG._calcAligneds() --- src/cogent3/align/pairwise.py | 16 ++++++++++++++-- tests/test_app/test_align.py | 10 ++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/cogent3/align/pairwise.py b/src/cogent3/align/pairwise.py index 14aa9a720..f91ae6420 100644 --- a/src/cogent3/align/pairwise.py +++ b/src/cogent3/align/pairwise.py @@ -478,8 +478,20 @@ def _calcAligneds(self, children): aligneds = [] for dim, child in enumerate(children): for seq_name, aligned in child.aligneds: - new_map = aligned.map.merge_maps(maps[dim] * word_length) - aligneds.append((seq_name, Aligned(new_map, aligned.data))) + # if word_length != 1 then maps are forced to have + # sequences lengths that are modulo word_length + # Likewise, if the data is not modulo word_length, + # it is trimmed to match + new_map = aligned.map.merge_maps( + maps[dim] * word_length, + parent_length=maps[dim].parent_length * word_length, + ) + data = ( + aligned.data + if new_map.parent_length == len(aligned.data) + else aligned.data[: new_map.parent_length] + ) + aligneds.append((seq_name, Aligned(new_map, data))) return aligneds def backward(self): diff --git a/tests/test_app/test_align.py b/tests/test_app/test_align.py index a6f12f563..5bf8298f3 100644 --- a/tests/test_app/test_align.py +++ b/tests/test_app/test_align.py @@ -10,6 +10,7 @@ DNA, get_app, get_moltype, + load_aligned_seqs, make_aligned_seqs, make_tree, make_unaligned_seqs, @@ -742,3 +743,12 @@ def test_aln_two(): seqs = orig.degap() aln = aligner.main(seqs) assert aln.to_dict() == expect + + +def test_codon_incomplete(DATA_DIR): + names = ["FlyingFox", "DogFaced", "FreeTaile"] + aln = load_aligned_seqs(DATA_DIR / "brca1.fasta", moltype="dna") + seqs = aln.take_seqs(names)[2700:3000].degap() + aligner = 
align_app.progressive_align("codon") + aln = aligner(seqs) + assert aln # will fail if aln is a NotCompleted instance From 4bd4800c35ce6fb89854e6570eb1a8e7257d841e Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 23 Apr 2024 15:40:03 +1000 Subject: [PATCH 19/62] TST: add clause to test --- tests/test_app/test_align.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_app/test_align.py b/tests/test_app/test_align.py index 5bf8298f3..288a84230 100644 --- a/tests/test_app/test_align.py +++ b/tests/test_app/test_align.py @@ -752,3 +752,6 @@ def test_codon_incomplete(DATA_DIR): aligner = align_app.progressive_align("codon") aln = aligner(seqs) assert aln # will fail if aln is a NotCompleted instance + # now make sure the resulting ungapped sequences are modulo 3 + seqs = aln.degap().to_dict().values() + assert {len(s) % 3 for s in seqs} == {0} From cacccc2c91e8728c27c852b7f330ec5db49ebfa2 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 23 Apr 2024 15:48:32 +1000 Subject: [PATCH 20/62] MAINT: move comment to better place --- src/cogent3/align/pairwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cogent3/align/pairwise.py b/src/cogent3/align/pairwise.py index f91ae6420..2214f0edb 100644 --- a/src/cogent3/align/pairwise.py +++ b/src/cogent3/align/pairwise.py @@ -480,12 +480,12 @@ def _calcAligneds(self, children): for seq_name, aligned in child.aligneds: # if word_length != 1 then maps are forced to have # sequences lengths that are modulo word_length - # Likewise, if the data is not modulo word_length, - # it is trimmed to match new_map = aligned.map.merge_maps( maps[dim] * word_length, parent_length=maps[dim].parent_length * word_length, ) + # Likewise, if the data is not modulo word_length, + # it is trimmed to match data = ( aligned.data if new_map.parent_length == len(aligned.data) From 37bd424bcf282c6add0bea6e5b690e59725742ff Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Wed, 24 Apr 2024 21:10:47 +1000 
Subject: [PATCH 21/62] MAINT: use IndelMap for dotplotting [CHANGED] reimplement the get_align_coords() function, now simpler and more robust [NEW] added IndelMap.get_gap_align_coordinates() returns a 2D array of gap start, end in alignment coordinates. Used by get_align_coords(). --- src/cogent3/core/location.py | 5 +++ src/cogent3/draw/dotplot.py | 73 ++++++++++++++++----------------- tests/test_draw/test_dotplot.py | 48 +++++++++++++++------- 3 files changed, 74 insertions(+), 52 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 4f8f274c6..a19bf5170 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1697,6 +1697,11 @@ def get_gap_coordinates(self): cum_lengths[1:] = diffs return numpy.array([self.gap_pos, cum_lengths]).T.tolist() + def get_gap_align_coordinates(self) -> NDArray[int]: + """returns [(gap start, gap end), ...] in alignment indices""" + starts, ends = _gap_spans(self.gap_pos, self.cum_gap_lengths) + return numpy.array([starts, ends]).T + def merge_maps(self, other, parent_length: Optional[int] = None): """merge gaps of other with self diff --git a/src/cogent3/draw/dotplot.py b/src/cogent3/draw/dotplot.py index c08795777..6c0ecf73f 100644 --- a/src/cogent3/draw/dotplot.py +++ b/src/cogent3/draw/dotplot.py @@ -1,3 +1,5 @@ +import numpy + from cogent3.align.pycompare import ( MatchedSeqPaths, SeqKmers, @@ -27,9 +29,23 @@ def suitable_threshold(window, desired_probability): return matches -def not_gap(span): - """whether a span corresponds to a non-gap""" - return span.num_gaps == 0 +def _ungapped_spans( + starts_ends: list[list[int, int]], align_length: int, aligned=False +) -> numpy.ndarray: + """returns numpy array of [(non gap start, non gap stop), ...}""" + if len(starts_ends) == 0: + data = [(0, align_length)] if aligned else [] + return numpy.array(data, dtype=int) + + if starts_ends[0][0] != 0: + # does not start with a gap, so we adjust to get an ungapped span at + # the 
start + starts_ends = [(0, 0)] + starts_ends + if starts_ends[-1][1] != align_length: + # does not end with a gap, so we adjust to get an ungapped span at + # the end + starts_ends.append((align_length, align_length)) + return numpy.array(starts_ends).flatten()[1:-1].reshape((len(starts_ends) - 1, 2)) def _convert_input(seq, moltype): @@ -54,43 +70,24 @@ def get_align_coords(map1, map2, aligned=False) -> MatchedSeqPaths: """sequence coordinates of aligned segments""" from cogent3.align.pycompare import segment - paths = MatchedSeqPaths("Alignment") - if not_gap(map1) and not_gap(map2): - # no gaps - if aligned: - assert len(map1) == len(map2), "Aligned sequences inconsistent length" - paths[0].append((segment(0, len(map1)), segment(0, len(map2)))) - return paths - assert len(map1) == len(map2), "aligned sequences not equal length" - # diagonals are places where both sequences are NOT gaps - # so we record start of a diagonal and when we hit a 'gap' - # in either sequence, we have the end of the diagonal - - start_x = start_y = None - for i in range(len(map1)): - x_not_gap = not_gap(map1[i]) - y_not_gap = not_gap(map2[i]) - if x_not_gap and y_not_gap and start_x is None: - start_x = map1[:i].parent_length - start_y = map2[:i].parent_length - elif (not x_not_gap or not y_not_gap) and start_x is not None: - paths[start_y - start_x].append( - ( - segment(start_x, map1[:i].parent_length - 1), - segment(start_y, map2[:i].parent_length - 1), - ) - ) - start_x = start_y = None - - if start_x is not None: - paths[start_y - start_x].append( - ( - segment(start_x, map1.parent_length - 1), - segment(start_y, map2.parent_length - 1), - ) - ) + # we get the gap coordinates in alignment indices for both sequences + # sorting this allows us to trivially identify the alignment indices of + # ungapped segments, + # e.g. [(gap start 1, gap end 1), (gap start 2, gap end 2),... + # then [(gap end 1, gap start 2), ... 
+ # is what we need to identify the segments for plotting + starts_ends = map1.get_gap_align_coordinates().tolist() + starts_ends.extend(map2.get_gap_align_coordinates().tolist()) + starts_ends.sort() + starts_ends = _ungapped_spans(starts_ends, len(map1), aligned=aligned) + paths = MatchedSeqPaths("Alignment") + for align_start, align_end in starts_ends: + # ends are inclusive for plotting + s1 = segment(map1.get_seq_index(align_start), map1.get_seq_index(align_end) - 1) + s2 = segment(map2.get_seq_index(align_start), map2.get_seq_index(align_end) - 1) + paths.append(s1, s2) return paths diff --git a/tests/test_draw/test_dotplot.py b/tests/test_draw/test_dotplot.py index 19180a7c0..30b3c44a7 100644 --- a/tests/test_draw/test_dotplot.py +++ b/tests/test_draw/test_dotplot.py @@ -1,13 +1,11 @@ from unittest import TestCase +import numpy + from cogent3 import DNA, make_unaligned_seqs from cogent3.core.alignment import Aligned, ArrayAlignment -from cogent3.draw.dotplot import ( - Dotplot, - _convert_input, - get_align_coords, - not_gap, -) +from cogent3.core.location import IndelMap +from cogent3.draw.dotplot import Dotplot, _convert_input, get_align_coords class TestUtilFunctions(TestCase): @@ -16,12 +14,6 @@ def test_len_seq(self): m, seq = DNA.make_seq("ACGGT--A").parse_out_gaps() self.assertEqual(m.parent_length, 6) - def test_not_gap(self): - """distinguishes Map instances that include gap or not""" - m, seq = DNA.make_seq("ACGGT--A").parse_out_gaps() - self.assertTrue(not_gap(m[0])) - self.assertFalse(not_gap(m[5])) - def test_convert_input(self): """converts data for dotplotting""" m, seq = DNA.make_seq("ACGGT--A").parse_out_gaps() @@ -56,7 +48,8 @@ def test_get_align_coords(self): m1, seq1 = DNA.make_seq("ACGGTTTA").parse_out_gaps() m2, seq2 = DNA.make_seq("GGGGTTTA").parse_out_gaps() paths = get_align_coords(m1, m2, aligned=True) - self.assertEqual(paths.get_coords(), ([0, len(seq1)], [0, len(seq1)])) + # display ranges are inclusive, thus length - 1 + 
self.assertEqual(paths.get_coords(), ([0, len(seq1) - 1], [0, len(seq1) - 1])) # raises an exception if the Aligned seqs are different lengths m1, seq1 = DNA.make_seq("ACGGTTTA").parse_out_gaps() @@ -66,6 +59,13 @@ def test_get_align_coords(self): def test_display2d(self): """correctly constructs a Display2d""" + # 111111 + # 0123456789012345 + # 11111 + # 012345678901234 + # -TGATGTAAGGTAGTT + # CTGG---AAG---GGT + # 0123 456 789 # check alignment coords are correct dp = Dotplot("-TGATGTAAGGTAGTT", "CTGG---AAG---GGT", is_aligned=True, window=5) expect = [0, 2, None, 6, 8, None, 12, 14], [1, 3, None, 4, 6, None, 7, 9] @@ -79,7 +79,7 @@ def test_display2d(self): self.assertEqual(traces[-1].name, "Alignment") self.assertEqual(traces[0].name, "+ strand") # check + strand has integers/float/None - expect = {int, float, type(None)} + expect = {int, float, type(None), numpy.int64} for trace in traces: for axis in "xy": got = {type(v) for v in trace[axis]} @@ -146,3 +146,23 @@ def test_dotplot_title(self): ) dp = seqs.dotplot("seq1", "seq3", title="") self.assertEqual(dp.figure.layout.title, "") + + +def test_aligned_path(): + imap1 = IndelMap( + gap_pos=numpy.array([4, 5, 6, 8, 10]), + cum_gap_lengths=numpy.array([6, 10, 12, 14, 15]), + parent_length=10, + ) + imap2 = IndelMap( + gap_pos=numpy.array([], dtype=int), + cum_gap_lengths=numpy.array([], dtype=int), + parent_length=25, + ) + + coords = get_align_coords(imap1, imap2) + + assert coords.get_coords() == ( + [0, 3, None, 4, 4, None, 5, 5, None, 6, 7, None, 8, 9], + [0, 3, None, 10, 10, None, 15, 15, None, 18, 19, None, 22, 23], + ) From d1d18e8597035a8f0e5490bbacde744f64660150 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 25 Apr 2024 07:47:10 +1000 Subject: [PATCH 22/62] TST: address windows error --- tests/test_draw/test_dotplot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_draw/test_dotplot.py b/tests/test_draw/test_dotplot.py index 30b3c44a7..334961bf9 100644 --- 
a/tests/test_draw/test_dotplot.py +++ b/tests/test_draw/test_dotplot.py @@ -79,7 +79,7 @@ def test_display2d(self): self.assertEqual(traces[-1].name, "Alignment") self.assertEqual(traces[0].name, "+ strand") # check + strand has integers/float/None - expect = {int, float, type(None), numpy.int64} + expect = {int, float, type(None), numpy.int64, numpy.int32} for trace in traces: for axis in "xy": got = {type(v) for v in trace[axis]} From b657bea56b02b9574adb62bb75a82f1b57bbdc62 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 25 Apr 2024 14:39:47 +1000 Subject: [PATCH 23/62] MAINT: dotplot only check map lengths if seqs are aligned --- src/cogent3/draw/dotplot.py | 3 ++- tests/test_draw/test_dotplot.py | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/cogent3/draw/dotplot.py b/src/cogent3/draw/dotplot.py index 6c0ecf73f..5bd0b2f1b 100644 --- a/src/cogent3/draw/dotplot.py +++ b/src/cogent3/draw/dotplot.py @@ -70,7 +70,8 @@ def get_align_coords(map1, map2, aligned=False) -> MatchedSeqPaths: """sequence coordinates of aligned segments""" from cogent3.align.pycompare import segment - assert len(map1) == len(map2), "aligned sequences not equal length" + if aligned: + assert len(map1) == len(map2), "aligned sequences not equal length" # we get the gap coordinates in alignment indices for both sequences # sorting this allows us to trivially identify the alignment indices of diff --git a/tests/test_draw/test_dotplot.py b/tests/test_draw/test_dotplot.py index 334961bf9..a113c039b 100644 --- a/tests/test_draw/test_dotplot.py +++ b/tests/test_draw/test_dotplot.py @@ -166,3 +166,12 @@ def test_aligned_path(): [0, 3, None, 4, 4, None, 5, 5, None, 6, 7, None, 8, 9], [0, 3, None, 10, 10, None, 15, 15, None, 18, 19, None, 22, 23], ) + + +def test_dotplot_unaligned(): + seqs = make_unaligned_seqs(dict(a="ACGGT", b="CGTT"), moltype="dna") + dp = seqs.dotplot(window=3, k=2) + assert dp + # trigger building traces + _ = dp.figure + assert 
len(dp.traces[0].x) From 96aa98fc5b922cafa7c56cd47469116c5eb32e23 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 25 Apr 2024 15:13:06 +1000 Subject: [PATCH 24/62] TST: IndelMap.get_gap_align_coordinates() [CHANGED] this method always returns an array with shape (#, 2) where # is the number of rows. --- src/cogent3/core/location.py | 13 +++++++++-- tests/test_core/test_location.py | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index a19bf5170..f1fe053c5 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1698,9 +1698,18 @@ def get_gap_coordinates(self): return numpy.array([self.gap_pos, cum_lengths]).T.tolist() def get_gap_align_coordinates(self) -> NDArray[int]: - """returns [(gap start, gap end), ...] in alignment indices""" + """returns [(gap start, gap end), ...] in alignment indices + + Returns + ------- + A 2D numpy array of integers. If the result is empty, it still + has shape (0, 2). 
+ """ starts, ends = _gap_spans(self.gap_pos, self.cum_gap_lengths) - return numpy.array([starts, ends]).T + result = numpy.array([starts, ends]).T + if not len(result): + result = result.reshape((0, 2)) + return result def merge_maps(self, other, parent_length: Optional[int] = None): """merge gaps of other with self diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index ece8c9a84..45fdc2915 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -1037,3 +1037,41 @@ def test_gap_coords_to_map(): # and gaps outside sequence with pytest.raises(ValueError): gap_coords_to_map({20: 1}, len(seq)) + + +@pytest.mark.parametrize( + "seq, expected", + [ + ("ACGTACGT", []), + ("----ACGTACGT----", [(0, 4), (12, 16)]), + ("ACGT----ACGT", [4, 8]), + ("----", [[0, 4]]), + ("----ACGT", [[0, 4]]), + ("ACGTACGT----", [(8, 12)]), + ], +) +def test_get_gap_align_coordinates1(seq, expected): + expected = numpy.array(expected, dtype=int) + if not len(expected): + expected = expected.reshape((0, 2)) + im, _ = make_seq(seq).parse_out_gaps() + result = im.get_gap_align_coordinates() + assert (result == expected).all(), f"{expected=}, {result=}" + + +@pytest.mark.parametrize( + "seq, expected", + [ + ("", []), + ("A", []), + ("-", [[0, 1]]), + ], +) +def test_get_gap_align_coordinates_edge_cases(seq, expected): + expected = numpy.array(expected, dtype=int) + if not len(expected): + expected = expected.reshape((0, 2)) + + im, _ = make_seq(seq).parse_out_gaps() + result = im.get_gap_align_coordinates() + assert (result == expected).all(), f"{expected=}, {result=}" From d8775ae6c78a000b45ddba5ab1e777e0ba5f1c4d Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 25 Apr 2024 15:55:18 +1000 Subject: [PATCH 25/62] MAINT: improve type hints --- src/cogent3/core/location.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 
f1fe053c5..7acbfeeb8 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -522,6 +522,9 @@ def __repr__(self): return f"?{self.length}?" +SpanTypes = Union[Span, _LostSpan] + + class Map: # pragma: no cover """A map holds a list of spans.""" @@ -1201,7 +1204,7 @@ def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSp def spans_to_gap_coords( - indel_spans: list[Union[Span, LostSpan]], dtype: Union[type, str] = numpy.int32 + indel_spans: list[SpanTypes], dtype: Union[type, str] = numpy.int32 ) -> tuple[numpy.ndarray, numpy.ndarray]: """returns coordinates of sequence gaps @@ -1299,8 +1302,10 @@ def __post_init__(self, gap_lengths): self.cum_gap_lengths.flags.writeable = False self._serialisable.pop("gap_lengths", None) + T = Union[list[SpanTypes], tuple[SpanTypes]] + @classmethod - def from_spans(cls, spans, parent_length, termini_unknown=False): + def from_spans(cls, spans: T, parent_length: int, termini_unknown: bool = False): gap_pos, cum_lengths = spans_to_gap_coords(spans) return cls( gap_pos=gap_pos, @@ -1604,7 +1609,7 @@ def offsets(self): starts, ends = _gap_spans(self.gap_pos, self.cum_gap_lengths) return numpy.array([starts, ends]).T.flatten()[:-1].tolist() - def nongap(self) -> Iterator[Union[Span, LostSpan]]: + def nongap(self) -> Iterator[SpanTypes]: """ungappeed segments in this map in aligned coordinates""" # we want to know the coordinates of the ungapped segments on # the aligned sequence. 
The gap_pos attribute is in sequence @@ -1627,7 +1632,7 @@ def nongap(self) -> Iterator[Union[Span, LostSpan]]: yield Span(prev_pos, len(self)) @property - def spans(self) -> Iterator[Union[Span, LostSpan]]: + def spans(self) -> Iterator[SpanTypes]: """generator of spans""" if not self.num_gaps: yield Span(0, self.parent_length) @@ -1909,8 +1914,10 @@ def __post_init__(self, spans): self._spans = tuple(spans) self.length = posn + T = Union[list[SpanTypes], tuple[SpanTypes]] + @classmethod - def from_spans(cls, spans, parent_length): + def from_spans(cls, spans: T, parent_length: int, **kwargs): return cls(spans=spans, parent_length=parent_length) def __len__(self): From 2f81c29d764282f644ef17cbf055146c548c2a5d Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 25 Apr 2024 17:30:00 +1000 Subject: [PATCH 26/62] MAINT: address some issues identified by codacy --- src/cogent3/core/location.py | 4 +++- tests/test_core/test_alignment.py | 1 - tests/test_core/test_location.py | 37 +++++++++++++++---------------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 7acbfeeb8..65c95608b 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1147,9 +1147,11 @@ def from_locations(cls, locations, parent_length, **kwargs): return cls.from_spans(spans=spans, parent_length=parent_length, **kwargs) + T = Union[list[SpanTypes], tuple[SpanTypes]] + @classmethod @abstractmethod - def from_spans(cls, spans, parent_length, **kwargs): + def from_spans(cls, spans: T, parent_length: int, **kwargs): ... 
diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index 06fe5fd72..2cb416735 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -1984,7 +1984,6 @@ def test_model_aln_to_model_aln(self): (("ACGGTAAAG", ((2, 4), (5, 8))), ("CCC---CCC", ((0, 3), (6, 9)))), ) def test_featuremap_slice_aligned(raw_seq, coords): - from cogent3.core.alignment import Aligned from cogent3.core.location import FeatureMap, Span im, seq = DNA.make_seq(raw_seq).parse_out_gaps() diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 45fdc2915..5b8213bce 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -540,7 +540,6 @@ def test_indelmap_merge_parent_length(): parent_length=4, ) # providing a value for parent_length overrides standard - im3 = im1.merge_maps(im2) ov = im1.merge_maps(im2, parent_length=20) assert ov.parent_length != im2.parent_length assert ov.parent_length == 20 @@ -600,7 +599,7 @@ def test_indelmap_to_feature_map(): def test_indelmap_nucleic_reversed(): # 01 2345 # --AC--GGGG-- - orig, s = DNA.make_seq("--AC--GGGG--").parse_out_gaps() + orig, _ = DNA.make_seq("--AC--GGGG--").parse_out_gaps() plus_coords = [(0, 2), (2, 6)] assert orig.get_coordinates() == plus_coords rev = orig.nucleic_reversed() @@ -654,7 +653,7 @@ def test_indelmap_slice_terminating(): raw = "-CB-A--" start, end = 4, 6 expect, _ = DNA.make_seq(raw[start:end]).parse_out_gaps() - imap, s = DNA.make_seq(raw).parse_out_gaps() + imap, _ = DNA.make_seq(raw).parse_out_gaps() got = imap[start:end] assert got.gap_pos.tolist() == expect.gap_pos.tolist() assert got.cum_gap_lengths.tolist() == expect.cum_gap_lengths.tolist() @@ -734,7 +733,7 @@ def test_gapped_convert_seq2aln(data, index): # converting a sequence index to alignment index ungapped = data.replace("-", "") seq = make_seq(data, moltype="text") - gaps, s = seq.parse_out_gaps() + gaps, _ = seq.parse_out_gaps() idx = 
gaps.get_align_index(index) assert data[idx] == ungapped[index] @@ -780,7 +779,7 @@ def test_indelmap_align_index_slice_stop(data, start, end): @pytest.mark.parametrize("index", range(6)) # the ungapped sequence is 6 long def test_gapped_convert_seq2aln2seq(data, index): # round tripping seq to alignment to seq indices - gaps, s = make_seq(data, moltype="text").parse_out_gaps() + gaps, _ = make_seq(data, moltype="text").parse_out_gaps() align_index = gaps.get_align_index(index) got = gaps.get_seq_index(align_index) assert got == index @@ -833,7 +832,7 @@ def test_gapped_convert_aln2seq_nongap_char(data, seq_index): # test alignment indexes when aligned position is NOT a gap ungapped = "ABCDEFG" align_index = data.find(ungapped[seq_index]) - gaps, seq = make_seq(data, moltype="text").parse_out_gaps() + gaps, _ = make_seq(data, moltype="text").parse_out_gaps() idx = gaps.get_seq_index(align_index) assert idx == seq_index @@ -882,7 +881,7 @@ def test_gapped_convert_aln2seq_gapchar(data, gap_number): def test_gapped_convert_aln2seq_invalid(): - gaps, seq = make_seq("AC--GTA-TG", moltype="dna").parse_out_gaps() + gaps, _ = make_seq("AC--GTA-TG", moltype="dna").parse_out_gaps() with pytest.raises(NotImplementedError): # absolute value of negative indices must be < seq length gaps.get_seq_index(-100) @@ -896,26 +895,26 @@ def test_gap_pos_invalid_slice(invalid_slice): pos, lengths = numpy.array([[1, 3]], dtype=numpy.int32).T gp = IndelMap(gap_pos=pos, gap_lengths=lengths, parent_length=20) with pytest.raises(NotImplementedError): - gp[invalid_slice] + _ = gp[invalid_slice] @pytest.mark.parametrize( - "slice", + "aslice", ( slice(3, 7), slice(20, None), ), ) -def test_no_gaps_in_slice(slice): +def test_no_gaps_in_slice(aslice): # aligned length is 25 seq_length = 20 gap_length = 5 pos, lengths = numpy.array([[10, gap_length]], dtype=numpy.int32).T gp = IndelMap(gap_pos=pos, gap_lengths=lengths, parent_length=seq_length) - got = gp[slice] + got = gp[aslice] assert not 
got.num_gaps - start = slice.start or 0 - stop = slice.stop or (seq_length + gap_length) + start = aslice.start or 0 + stop = aslice.stop or (seq_length + gap_length) assert len(got) == stop - start @@ -931,11 +930,11 @@ def test_all_gaps_in_slice(): # slicing GapPositions # sample seq 1 data = "AC--GTA-TG" - gp, seq = make_seq(data, moltype="dna").parse_out_gaps() + gp, _ = make_seq(data, moltype="dna").parse_out_gaps() sl = slice(1, 9) got = gp[sl] - expect_gaps, expect_seq = make_seq(data[sl], moltype="dna").parse_out_gaps() + expect_gaps, _ = make_seq(data[sl], moltype="dna").parse_out_gaps() assert got.get_gap_coordinates() == expect_gaps.get_gap_coordinates() assert got.parent_length == 5 @@ -953,14 +952,14 @@ def test_all_gaps_in_slice(): ), ) @pytest.mark.parametrize( - "slice", + "aslice", [slice(i, j) for i, j in combinations(range(10), 2)], ) -def test_variant_slices(data, slice): +def test_variant_slices(data, aslice): gaps, _ = make_seq(data, moltype="dna").parse_out_gaps() - got = gaps[slice] + got = gaps[aslice] - expect_gaps, expect_seq = make_seq(data[slice], moltype="dna").parse_out_gaps() + expect_gaps, expect_seq = make_seq(data[aslice], moltype="dna").parse_out_gaps() assert got.parent_length == len(expect_seq) assert (got.gap_pos == expect_gaps.gap_pos).all() assert (got.cum_gap_lengths == expect_gaps.cum_gap_lengths).all() From f5974d743950f467ed3a7fee680f979a818b6d34 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Fri, 26 Apr 2024 07:12:44 +1000 Subject: [PATCH 27/62] MAINT: addresses additional issues identified by codacy --- tests/test_core/test_location.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 5b8213bce..4d84c53c1 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -352,7 +352,7 @@ def test_gaps(): # 000000000011 # 012345678901 seq = DNA.make_seq("AC---G-TAA--") - m, s = seq.parse_out_gaps() 
+ m, _ = seq.parse_out_gaps() m = FeatureMap.from_spans(spans=tuple(m.spans), parent_length=m.parent_length) got = [(g.start, g.end) for g in tuple(m.gaps().spans)] assert got == [(2, 5), (6, 7), (10, 12)] @@ -409,7 +409,7 @@ def test_spans_gen_nogaps(): def test_round_trip_rich_dict(): seq = DNA.make_seq("AC---G-TAA--") - im, s = seq.parse_out_gaps() + im, _ = seq.parse_out_gaps() rd = im.to_rich_dict() got = IndelMap.from_rich_dict(rd) assert im is not got @@ -758,7 +758,7 @@ def test_indelmap_align_index_slice_stop(data, start, end): # converting a sequence index to alignment index ungapped = data.replace("-", "") seq = make_seq(data, moltype="text") - gaps, s = seq.parse_out_gaps() + gaps, _ = seq.parse_out_gaps() stop = gaps.get_align_index(end, slice_stop=True) assert data[stop - 1] == ungapped[end - 1] @@ -875,7 +875,7 @@ def test_gapped_convert_aln2seq_gapchar(data, gap_number): assert data[align_index] == "-", (data, gap_number) # find nearest non-gap seq_index = _get_expected_seqindex(data, align_index) - gaps, seq = make_seq(data, moltype="text").parse_out_gaps() + gaps, _ = make_seq(data, moltype="text").parse_out_gaps() idx = gaps.get_seq_index(align_index) assert idx == seq_index From 05b984274f4bc6662aabe1f4e992e0dc59fd8404 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Fri, 26 Apr 2024 07:31:53 +1000 Subject: [PATCH 28/62] MAINT: yet more codacy issues [CHANGED] apparently codacy does not report all violations, so if there's a pattern to them it's best to search throughout and address all rather than wasting compute time on GitHub --- tests/test_core/test_location.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 4d84c53c1..5a9eef690 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -267,7 +267,7 @@ class MapTests(TestCase): def test_get_gap_coords(self): """returns gap start and lengths""" - m, seq = 
DNA.make_seq("-AC--GT-TTA--").parse_out_gaps() + m, _ = DNA.make_seq("-AC--GT-TTA--").parse_out_gaps() got = m.get_gap_coordinates() self.assertEqual(dict(got), {0: 1, 2: 2, 4: 1, 7: 2}) @@ -278,7 +278,6 @@ def test_map_plus_position(): # +slice ** # plus seq AAACCCTGG - # orig = Aligned(*DNA.make_seq("AAACCCTGG", name="a").parse_out_gaps()) orig = FeatureMap.from_locations(locations=[(0, 9)], parent_length=9) assert orig.absolute_position(2) == 2 assert orig.absolute_position(6) == 6 @@ -551,7 +550,7 @@ def test_map_offsets(cls): # 1 # 01 3 678 1 seq = DNA.make_seq("-AC---G-TAA--") - m, s = seq.parse_out_gaps() + m, _ = seq.parse_out_gaps() got = m.offsets assert got == [0, 1, 3, 6, 7, 8, 11] @@ -643,7 +642,7 @@ def test_indel_map_get_gap_coords(): def test_indelmap_joined_segments(coords): raw = "--AC--GGGG--" expect, _ = DNA.make_seq("".join(raw[s:e] for s, e in coords)).parse_out_gaps() - imap, s = DNA.make_seq(raw).parse_out_gaps() + imap, _ = DNA.make_seq(raw).parse_out_gaps() got = imap.joined_segments(coords) assert got.gap_pos.tolist() == expect.gap_pos.tolist() assert got.cum_gap_lengths.tolist() == expect.cum_gap_lengths.tolist() From 822e408e19b43471780e50f86bb36b20bab493ef Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Fri, 26 Apr 2024 08:46:04 +1000 Subject: [PATCH 29/62] DEV: added changelog comments about Map refactor --- changelog.d/20240426_082600_Gavin.Huttley.md | 55 ++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 changelog.d/20240426_082600_Gavin.Huttley.md diff --git a/changelog.d/20240426_082600_Gavin.Huttley.md b/changelog.d/20240426_082600_Gavin.Huttley.md new file mode 100644 index 000000000..4eb438ef7 --- /dev/null +++ b/changelog.d/20240426_082600_Gavin.Huttley.md @@ -0,0 +1,55 @@ + + + + +### ENH + +- The new IndelMap class uses numpy arrays to store information about gap + locations for aligned sequences. This will greatly reduce the memory + overhead for aligned sequences. 
The class also provides explicit methods + for inter-converting between sequence and alignment coordinates. An important + difference to the original Map implementation is that IndelMap is memoryless, + meaning the history of slice operations is now fully delegated to the SeqView + class. +- The new FeatureMap class is a slightly modified version of the original Map + class. It is used solely for handling sequence feature mappings to sequences + and alignments. Like IndelMap, it is memoryless but it does record its + orientation with respect to the parent sequence. +- Note that both IndelMap and FeatureMap replace the spans attribute with a + generator. + + + + + +### Discontinued + +- The cogent3.core.location.Map class is now marked for deprecation. It is + being replaced by two classes, IndelMap and FeatureMap. The latter has + largely the same functionality of Map. + + From 3b5a44cfba2a17ce4ebc0621c6f8d2afbd152da7 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 14:51:33 +1000 Subject: [PATCH 30/62] MAINT: removed location.py module docstring as out-of-date --- src/cogent3/core/location.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 65c95608b..306963767 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1,24 +1,3 @@ -""" -Maps have a list of Spans. - -Span is a region with a start, an end, and a direction. - -Notes ------ - -Spans behave much like Python's slices: a Span contains the element -after its start but does not contain the element after its end. It may help to -think of the Span indices occurring _between_ the list elements: - - a b c d e - | | | | | | - 0 1 2 3 4 5 - -...so that a Span whose start is its end contains no elements (e.g. 2:2), and -a Span whose end is 2 more than its start contains 2 elements (e.g. 2:4 has c -and d), etc. 
Similarly, Span(0,2) does _not_ overlap Span(2,3), since the -former contains a and b while the latter contains c. -""" import copy import dataclasses import functools From 3deb20892afdbdbf4e1cedfceeeeea158ccb2f33 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 14:52:07 +1000 Subject: [PATCH 31/62] MAINT: declare type hints at module level for reuse --- src/cogent3/core/location.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 306963767..1a6d4c93f 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -28,6 +28,8 @@ strip = str.strip +_DEFAULT_GAP_DTYPE = numpy.int32 + def _norm_index(i, length, default): """For converting s[:3] to s[0:3], s[-1] to s[len(s)-1] and s[0:lots] to s[0:len(s)]""" @@ -501,7 +503,11 @@ def __repr__(self): return f"?{self.length}?" +IntTypes = Union[int, numpy.int32, numpy.int64] +IntArrayTypes = NDArray[int] SpanTypes = Union[Span, _LostSpan] +SeqSpanTypes = Sequence[SpanTypes] +SeqCoordTypes = Sequence[Sequence[IntTypes]] class Map: # pragma: no cover From 0dae775118d79aa1257792259e399bcb0f396e83 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 14:52:33 +1000 Subject: [PATCH 32/62] DEP: add discontinued warning to Map class --- src/cogent3/core/location.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 1a6d4c93f..d68fe95bb 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -513,6 +513,11 @@ def __repr__(self): class Map: # pragma: no cover """A map holds a list of spans.""" + @c3warn.deprecated_callable( + version="2024.6", + reason="Replaced by IndelMap and FeatureMap", + is_discontinued=True, + ) def __init__( self, locations=None, From b09ee2be2095cae32359a78afb6b67fc435add1a Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 14:57:55 +1000 Subject: [PATCH 33/62] MAINT: improve 
type hinting in location --- src/cogent3/core/location.py | 175 +++++++++++++++++------------------ 1 file changed, 87 insertions(+), 88 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index d68fe95bb..53c30bf3a 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -8,7 +8,7 @@ from bisect import bisect_left, bisect_right from functools import total_ordering from itertools import chain -from typing import Iterator, List, Optional, Sequence, Tuple, Union +from typing import Any, Iterator, Optional, Sequence, Union import numpy @@ -1105,7 +1105,7 @@ def __new__(cls, *args, **kwargs): return obj @abstractmethod - def __len__(self): + def __len__(self) -> int: ... @abstractmethod @@ -1113,11 +1113,11 @@ def __add__(self, other): ... @abstractmethod - def nongap(self): + def nongap(self) -> Iterator[SpanTypes]: ... @abstractmethod - def get_coordinates(self): + def get_coordinates(self) -> SeqCoordTypes: ... @abstractmethod @@ -1125,11 +1125,11 @@ def nucleic_reversed(self): ... @abstractmethod - def to_rich_dict(self): + def to_rich_dict(self) -> dict[str, Any]: ... @classmethod - def from_locations(cls, locations, parent_length, **kwargs): + def from_locations(cls, locations: SeqCoordTypes, parent_length: int, **kwargs): if len(locations): spans = _spans_from_locations(locations, parent_length=parent_length) else: @@ -1137,15 +1137,13 @@ def from_locations(cls, locations, parent_length, **kwargs): return cls.from_spans(spans=spans, parent_length=parent_length, **kwargs) - T = Union[list[SpanTypes], tuple[SpanTypes]] - @classmethod @abstractmethod - def from_spans(cls, spans: T, parent_length: int, **kwargs): + def from_spans(cls, spans: SeqSpanTypes, parent_length: int, **kwargs): ... 
-def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSpan]]: +def _spans_from_locations(locations: SeqCoordTypes, parent_length: int) -> SeqSpanTypes: if not len(locations): # using len() because locations can be a numpy array return () @@ -1192,20 +1190,17 @@ def _spans_from_locations(locations, parent_length) -> Tuple[Union[Span, _LostSp return tuple(spans) -O = tuple[numpy.ndarray, Sequence] - - def spans_to_gap_coords( - indel_spans: list[SpanTypes], dtype: Union[type, str] = numpy.int32 -) -> tuple[numpy.ndarray, numpy.ndarray]: + indel_spans: SeqSpanTypes, dtype: IntTypes = _DEFAULT_GAP_DTYPE +) -> tuple[IntArrayTypes, IntArrayTypes]: """returns coordinates of sequence gaps Parameters ---------- - indel_map - old style Map object + indel_spans + sequence of Span/LostSpan instances dtype - string or numpy type, default to 32-bit integer + numpy type, default to _DEFAULT_GAP_DTYPE Returns ------- @@ -1227,8 +1222,8 @@ def spans_to_gap_coords( def _gap_spans( - gap_pos: NDArray[int], cum_gap_lengths: NDArray[int] -) -> tuple[NDArray[int], NDArray[int]]: + gap_pos: IntArrayTypes, cum_gap_lengths: IntArrayTypes +) -> tuple[IntArrayTypes, IntArrayTypes]: """returns 1D arrays in alignment coordinates of gap start, gap stop""" if not len(gap_pos): @@ -1242,7 +1237,12 @@ def _gap_spans( return starts, ends -def _update_lengths(result_pos, result_lengths, gap_pos, gap_lengths): +def _update_lengths( + result_pos: IntArrayTypes, + result_lengths: IntArrayTypes, + gap_pos: IntArrayTypes, + gap_lengths: IntArrayTypes, +) -> None: """modifies result_lengths in place with gap_lengths where elements in gap_pos occur in result_pos @@ -1256,9 +1256,6 @@ def _update_lengths(result_pos, result_lengths, gap_pos, gap_lengths): result_lengths[result_indices] += gap_lengths[other_indices] -T = Union[List[int], Tuple[int]] - - @dataclasses.dataclass class IndelMap(MapABC): """store locations of deletions in a Aligned sequence""" @@ -1294,10 +1291,10 @@ def 
__post_init__(self, gap_lengths): self.cum_gap_lengths.flags.writeable = False self._serialisable.pop("gap_lengths", None) - T = Union[list[SpanTypes], tuple[SpanTypes]] - @classmethod - def from_spans(cls, spans: T, parent_length: int, termini_unknown: bool = False): + def from_spans( + cls, spans: SeqSpanTypes, parent_length: int, termini_unknown: bool = False + ) -> "IndelMap": gap_pos, cum_lengths = spans_to_gap_coords(spans) return cls( gap_pos=gap_pos, @@ -1308,18 +1305,19 @@ def from_spans(cls, spans: T, parent_length: int, termini_unknown: bool = False) @classmethod def from_aligned_segments( - cls, locations: list[tuple[int, int]], aligned_length: int - ): + cls, locations: SeqCoordTypes, aligned_length: int + ) -> "IndelMap": """ converts coordinates from aligned segments into IndelMap for ungapped sequence Parameters ---------- locations - list of ungapped segment in alignment coordinates + sequence of ungapped segment in alignment coordinates aligned_length length of the alignment """ + locations = list(locations) if not locations or ( len(locations) == 1 and locations[0][0] == 0 @@ -1355,6 +1353,7 @@ def from_aligned_segments( parent_length=seq_length, ) + # NOTE: cannot use string type hints with singledispatchmethod @functools.singledispatchmethod def __getitem__(self, item): raise NotImplementedError(f"cannot slice using {type(item)}") @@ -1548,11 +1547,11 @@ def get_seq_index(self, align_index: int) -> int: # so the gap insertion position is the sequence position return int(self.gap_pos[index]) - def __len__(self): + def __len__(self) -> int: length_gaps = self.cum_gap_lengths[-1] if self.num_gaps else 0 return int(self.parent_length + length_gaps) - def __add__(self, other): + def __add__(self, other) -> "IndelMap": # what was the purpose of this method? 
The code seems designed for # combining Maps from the same parent sequence, which is a union rather # than addition @@ -1574,7 +1573,7 @@ def __add__(self, other): parent_length=self.parent_length + other.parent_length, ) - def __mul__(self, scale): + def __mul__(self, scale: int) -> "IndelMap": # could be used for going from amino-acid alignment to codon alignment gap_pos = self.gap_pos * scale cum_gap_lengths = self.cum_gap_lengths * scale @@ -1584,17 +1583,17 @@ def __mul__(self, scale): parent_length=self.parent_length * scale, ) - def __repr__(self): + def __repr__(self) -> str: gap_data = numpy.array([self.gap_pos, self.cum_gap_lengths]).T return f"{gap_data.tolist()!r}/{self.parent_length}" - def get_gap_lengths(self) -> NDArray[int]: + def get_gap_lengths(self) -> IntArrayTypes: lengths = self.cum_gap_lengths.copy() lengths[1:] = numpy.diff(lengths) return lengths @property - def offsets(self): + def offsets(self) -> IntArrayTypes: # offsets are the aligned indices for every starting point of a segment # when we encounter a gap, we include that position and the end of that gap @@ -1655,15 +1654,15 @@ def spans(self) -> Iterator[SpanTypes]: yield Span(self.gap_pos[-1], self.parent_length) @property - def complete(self): + def complete(self) -> bool: """whether any span represents a gap""" return self.num_gaps != 0 and self.useful @property - def useful(self): + def useful(self) -> bool: return self.parent_length != 0 - def get_coordinates(self): + def get_coordinates(self) -> SeqCoordTypes: """returns sequence coordinates of ungapped segments Returns @@ -1687,14 +1686,14 @@ def get_coordinates(self): coords.append((last, self.parent_length)) return coords - def get_gap_coordinates(self): + def get_gap_coordinates(self) -> SeqCoordTypes: """returns [(gap pos, gap length), ...]""" cum_lengths = self.cum_gap_lengths.copy() diffs = numpy.diff(cum_lengths) cum_lengths[1:] = diffs return numpy.array([self.gap_pos, cum_lengths]).T.tolist() - def 
get_gap_align_coordinates(self) -> NDArray[int]: + def get_gap_align_coordinates(self) -> SeqCoordTypes: """returns [(gap start, gap end), ...] in alignment indices Returns @@ -1708,7 +1707,7 @@ def get_gap_align_coordinates(self) -> NDArray[int]: result = result.reshape((0, 2)) return result - def merge_maps(self, other, parent_length: Optional[int] = None): + def merge_maps(self, other, parent_length: Optional[int] = None) -> "IndelMap": """merge gaps of other with self Parameters @@ -1731,7 +1730,7 @@ def merge_maps(self, other, parent_length: Optional[int] = None): parent_length=parent_length, ) - def joined_segments(self, coords: list[tuple[int, int]]): + def joined_segments(self, coords: SeqCoordTypes) -> "IndelMap": """returns new map with disjoint gapped segments joined Parameters @@ -1770,7 +1769,7 @@ def joined_segments(self, coords: list[tuple[int, int]]): parent_length=cum_parent_length, ) - def nucleic_reversed(self): + def nucleic_reversed(self) -> "IndelMap": """map for a sequence that has itself been reversed and complemented Notes @@ -1791,7 +1790,7 @@ def nucleic_reversed(self): parent_length=self.parent_length, ) - def to_rich_dict(self): + def to_rich_dict(self) -> dict[str, Any]: """returns dicts for contained spans [dict(), ..]""" # exclude spans from deep copy since being overwritten data = copy.deepcopy(dict(self._serialisable.items())) @@ -1802,11 +1801,11 @@ def to_rich_dict(self): data["parent_length"] = int(self.parent_length) return data - def to_json(self): + def to_json(self) -> str: return json.dumps(self.to_rich_dict()) @classmethod - def from_rich_dict(cls, map_element): + def from_rich_dict(cls, map_element) -> "IndelMap": from cogent3.util.deserialise import _get_class map_element.pop("version", None) @@ -1817,7 +1816,7 @@ def from_rich_dict(cls, map_element): return cls(**map_element) - def with_termini_unknown(self): + def with_termini_unknown(self) -> "IndelMap": """returns new instance with terminal gaps indicated as 
unknown""" return self.__class__( gap_pos=self.gap_pos.copy(), @@ -1826,11 +1825,13 @@ def with_termini_unknown(self): termini_unknown=True, ) - def to_feature_map(self): + def to_feature_map(self) -> "FeatureMap": """returns a Map type, suited to Features""" return FeatureMap(spans=list(self.spans), parent_length=self.parent_length) - def make_seq_feature_map(self, align_feature_map, include_gaps=True): + def make_seq_feature_map( + self, align_feature_map: "FeatureMap", include_gaps: bool = True + ) -> "FeatureMap": """converts align_feature_map to a FeatureMap with sequence coordinates Parameters @@ -1865,17 +1866,15 @@ def make_seq_feature_map(self, align_feature_map, include_gaps=True): class FeatureMap(MapABC): """A map holds a list of spans.""" - spans: dataclasses.InitVar[Optional[tuple]] = () + spans: dataclasses.InitVar[Optional[SeqSpanTypes]] = () parent_length: int = 0 offsets: list[int] = dataclasses.field(init=False, repr=False) useful: bool = dataclasses.field(init=False, repr=False, default=False) complete: bool = dataclasses.field(init=False, repr=False, default=True) _serialisable: dict = dataclasses.field(init=False, repr=False) - _spans: Tuple[Union[Span, _LostSpan, TerminalPadding]] = dataclasses.field( - default=(), init=False - ) + _spans: SeqSpanTypes = dataclasses.field(default=(), init=False) - def __post_init__(self, spans): + def __post_init__(self, spans: SeqSpanTypes): assert self.parent_length is not None self.parent_length = int(self.parent_length) if isinstance(spans, property): @@ -1906,19 +1905,19 @@ def __post_init__(self, spans): self._spans = tuple(spans) self.length = posn - T = Union[list[SpanTypes], tuple[SpanTypes]] - @classmethod - def from_spans(cls, spans: T, parent_length: int, **kwargs): + def from_spans( + cls, spans: SeqSpanTypes, parent_length: int, **kwargs + ) -> "FeatureMap": return cls(spans=spans, parent_length=parent_length) - def __len__(self): + def __len__(self) -> int: return self.length - def 
__repr__(self): + def __repr__(self) -> str: return f"{list(self.spans)!r}/{self.parent_length}" - def __getitem__(self, new_map): + def __getitem__(self, new_map) -> "FeatureMap": # A possible shorter map at the same level new_map = as_map(new_map, len(self), self.__class__) new_parts = [] @@ -1926,17 +1925,17 @@ def __getitem__(self, new_map): new_parts.extend(span.remap_with(self)) return self.__class__(spans=new_parts, parent_length=self.parent_length) - def __mul__(self, scale): + def __mul__(self, scale) -> "FeatureMap": new_parts = [span * scale for span in self.spans] return self.__class__(spans=new_parts, parent_length=self.parent_length * scale) - def __div__(self, scale): + def __div__(self, scale) -> "FeatureMap": new_parts = [span / scale for span in self.spans] return self.__class__( spans=new_parts, parent_length=self.parent_length // scale ) - def __add__(self, other): + def __add__(self, other) -> "FeatureMap": if other.parent_length != self.parent_length: raise ValueError("Those maps belong to different sequences") return self.__class__( @@ -1944,16 +1943,16 @@ def __add__(self, other): ) @property - def spans(self): + def spans(self) -> Iterator[SeqSpanTypes]: yield from self._spans - def get_covering_span(self): + def get_covering_span(self) -> "FeatureMap": span = (self.start, self.end) return self.__class__.from_locations( locations=[span], parent_length=self.parent_length ) - def covered(self): + def covered(self) -> "FeatureMap": """>>> Map([(10,20), (15, 25), (80, 90)]).covered().spans [Span(10,25), Span(80, 90)]""" @@ -1984,7 +1983,7 @@ def covered(self): locations=result, parent_length=self.parent_length ) - def nucleic_reversed(self): + def nucleic_reversed(self) -> "FeatureMap": """map for a sequence that has itself been reversed and complemented Notes @@ -2004,7 +2003,7 @@ def nucleic_reversed(self): spans.reverse() return self.__class__(spans=spans, parent_length=self.parent_length) - def get_gap_coordinates(self): + def 
get_gap_coordinates(self) -> SeqCoordTypes: """returns [(gap pos, gap length), ...]""" gap_pos = [] spans = list(self.spans) @@ -2017,7 +2016,7 @@ def get_gap_coordinates(self): return gap_pos - def gaps(self): + def gaps(self) -> "FeatureMap": """The gaps (lost spans) in this map""" locations = [] offset = 0 @@ -2029,11 +2028,11 @@ def gaps(self): locations=locations, parent_length=len(self) ) - def shadow(self): + def shadow(self) -> "FeatureMap": """The 'negative' map of the spans not included in this map""" return self.inverse().gaps() - def nongap(self): + def nongap(self) -> SeqSpanTypes: locations = [] offset = 0 for s in self.spans: @@ -2042,13 +2041,13 @@ def nongap(self): offset += s.length return _spans_from_locations(locations=locations, parent_length=len(self)) - def without_gaps(self): + def without_gaps(self) -> "FeatureMap": return self.__class__( spans=[s for s in self.spans if not s.lost], parent_length=self.parent_length, ) - def inverse(self): + def inverse(self) -> "FeatureMap": """returns instance with coordinates updated for aligned, unaligned""" # is this only required for parse_out_gaps? # NO also used in cogent3.align code @@ -2097,7 +2096,7 @@ def inverse(self): return self.__class__(spans=new_spans, parent_length=len(self)) - def get_coordinates(self): + def get_coordinates(self) -> SeqCoordTypes: """returns span coordinates as [(v1, v2), ...] 
v1/v2 are (start, end) unless the map is reversed, in which case it will @@ -2105,7 +2104,7 @@ def get_coordinates(self): return [(s.start, s.end) for s in self.spans if not s.lost] - def to_rich_dict(self): + def to_rich_dict(self) -> dict[str, Any]: """returns dicts for contained spans [dict(), ..]""" spans = [s.to_rich_dict() for s in self.spans] data = copy.deepcopy(self._serialisable) @@ -2116,11 +2115,11 @@ def to_rich_dict(self): data["parent_length"] = int(self.parent_length) return data - def to_json(self): + def to_json(self) -> str: return json.dumps(self.to_rich_dict()) @classmethod - def from_rich_dict(cls, map_element): + def from_rich_dict(cls, map_element) -> "FeatureMap": from cogent3.util.deserialise import _get_class map_element.pop("version", None) @@ -2136,7 +2135,7 @@ def from_rich_dict(cls, map_element): map_element["spans"] = spans return cls(**map_element) - def zeroed(self): + def zeroed(self) -> "FeatureMap": """returns a new instance with the first span starting at 0 Note @@ -2167,9 +2166,7 @@ def zeroed(self): return zeroed - T = Union[numpy.ndarray, int] - - def absolute_position(self, rel_pos: T) -> T: + def absolute_position(self, rel_pos: IntTypes) -> IntTypes: """converts rel_pos into an absolute position Raises @@ -2188,7 +2185,7 @@ def absolute_position(self, rel_pos: T) -> T: return self.start + rel_pos - def relative_position(self, abs_pos: T) -> T: + def relative_position(self, abs_pos: IntTypes) -> IntTypes: """converts abs_pos into an relative position Raises @@ -2203,7 +2200,9 @@ def relative_position(self, abs_pos: T) -> T: return abs_pos - self.start -def gap_coords_to_map(gaps_lengths: dict, seq_length: int) -> IndelMap: +def gap_coords_to_map( + gaps_lengths: dict[IntTypes, IntTypes], seq_length: int +) -> IndelMap: """ Parameters ---------- @@ -2239,7 +2238,7 @@ def deserialise_featuremap(data: dict) -> FeatureMap: @functools.singledispatch -def _norm_slice(index, length): +def _norm_slice(index, length: int) -> 
tuple[int, int, Union[int, None]]: """_norm_slice(slice(1, -2, 3), 10) -> (1,8,3)""" start = index if start < 0: @@ -2250,21 +2249,21 @@ def _norm_slice(index, length): @_norm_slice.register -def _(index: slice, length): +def _(index: slice, length: int) -> tuple[int, int, Union[int, None]]: start = _norm_index(index.start, length, 0) end = _norm_index(index.stop, length, length) return start, end, index.step @_norm_slice.register -def _(index: Span, length): +def _(index: Span, length) -> tuple[int, int, Union[int, None]]: start = _norm_index(index.start, length, 0) end = _norm_index(index.end, length, length) return start, end, None @_norm_slice.register -def _(index: FeatureMap, length): +def _(index: FeatureMap, length) -> tuple[int, int, Union[int, None]]: start = _norm_index(index.start, length, 0) end = _norm_index(index.end, length, length) return start, end, None From f9b2f0865705a3b5dfdfeb7bb5d03b64c3c547f5 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 15:29:23 +1000 Subject: [PATCH 34/62] MAINT: simplify logic in _spans_from_locations [CHANGED] constrain all coordinates to be positive and satisfy start < end --- src/cogent3/core/location.py | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 53c30bf3a..7f233233e 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1153,32 +1153,14 @@ def _spans_from_locations(locations: SeqCoordTypes, parent_length: int) -> SeqSp spans = [] for start, end in locations: - if start > end: - raise ValueError("locations must be ordered smallest-> largest") - if max(start, end) < 0 or min(start, end) > parent_length: + if start > end or min(start, end) < 0: + raise ValueError("locations must be ordered smallest-> largest and >= 0") + if start > parent_length: raise RuntimeError( f"located outside sequence: {(start, end, parent_length)}" ) - if max(start, end) > 
parent_length and min(start, end) < 0: - l_diff = min(start, end) - r_diff = max(start, end) - parent_length - start, end = (0, parent_length) if start < end else (parent_length, 0) - spans += [ - LostSpan(abs(l_diff)), - Span(start, end), - LostSpan(abs(r_diff)), - ] - elif min(start, end) < 0: - diff = min(start, end) - start = max(start, 0) - end = max(end, 0) - spans += [ - LostSpan(abs(diff)), - Span(start, end), - ] - elif max(start, end) > parent_length: - diff = max(start, end) - parent_length - start = min(start, parent_length) + if end > parent_length: + diff = end - parent_length end = min(end, parent_length) spans += [ Span(start, end), From d7cd778e5766fa06354c3b165fe5ac29beb7b2d7 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 15:29:43 +1000 Subject: [PATCH 35/62] MAINT: more type hint fixes --- src/cogent3/core/location.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 7f233233e..45b84740c 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1243,15 +1243,15 @@ class IndelMap(MapABC): """store locations of deletions in a Aligned sequence""" # gap data is gap positions, gap lengths on input, stored - gap_pos: numpy.ndarray - cum_gap_lengths: Optional[numpy.ndarray] = None - gap_lengths: dataclasses.InitVar[Optional[numpy.ndarray]] = None + gap_pos: IntArrayTypes + cum_gap_lengths: Optional[IntArrayTypes] = None + gap_lengths: dataclasses.InitVar[Optional[IntArrayTypes]] = None termini_unknown: bool = False parent_length: int = 0 _serialisable: dict = dataclasses.field(init=False, repr=False) num_gaps: int = dataclasses.field(init=False, repr=False, default=0) - def __post_init__(self, gap_lengths): + def __post_init__(self, gap_lengths: IntArrayTypes): assert gap_lengths is None or self.cum_gap_lengths is None if gap_lengths is not None: self.cum_gap_lengths = gap_lengths.cumsum() From 
9e84f0474773274babfbe3fd923b24a703707962 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 15:30:48 +1000 Subject: [PATCH 36/62] MAINT: raise IndexError if negative indices beyond parent length --- src/cogent3/core/location.py | 4 +-- tests/test_core/test_location.py | 48 +++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 45b84740c..063e60ec5 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1466,7 +1466,7 @@ def get_align_index(self, seq_index: int, slice_stop: bool = False) -> int: seq_index += self.parent_length if seq_index < 0: - raise NotImplementedError(f"{seq_index} negative seq_index beyond limit ") + raise IndexError(f"{seq_index} negative seq_index beyond limit ") if not self.num_gaps or seq_index < self.gap_pos[0]: return int(seq_index) @@ -1504,7 +1504,7 @@ def get_seq_index(self, align_index: int) -> int: if align_index < 0: align_index = len(self) + align_index if align_index < 0: - raise NotImplementedError(f"{align_index} align_index beyond limit") + raise IndexError(f"{align_index} align_index beyond limit") if not self.num_gaps or align_index < self.gap_pos[0]: return align_index diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 5a9eef690..a4fd8e904 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -494,6 +494,18 @@ def test_indelmap_from_aligned_segments(): assert im_spans[1].lost and len(im_spans[1]) == 2 +@pytest.mark.parametrize("swap", (False, True)) +def test_indelmap_inconsistent_input(swap): + a = numpy.array([0, 1], dtype=int) + b = numpy.array([3], dtype=int) + a, b = (b, a) if swap else (a, b) + with pytest.raises(ValueError): + IndelMap(gap_pos=a, gap_lengths=b, parent_length=10) + + with pytest.raises(ValueError): + IndelMap(gap_pos=a, cum_gap_lengths=b, parent_length=10) + + def 
test_indelmap_from_aligned_segments2(): locations = [(0, 5), (7, 12), (14, 21), (24, 27)] im = IndelMap.from_aligned_segments(locations=locations, aligned_length=27) @@ -658,6 +670,32 @@ def test_indelmap_slice_terminating(): assert got.cum_gap_lengths.tolist() == expect.cum_gap_lengths.tolist() +def test_indelmap_invalid_slice_range(): + imap = IndelMap( + gap_pos=numpy.array([10], dtype=int), + gap_lengths=numpy.array([2], dtype=int), + parent_length=10, + ) + with pytest.raises(IndexError): + imap[-100] + + with pytest.raises(IndexError): + imap[-100:] + + with pytest.raises(IndexError): + imap[:-99] + + +def test_indelmap_get_indices_errors(): + imap = IndelMap( + gap_pos=numpy.array([10], dtype=int), + gap_lengths=numpy.array([2], dtype=int), + parent_length=10, + ) + with pytest.raises(IndexError): + imap.get_align_index(-1000) + + def test_indelmap_slice_innner_gap(): start, end = 4, 6 raw = "--AC--GGGG--" @@ -881,7 +919,7 @@ def test_gapped_convert_aln2seq_gapchar(data, gap_number): def test_gapped_convert_aln2seq_invalid(): gaps, _ = make_seq("AC--GTA-TG", moltype="dna").parse_out_gaps() - with pytest.raises(NotImplementedError): + with pytest.raises(IndexError): # absolute value of negative indices must be < seq length gaps.get_seq_index(-100) @@ -1011,6 +1049,14 @@ def test_featuremap_roundtrip_json(): assert got.parent_length == fmap.parent_length == 140 +@pytest.mark.parametrize( + "error_type,locations", ((ValueError, ((-2, 2),)), (RuntimeError, ((20, 25),))) +) +def test_invalid_locations(error_type, locations): + with pytest.raises(error_type): + FeatureMap.from_locations(locations=locations, parent_length=10) + + def test_gap_coords_to_map(): """construct a Map from coordinates of gap alone""" gap_coords = {0: 1, 2: 2, 4: 1, 7: 2} From 9292f4e172b074ef36bbbd009be0bfe7bd4be514 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 15:50:00 +1000 Subject: [PATCH 37/62] MAINT: fix FeatureMap add, include a test --- 
src/cogent3/core/location.py | 2 +- tests/test_core/test_location.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 063e60ec5..e4abac2e8 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1921,7 +1921,7 @@ def __add__(self, other) -> "FeatureMap": if other.parent_length != self.parent_length: raise ValueError("Those maps belong to different sequences") return self.__class__( - spans=self.spans + other.spans, parent_length=self.parent_length + spans=list(self.spans) + list(other.spans), parent_length=self.parent_length ) @property diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index a4fd8e904..39f012152 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -1119,3 +1119,14 @@ def test_get_gap_align_coordinates_edge_cases(seq, expected): im, _ = make_seq(seq).parse_out_gaps() result = im.get_gap_align_coordinates() assert (result == expected).all(), f"{expected=}, {result=}" + + +def test_featuremap_add(): + spans_a = [LostSpan(2), Span(2, 4)] + spans_b = [LostSpan(2), Span(4, 8), LostSpan(2)] + kwargs = dict(parent_length=6) + + fm_a = FeatureMap(spans=spans_a, **kwargs) + fm_b = FeatureMap(spans=spans_b, **kwargs) + fm_ab = fm_a + fm_b + assert list(fm_ab.spans) == (spans_a + spans_b) From 9173ff1174b45d4527c9eff921b118605b2a8a03 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 15:50:28 +1000 Subject: [PATCH 38/62] TST: test FeatureMap multiply --- tests/test_core/test_location.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 39f012152..2c9d7d062 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -1130,3 +1130,12 @@ def test_featuremap_add(): fm_b = FeatureMap(spans=spans_b, **kwargs) fm_ab = fm_a + fm_b assert list(fm_ab.spans) == 
(spans_a + spans_b) + + +def test_featuremap_mul(): + spans = [LostSpan(2), Span(2, 4)] + fm = FeatureMap(spans=spans, parent_length=6) + fm_3 = fm * 3 + assert list(fm_3.spans) == [sp * 3 for sp in spans] + assert fm_3.parent_length == 6 * 3 + From 5746f71d21794ecb5414b6dc0a831744b718946b Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Tue, 30 Apr 2024 15:52:20 +1000 Subject: [PATCH 39/62] MAINT: FeatureMap and Spans now support division [FIXED] __div__() hasn't been valid for division since Python 3! Replaced by __trudediv__ on FeatureMap and Span classes --- src/cogent3/core/location.py | 6 +++--- tests/test_core/test_location.py | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index e4abac2e8..808af9fe3 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -281,7 +281,7 @@ def __mul__(self, scale): self.reverse, ) - def __div__(self, scale): + def __truediv__(self, scale: int): assert not self.start % scale or self.end % scale return Span( self.start // scale, @@ -471,7 +471,7 @@ def __getitem__(self, slice): def __mul__(self, scale): return LostSpan(self.length * scale, self.value) - def __div__(self, scale): + def __truediv__(self, scale): assert not self.length % 3 return LostSpan(self.length // scale, self.value) @@ -1911,7 +1911,7 @@ def __mul__(self, scale) -> "FeatureMap": new_parts = [span * scale for span in self.spans] return self.__class__(spans=new_parts, parent_length=self.parent_length * scale) - def __div__(self, scale) -> "FeatureMap": + def __truediv__(self, scale: int) -> "FeatureMap": new_parts = [span / scale for span in self.spans] return self.__class__( spans=new_parts, parent_length=self.parent_length // scale diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 2c9d7d062..493e52451 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -1139,3 +1139,10 @@ def 
test_featuremap_mul(): assert list(fm_3.spans) == [sp * 3 for sp in spans] assert fm_3.parent_length == 6 * 3 + +def test_featuremap_div(): + spans = [LostSpan(3), Span(3, 6)] + fm_3 = FeatureMap(spans=spans, parent_length=6) + fm_1 = fm_3 / 3 + assert list(fm_1.spans) == [sp / 3 for sp in spans] + assert fm_1.parent_length == 6 / 3 From 226868bfdea6052bd166677a4fcfc849ebfdb288 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 07:42:10 +1000 Subject: [PATCH 40/62] MAINT: type hints to gap_traceback() and map_traceback() [CHANGED] addresses review comment --- src/cogent3/align/traceback.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/cogent3/align/traceback.py b/src/cogent3/align/traceback.py index 4a0ca3dba..44142eeec 100644 --- a/src/cogent3/align/traceback.py +++ b/src/cogent3/align/traceback.py @@ -1,10 +1,16 @@ """Conversion of dynamic program results ("arrays of arrows") into gap vectors, gapped sequences or Cogent Alignment objects""" +import typing from cogent3.core.alignment import Aligned, Alignment from cogent3.core.location import IndelMap +IntOrNone = typing.Union[int, None] +IntListType = list[int] +CoordsListType = list[list[typing.Sequence[int]]] + + def seq_traceback(s1, s2, aligned_positions, gap_value): """gapped sequences from state matrix and ending point gaps are signified by 'gap_value' inserted in the sequences. 
@@ -29,7 +35,9 @@ def seq_traceback(s1, s2, aligned_positions, gap_value): return alignments -def gap_traceback(aligned_positions): +def gap_traceback( + aligned_positions: list[list[IntOrNone, IntOrNone]] +) -> tuple[IntListType, IntListType, CoordsListType, int]: """gap Vectors from state matrix and ending point""" consuming = [False, False] starts = [None, None] @@ -54,7 +62,9 @@ def gap_traceback(aligned_positions): return starts, ends, gap_vectors, a -def map_traceback(aligned_positions): +def map_traceback( + aligned_positions: list[list[IntOrNone, IntOrNone]] +) -> tuple[IntListType, IntListType, list[IndelMap]]: # using IndelMap's to keep track of gaps for indel alignment starts, ends, gap_vectors, alignment_len = gap_traceback(aligned_positions) maps = [ From 86d205e3e87da86158fac4414bd59b64757364c2 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 07:55:59 +1000 Subject: [PATCH 41/62] MAINT: more type hints on IndelMap and MapABC --- src/cogent3/core/location.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 808af9fe3..608c8483c 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1109,7 +1109,7 @@ def __len__(self) -> int: ... @abstractmethod - def __add__(self, other): + def __add__(self, other: "MapABC") -> "MapABC": ... @abstractmethod @@ -1121,7 +1121,7 @@ def get_coordinates(self) -> SeqCoordTypes: ... @abstractmethod - def nucleic_reversed(self): + def nucleic_reversed(self) -> "MapABC": ... @abstractmethod @@ -1129,7 +1129,9 @@ def to_rich_dict(self) -> dict[str, Any]: ... 
@classmethod - def from_locations(cls, locations: SeqCoordTypes, parent_length: int, **kwargs): + def from_locations( + cls, locations: SeqCoordTypes, parent_length: int, **kwargs + ) -> "MapABC": if len(locations): spans = _spans_from_locations(locations, parent_length=parent_length) else: @@ -1139,7 +1141,7 @@ def from_locations(cls, locations: SeqCoordTypes, parent_length: int, **kwargs): @classmethod @abstractmethod - def from_spans(cls, spans: SeqSpanTypes, parent_length: int, **kwargs): + def from_spans(cls, spans: SeqSpanTypes, parent_length: int, **kwargs) -> "MapABC": ... From ca14c275eadad1030b684fc96ec3a0a77e79c283 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:11:42 +1000 Subject: [PATCH 42/62] MAINT: use default numpy dtype throughout --- src/cogent3/core/location.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 608c8483c..ea829915f 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1321,7 +1321,7 @@ def from_aligned_segments( # ends with a gap locations += [(aligned_length, aligned_length)] - locations = numpy.array(locations, dtype=numpy.int32).flatten()[1:-1] + locations = numpy.array(locations, dtype=_DEFAULT_GAP_DTYPE).flatten()[1:-1] gap_coords = locations.reshape((locations.shape[0] // 2, 2)) gap_ends, gap_starts = gap_coords[:, ::-1].T gap_lengths = gap_ends - gap_starts From e63de3434d615b910acb6f96d698fd3ad0b3ed2a Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:12:03 +1000 Subject: [PATCH 43/62] MAINT: better error message --- src/cogent3/core/location.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index ea829915f..c83054886 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1363,7 +1363,7 @@ def _(self, item: slice): start = start if start >= 0 else len(self) + start stop = 
stop if stop >= 0 else len(self) + stop if min((start, stop)) < 0: - raise IndexError(f"one of adjusted {start, stop} is < 0") + raise IndexError(f"item.start or item.stop is out of range") if start >= stop: # standard slice behaviour without negative step From 49211c23a3730a8a61717f8795ba14f04f597d5c Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:12:44 +1000 Subject: [PATCH 44/62] MAINT: code tidy --- src/cogent3/core/location.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index c83054886..2def8c73e 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1462,6 +1462,8 @@ def get_align_index(self, seq_index: int, slice_stop: bool = False) -> int: In that case, and if seq_index is in gap_pos then it returns the first alignment index of the gap run. """ + cum_lengths = self.cum_gap_lengths + gap_pos = self.gap_pos # NOTE I explicitly cast all returned values to python int's due to # need for json serialisation, which does not support numpy int classes if seq_index < 0: @@ -1470,32 +1472,29 @@ def get_align_index(self, seq_index: int, slice_stop: bool = False) -> int: if seq_index < 0: raise IndexError(f"{seq_index} negative seq_index beyond limit ") - if not self.num_gaps or seq_index < self.gap_pos[0]: + if not self.num_gaps or seq_index < gap_pos[0]: return int(seq_index) # if stop_index, check if the seq_index corresponds to a gap position - if slice_stop and (match := seq_index == self.gap_pos).any(): + if slice_stop and (match := seq_index == gap_pos).any(): # if so, we return the alignment coord for the first gap position (idx,) = numpy.where(match)[0] if idx: - gap_len = self.cum_gap_lengths[idx] - self.cum_gap_lengths[idx - 1] + gap_len = cum_lengths[idx] - cum_lengths[idx - 1] else: - gap_len = self.cum_gap_lengths[idx] - gap_end = self.gap_pos[idx] + self.cum_gap_lengths[idx] + gap_len = cum_lengths[idx] 
+ gap_end = gap_pos[idx] + cum_lengths[idx] return int(gap_end - gap_len) - cum_gap_lengths = self.cum_gap_lengths - gap_pos = self.gap_pos - if seq_index >= gap_pos[-1]: - return int(seq_index + cum_gap_lengths[-1]) + return int(seq_index + cum_lengths[-1]) # find gap position before seq_index index = numpy.searchsorted(gap_pos, seq_index, side="left") if seq_index < gap_pos[index]: - gap_lengths = cum_gap_lengths[index - 1] if index else 0 + gap_lengths = cum_lengths[index - 1] if index else 0 else: - gap_lengths = cum_gap_lengths[index] + gap_lengths = cum_lengths[index] return int(seq_index + gap_lengths) From b830f63d39c2c286436d650e9f6f4dfbef1e0eea Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:19:06 +1000 Subject: [PATCH 45/62] MAINT: use default gap numpy dtype throughout --- src/cogent3/core/location.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 2def8c73e..66b252962 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1211,7 +1211,7 @@ def _gap_spans( """returns 1D arrays in alignment coordinates of gap start, gap stop""" if not len(gap_pos): - r = numpy.array([], dtype=gap_pos.dtype) + r = numpy.array([], dtype=_DEFAULT_GAP_DTYPE) return r, r ends = gap_pos + cum_gap_lengths @@ -1307,7 +1307,7 @@ def from_aligned_segments( and locations[0][0] == 0 and locations[0][-1] == aligned_length ): - empty = numpy.array([], dtype=int) + empty = numpy.array([], dtype=_DEFAULT_GAP_DTYPE) return cls( gap_pos=empty, cum_gap_lengths=empty.copy(), @@ -1355,7 +1355,7 @@ def _(self, item: slice): raise NotImplementedError( f"{type(self).__name__!r} does not yet support strides" ) - zero_array = numpy.array([], dtype=self.gap_pos.dtype) + zero_array = numpy.array([], dtype=_DEFAULT_GAP_DTYPE) start = item.start or 0 stop = item.stop or len(self) @@ -1396,7 +1396,7 @@ def _(self, item: slice): if 
gap_starts[l] <= start < gap_ends[l] and stop <= gap_ends[l]: # entire span is within a single gap # pos now 0 - gap_pos = numpy.array([0], dtype=self.gap_pos.dtype) + gap_pos = numpy.array([0], dtype=_DEFAULT_GAP_DTYPE) cum_lengths = cum_lengths[l : l + 1] cum_lengths[0] = stop - start return self.__class__( @@ -1541,14 +1541,14 @@ def __add__(self, other) -> "IndelMap": # I'm revising this to be suitable for the concatenation of two aligned # sequences gap_pos = self.gap_pos.tolist() + (self.parent_length + other.gap_pos).tolist() - gap_pos = numpy.array(gap_pos, dtype=self.gap_pos.dtype) + gap_pos = numpy.array(gap_pos, dtype=_DEFAULT_GAP_DTYPE) cum_length = self.cum_gap_lengths[-1] if self.num_gaps else 0 cum_gap_lengths = ( self.cum_gap_lengths.tolist() + (cum_length + other.cum_gap_lengths).tolist() ) - cum_gap_lengths = numpy.array(cum_gap_lengths, dtype=self.cum_gap_lengths.dtype) + cum_gap_lengths = numpy.array(cum_gap_lengths, dtype=_DEFAULT_GAP_DTYPE) return self.__class__( gap_pos=gap_pos, @@ -1701,7 +1701,7 @@ def merge_maps(self, other, parent_length: Optional[int] = None) -> "IndelMap": overrides property """ unique_pos = numpy.union1d(self.gap_pos, other.gap_pos) - gap_lengths = numpy.zeros(unique_pos.shape, dtype=self.cum_gap_lengths.dtype) + gap_lengths = numpy.zeros(unique_pos.shape, dtype=_DEFAULT_GAP_DTYPE) self_lengths = self.get_gap_lengths() other_lengths = other.get_gap_lengths() _update_lengths(unique_pos, gap_lengths, self.gap_pos, self_lengths) @@ -1729,7 +1729,6 @@ def joined_segments(self, coords: SeqCoordTypes) -> "IndelMap": gaps = {} cum_length = 0 cum_parent_length = 0 - dtype = self.gap_pos.dtype for start, end in coords: im = self[start:end] for i in range(im.num_gaps): @@ -1741,8 +1740,8 @@ def joined_segments(self, coords: SeqCoordTypes) -> "IndelMap": cum_parent_length += im.parent_length if im.num_gaps: cum_length += im.cum_gap_lengths[-1] - gap_pos = numpy.empty(len(gaps), dtype=dtype) - cum_lengths = numpy.empty(len(gaps), 
dtype=dtype) + gap_pos = numpy.empty(len(gaps), dtype=_DEFAULT_GAP_DTYPE) + cum_lengths = numpy.empty(len(gaps), dtype=_DEFAULT_GAP_DTYPE) for i, (pos, length) in enumerate(sorted(gaps.items())): gap_pos[i] = pos cum_lengths[i] = length @@ -2157,7 +2156,9 @@ def absolute_position(self, rel_pos: IntTypes) -> IntTypes: raises ValueError if rel_pos < 0 """ check = ( - numpy.array([rel_pos], dtype=int) if isinstance(rel_pos, int) else rel_pos + numpy.array([rel_pos], dtype=_DEFAULT_GAP_DTYPE) + if isinstance(rel_pos, int) + else rel_pos ) if check.min() < 0: raise ValueError(f"must positive, not {rel_pos=}") @@ -2176,7 +2177,9 @@ def relative_position(self, abs_pos: IntTypes) -> IntTypes: raises ValueError if abs_pos < 0 """ check = ( - numpy.array([abs_pos], dtype=int) if isinstance(abs_pos, int) else abs_pos + numpy.array([abs_pos], dtype=_DEFAULT_GAP_DTYPE) + if isinstance(abs_pos, int) + else abs_pos ) if check.min() < 0: raise ValueError(f"must positive, not {abs_pos=}") @@ -2200,12 +2203,12 @@ def gap_coords_to_map( """ if not gaps_lengths: - gap_pos = numpy.array([], dtype=int) + gap_pos = numpy.array([], dtype=_DEFAULT_GAP_DTYPE) lengths = gap_pos.copy() else: gap_pos, lengths = list(zip(*sorted(gaps_lengths.items()))) - gap_pos = numpy.array(gap_pos, dtype=int) - lengths = numpy.array(lengths, dtype=int) + gap_pos = numpy.array(gap_pos, dtype=_DEFAULT_GAP_DTYPE) + lengths = numpy.array(lengths, dtype=_DEFAULT_GAP_DTYPE) return IndelMap(gap_pos=gap_pos, gap_lengths=lengths, parent_length=seq_length) From 952aefb12e58dd4cb55282c2c376eafe8feff99a Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:24:14 +1000 Subject: [PATCH 46/62] MAINT: document IndelMap dunder methods and update comments --- src/cogent3/core/location.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 66b252962..4b016c4ea 100644 --- a/src/cogent3/core/location.py +++ 
b/src/cogent3/core/location.py @@ -1413,7 +1413,7 @@ def _(self, item: slice): # so the absolute gap_pos value remains unchanged, but we shorten # the gap length begin = l - begin_diff = start - gap_starts[l] # if l else self.gap_pos[l] + begin_diff = start - gap_starts[l] lengths[l] -= begin_diff shift = (start - cum_lengths[l - 1] - begin_diff) if l else gap_pos[0] elif start == gap_ends[l]: @@ -1534,12 +1534,8 @@ def __len__(self) -> int: length_gaps = self.cum_gap_lengths[-1] if self.num_gaps else 0 return int(self.parent_length + length_gaps) - def __add__(self, other) -> "IndelMap": - # what was the purpose of this method? The code seems designed for - # combining Maps from the same parent sequence, which is a union rather - # than addition - # I'm revising this to be suitable for the concatenation of two aligned - # sequences + def __add__(self, other: "IndelMap") -> "IndelMap": + """designed to support concatenation of two aligned sequences""" gap_pos = self.gap_pos.tolist() + (self.parent_length + other.gap_pos).tolist() gap_pos = numpy.array(gap_pos, dtype=_DEFAULT_GAP_DTYPE) @@ -1557,7 +1553,7 @@ def __add__(self, other) -> "IndelMap": ) def __mul__(self, scale: int) -> "IndelMap": - # could be used for going from amino-acid alignment to codon alignment + """used for going from amino-acid alignment to codon alignment""" gap_pos = self.gap_pos * scale cum_gap_lengths = self.cum_gap_lengths * scale return self.__class__( From 1b092e54f1b720b5dfdd29310f036344f1c65573 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:24:50 +1000 Subject: [PATCH 47/62] API: IndelMap.offsets property deleted as no longer used --- src/cogent3/core/location.py | 8 -------- tests/test_core/test_location.py | 11 ----------- 2 files changed, 19 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 4b016c4ea..fa79dca13 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1571,14 +1571,6 @@ def 
get_gap_lengths(self) -> IntArrayTypes: lengths[1:] = numpy.diff(lengths) return lengths - @property - def offsets(self) -> IntArrayTypes: - # offsets are the aligned indices for every starting point of a segment - # when we encounter a gap, we include that position and the end of that gap - - starts, ends = _gap_spans(self.gap_pos, self.cum_gap_lengths) - return numpy.array([starts, ends]).T.flatten()[:-1].tolist() - def nongap(self) -> Iterator[SpanTypes]: """ungappeed segments in this map in aligned coordinates""" # we want to know the coordinates of the ungapped segments on diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 493e52451..3a24340ed 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -556,17 +556,6 @@ def test_indelmap_merge_parent_length(): assert ov.parent_length == 20 -@pytest.mark.parametrize("cls", (FeatureMap, IndelMap)) -def test_map_offsets(cls): - # offsets are absolute starts of spans - # 1 - # 01 3 678 1 - seq = DNA.make_seq("-AC---G-TAA--") - m, _ = seq.parse_out_gaps() - got = m.offsets - assert got == [0, 1, 3, 6, 7, 8, 11] - - def test_map_indexed(): m = FeatureMap.from_locations(locations=[(0, 2), (4, 6)], parent_length=6).inverse() indexed = m[2] From 5debc138306302dff906b9ae8c1e88de83ec12a9 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:26:55 +1000 Subject: [PATCH 48/62] MAINT: code tidy --- src/cogent3/core/location.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index fa79dca13..0b41cde8a 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1659,10 +1659,8 @@ def get_coordinates(self) -> SeqCoordTypes: def get_gap_coordinates(self) -> SeqCoordTypes: """returns [(gap pos, gap length), ...]""" - cum_lengths = self.cum_gap_lengths.copy() - diffs = numpy.diff(cum_lengths) - cum_lengths[1:] = diffs - return 
numpy.array([self.gap_pos, cum_lengths]).T.tolist() + lengths = self.get_gap_lengths() + return numpy.array([self.gap_pos, lengths]).T.tolist() def get_gap_align_coordinates(self) -> SeqCoordTypes: """returns [(gap start, gap end), ...] in alignment indices @@ -2151,11 +2149,7 @@ def absolute_position(self, rel_pos: IntTypes) -> IntTypes: if check.min() < 0: raise ValueError(f"must positive, not {rel_pos=}") - if len(self) == self.parent_length: - # handle case of reversed here? - return rel_pos - - return self.start + rel_pos + return rel_pos if len(self) == self.parent_length else self.start + rel_pos def relative_position(self, abs_pos: IntTypes) -> IntTypes: """converts abs_pos into an relative position From a027c8bc1dfb82d1a8864b899500bdd2ef0b0617 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:29:22 +1000 Subject: [PATCH 49/62] MAINT: code tidy --- src/cogent3/core/location.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 0b41cde8a..a09c08aa7 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1706,11 +1706,8 @@ def joined_segments(self, coords: SeqCoordTypes) -> "IndelMap": ---------- coords sequence insert gap coordinates [(gap start, gap end), ...] 
- Returns - ------- - """ - coords = list(sorted(coords)) + coords = sorted(coords) # using a dict here because joining can produce a gap merge gaps = {} cum_length = 0 From 54e739b9d715bac1ed0e0df89c417435730999b2 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:38:18 +1000 Subject: [PATCH 50/62] TST: test additional cases for IndelMap.nongap --- src/cogent3/core/location.py | 2 +- tests/test_core/test_location.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index a09c08aa7..ce89d7031 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1591,7 +1591,7 @@ def nongap(self) -> Iterator[SpanTypes]: prev_pos = pos if self.num_gaps and self.gap_pos[-1] + self.cum_gap_lengths[-1] < len(self): - yield Span(prev_pos, len(self)) + yield Span(self.gap_pos[-1] + self.cum_gap_lengths[-1], len(self)) @property def spans(self) -> Iterator[SpanTypes]: diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 3a24340ed..e1bf70b43 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -370,6 +370,30 @@ def test_nongap(cls): assert got == [(0, 2), (5, 6), (7, 10)] +@pytest.mark.parametrize("cls", (IndelMap, FeatureMap)) +def test_nongap_startswith(cls): + # returns spans corresponding to position on "aligned" seq of nongaps + # 012345678 + seq = DNA.make_seq("--G-TAA--") + m, _ = seq.parse_out_gaps() + m = cls.from_spans(spans=tuple(m.spans), parent_length=m.parent_length) + + got = [(s.start, s.end) for s in m.nongap()] + assert got == [(2, 3), (4, 7)] + + +@pytest.mark.parametrize("cls", (IndelMap, FeatureMap)[:1]) +def test_nongap_not_endswith(cls): + # returns spans corresponding to position on "aligned" seq of nongaps + # 0123456 + seq = DNA.make_seq("--G-TAA") + m, _ = seq.parse_out_gaps() + m = cls.from_spans(spans=tuple(m.spans), parent_length=m.parent_length) 
+ + got = [(s.start, s.end) for s in m.nongap()] + assert got == [(2, 3), (4, 7)] + + def test_spans_gen(): # returns spans corresponding to position on "aligned" seq of nongaps # 000000000011 From 603a8608de78be9ee5ae9ce732ac897fe1f393c1 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 08:40:54 +1000 Subject: [PATCH 51/62] MAINT: code tidy as per rmcar17 suggestion --- src/cogent3/core/location.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index ce89d7031..a534a7f91 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1716,10 +1716,7 @@ def joined_segments(self, coords: SeqCoordTypes) -> "IndelMap": im = self[start:end] for i in range(im.num_gaps): pos = im.gap_pos[i] + cum_parent_length - if pos in gaps: - gaps[pos] += im.cum_gap_lengths[i] - else: - gaps[pos] = im.cum_gap_lengths[i] + cum_length + gaps[pos] = gaps.get(pos, cum_length) + im.cum_gap_lengths[i] cum_parent_length += im.parent_length if im.num_gaps: cum_length += im.cum_gap_lengths[-1] From a40c22bf381b6e591dfb2951eaea394b49fd9539 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 09:06:32 +1000 Subject: [PATCH 52/62] TST: undo test restriction --- tests/test_core/test_location.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index e1bf70b43..9d209dca1 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -382,7 +382,7 @@ def test_nongap_startswith(cls): assert got == [(2, 3), (4, 7)] -@pytest.mark.parametrize("cls", (IndelMap, FeatureMap)[:1]) +@pytest.mark.parametrize("cls", (IndelMap, FeatureMap)) def test_nongap_not_endswith(cls): # returns spans corresponding to position on "aligned" seq of nongaps # 0123456 From 853707622ef3b4ebb2e8a1214dc9828d58eb8d01 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 09:09:14 
+1000 Subject: [PATCH 53/62] MAINT: code tidy in INdelMap.get_coordinates() --- src/cogent3/core/location.py | 37 ++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index a534a7f91..f932ceace 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1640,22 +1640,27 @@ def get_coordinates(self) -> SeqCoordTypes: ------- [(start, end), ...] """ - coords = [] - last = 0 - for pos, cum_length in zip(self.gap_pos, self.cum_gap_lengths): - pos, cum_length = int(pos), int(cum_length) - - if not pos: - continue - coords.append((last, pos)) - last = pos - - if ( - self.num_gaps - and self.gap_pos[-1] + self.cum_gap_lengths[-1] < self.parent_length - ): - coords.append((last, self.parent_length)) - return coords + if not self.num_gaps or (self.num_gaps == 1 and not self.gap_pos[0]): + return [(0, int(self.parent_length))] + elif self.num_gaps == 1: + # does not start with a gap + starts = [0, int(self.gap_pos[0])] + ends = [int(self.gap_pos[0]), self.parent_length] + return list(zip(starts, ends)) + + starts = self.gap_pos[:-1].tolist() + ends = self.gap_pos[1:].tolist() + if self.gap_pos[0]: + # does not start with a gap + ends = starts[:1] + ends + starts = [0] + starts + + if self.gap_pos[-1] + self.cum_gap_lengths[-1] < self.parent_length: + # does end with a gap + starts.append(ends[-1]) + ends.append(self.parent_length) + + return list(zip(starts, ends)) def get_gap_coordinates(self) -> SeqCoordTypes: """returns [(gap pos, gap length), ...]""" From 5c4bfef6dbab3f5b339273ef026c942e784477f1 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 10:23:51 +1000 Subject: [PATCH 54/62] MAINT: handle slice edge case of IndelMap[:0] --- src/cogent3/core/location.py | 2 +- tests/test_core/test_location.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/cogent3/core/location.py 
b/src/cogent3/core/location.py index f932ceace..0e67a7fc1 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1357,7 +1357,7 @@ def _(self, item: slice): ) zero_array = numpy.array([], dtype=_DEFAULT_GAP_DTYPE) start = item.start or 0 - stop = item.stop or len(self) + stop = item.stop if item.stop is not None else len(self) # convert negative indices start = start if start >= 0 else len(self) + start diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 9d209dca1..036bac175 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -683,6 +683,17 @@ def test_indelmap_slice_terminating(): assert got.cum_gap_lengths.tolist() == expect.cum_gap_lengths.tolist() +def test_indelmap_slice_zero(): + raw = "-CB-A--" + start, end = 0, 0 + expect, s = DNA.make_seq(raw[start:end]).parse_out_gaps() + imap, _ = DNA.make_seq(raw).parse_out_gaps() + got = imap[start:end] + got.parent_length == len(s) + assert got.gap_pos.tolist() == expect.gap_pos.tolist() + assert got.cum_gap_lengths.tolist() == expect.cum_gap_lengths.tolist() + + def test_indelmap_invalid_slice_range(): imap = IndelMap( gap_pos=numpy.array([10], dtype=int), From f07d57e0d103e8a1d5dbeb493b4e0b3666fbd5ba Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 10:26:45 +1000 Subject: [PATCH 55/62] MAINT: Feature.map add start stop properties [NEW] these were being dynamically created in post_init, now define _start, _end attributes and associated properties. Cannot be set except via private attributes, which is being done within the class methods. 
--- src/cogent3/core/location.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 0e67a7fc1..fbc0c051d 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1840,6 +1840,8 @@ class FeatureMap(MapABC): complete: bool = dataclasses.field(init=False, repr=False, default=True) _serialisable: dict = dataclasses.field(init=False, repr=False) _spans: SeqSpanTypes = dataclasses.field(default=(), init=False) + _start: Optional[int] = dataclasses.field(default=None, init=False) + _end: Optional[int] = dataclasses.field(default=None, init=False) def __post_init__(self, spans: SeqSpanTypes): assert self.parent_length is not None @@ -1864,10 +1866,10 @@ def __post_init__(self, spans: SeqSpanTypes): self.complete = False elif not self.useful: self.useful = True - self.start, self.end = span.start, span.end + self._start, self._end = span.start, span.end else: - self.start = min(self.start, span.start) - self.end = max(self.end, span.end) + self._start = min(self._start, span.start) + self._end = max(self._end, span.end) self._spans = tuple(spans) self.length = posn @@ -2128,8 +2130,8 @@ def zeroed(self) -> "FeatureMap": span.end -= shift new_end = max(new_end, span.end) - zeroed.start = 0 - zeroed.end = new_end + zeroed._start = 0 + zeroed._end = new_end return zeroed @@ -2166,6 +2168,14 @@ def relative_position(self, abs_pos: IntTypes) -> IntTypes: raise ValueError(f"must positive, not {abs_pos=}") return abs_pos - self.start + @property + def start(self): + return self._start or 0 + + @property + def end(self): + return self._end or 0 + def gap_coords_to_map( gaps_lengths: dict[IntTypes, IntTypes], seq_length: int From b0ca1cb91e3f7a4f80791753a462a4232744cd9d Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 11:22:57 +1000 Subject: [PATCH 56/62] API: IndelMap.make_seq_feature_map no longer supports gaps [CHANGED] method is meant 
to return sequence coordinates of a FeatureMap that's in alignment coordinates. So including gaps makes no sense. (That concept does apply to mapping a FeatureMap in sequence coordinates to Alignment coordinates.) [NEW] added tests! --- src/cogent3/core/location.py | 21 ++++++--------------- tests/test_core/test_location.py | 25 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index fbc0c051d..563529a8e 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1796,37 +1796,28 @@ def to_feature_map(self) -> "FeatureMap": """returns a Map type, suited to Features""" return FeatureMap(spans=list(self.spans), parent_length=self.parent_length) - def make_seq_feature_map( - self, align_feature_map: "FeatureMap", include_gaps: bool = True - ) -> "FeatureMap": + def make_seq_feature_map(self, align_feature_map: "FeatureMap") -> "FeatureMap": """converts align_feature_map to a FeatureMap with sequence coordinates Parameters ---------- align_feature_map with alignment coordinates - include_gaps - whether to include gaps from self as LostSpan's + + Notes + ----- + LostSpans in align_feature_map are skipped """ - gap_spans = {} - if include_gaps and self.num_gaps: - last = 0 - for pos, cum_length in zip(self.gap_pos, self.cum_gap_lengths): - gap_spans[pos] = LostSpan(cum_length - last) - last = cum_length spans = [] for span in align_feature_map.spans: if span.lost: - spans.append(span) continue start = self.get_seq_index(span.start) end = self.get_seq_index(span.end) - if lost := gap_spans.pop(span.start, None): - spans.append(lost) spans.append(Span(start, end)) - return FeatureMap(spans=spans, parent_length=align_feature_map.parent_length) + return FeatureMap(spans=spans, parent_length=self.parent_length) @dataclasses.dataclass diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 036bac175..132a9453c 100644 --- 
a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -1170,3 +1170,28 @@ def test_featuremap_div(): fm_1 = fm_3 / 3 assert list(fm_1.spans) == [sp / 3 for sp in spans] assert fm_1.parent_length == 6 / 3 + + +def test_indelmap_make_seq_feature_map(): + # 1 + # 01234567890 + # AC--GTA-TAA + # 01 234 567 + im = IndelMap( + gap_pos=numpy.array([2, 5], dtype=int), + gap_lengths=numpy.array([2, 1], dtype=int), + parent_length=8, + ) + orig_spans = [Span(1, 5)] + align_map = FeatureMap(spans=orig_spans, parent_length=11) + spans = [Span(1, 3)] + expect = FeatureMap(spans=spans, parent_length=8) + got = im.make_seq_feature_map(align_map) + assert got.get_coordinates() == expect.get_coordinates() + assert got.parent_length == expect.parent_length + + # ignoring lost spans + align_map = FeatureMap(spans=orig_spans + [LostSpan(4)], parent_length=11) + got = im.make_seq_feature_map(align_map) + assert got.get_coordinates() == expect.get_coordinates() + assert got.parent_length == expect.parent_length From 133581b13587ddb2fd993f2ccbcdbbb8b868d7e5 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 11:23:32 +1000 Subject: [PATCH 57/62] MAINT: directly use features for slicing in a couple of cases --- tests/test_core/test_features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_core/test_features.py b/tests/test_core/test_features.py index 40290d0e6..594ac9e22 100755 --- a/tests/test_core/test_features.py +++ b/tests/test_core/test_features.py @@ -433,7 +433,7 @@ def test_feature_from_alignment(): # But these will be returned as **alignment** # features with locations in alignment coordinates. 
- assert aln_exons.get_slice().to_dict() == {"x": "AAAAA", "y": "--TTT"} + assert aln[aln_exons].to_dict() == {"x": "AAAAA", "y": "--TTT"} # Similarly alignment features can be projected onto the aligned sequences, # where they may end up falling across gaps: @@ -451,7 +451,7 @@ def test_nested_get_slice(): s.add_feature(biotype="exon", name="trev", spans=[(30, 40)]) s.add_feature(biotype="repeat", name="bob", spans=[(12, 17)], parent_id="fred") f = list(ex.get_children())[0] - assert str(f.get_slice()) == str(s[12:17]) + assert str(s[f]) == str(s[12:17]) def test_roundtrip_annotated_seq(): From c4a9ce11ceea2cc34b3d66d0819f7bd7b2a21036 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 11:24:55 +1000 Subject: [PATCH 58/62] MAINT: update slicing of Aligned by FeatureMap [CHANGED] since FeatureMap coordinates are now always "plus" strand, first logical condition in __getitem__ no longer applies [CHANGED] update call to IndelMap.make_seq_feature_map reflecting change to that signature --- src/cogent3/core/alignment.py | 8 ++------ tests/test_core/test_alignment.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/cogent3/core/alignment.py b/src/cogent3/core/alignment.py index e34cf8b79..cb97be425 100644 --- a/src/cogent3/core/alignment.py +++ b/src/cogent3/core/alignment.py @@ -2250,11 +2250,7 @@ def _(self, span: int): def _(self, span: FeatureMap): # we assume the feature map is in align coordinates start, end = span.start, span.end - if span.useful and start > end: - empty = numpy.array([], dtype=self.map.gap_pos.dtype) - im = IndelMap(gap_pos=empty, cum_gap_lengths=empty, parent_length=0) - data = self.data[:0] - elif span.useful and len(list(span.spans)) == 1: + if span.useful and len(list(span.spans)) == 1: im = self.map[start:end] seq_start = self.map.get_seq_index(start) seq_end = self.map.get_seq_index(end) @@ -2266,7 +2262,7 @@ def _(self, span: FeatureMap): # multiple spans align_coords = 
span.get_coordinates() im = self.map.joined_segments(align_coords) - seq_map = self.map.make_seq_feature_map(span, include_gaps=False) + seq_map = self.map.make_seq_feature_map(span) data = self.data.gapped_by_map(seq_map) return Aligned(im, data) diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index 2cb416735..19559f4f9 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -2612,7 +2612,7 @@ def test_get_gap_array_equivalence(): assert_allclose(array_aln.get_gap_array(), aln.get_gap_array()) -@pytest.mark.parametrize("reverse", (False, True)[1:]) +@pytest.mark.parametrize("reverse", (False, True)) def test_aligned_rich_dict(reverse): map_, s = make_seq( "TTGAAGAATATGT------GAAAGAG", name="s1", moltype="dna" @@ -3667,3 +3667,31 @@ def test_slice_aligned(raw): al = Aligned(imap, seq) sliced = al[:-3] assert str(sliced) == raw[:-3] + + +def test_slice_aligned_featuremap_allgap(): + from cogent3.core.location import FeatureMap, LostSpan + + imap, seq = DNA.make_seq("AAAGGGGGAACCCT", name="x").parse_out_gaps() + al = Aligned(imap, seq) + fmap = FeatureMap(spans=[LostSpan(4)], parent_length=0) + sliced = al[fmap] + assert not sliced + + +def test_slice_aligned_featuremap_multi_spans(): + from cogent3.core.location import FeatureMap + + # 1111111 + # 01234567890123456 + # *** ** *** + raw_seq = "AAAGG--GGG-AACCCT" + # 01234 567 890123 + # 1111 + imap, seq = DNA.make_seq(raw_seq, name="x").parse_out_gaps() + al = Aligned(imap, seq) + fmap = FeatureMap.from_locations( + locations=[(1, 4), (7, 9), (13, 16)], parent_length=len(raw_seq) + ) + sliced = al[fmap] + assert str(sliced) == "AAGGGCCC" From dc0516a8b0b1ff0355842b5079b5cffa798840cd Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 11:41:14 +1000 Subject: [PATCH 59/62] MAINT: address issues identified by codacy --- src/cogent3/core/location.py | 2 +- tests/test_core/test_location.py | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/cogent3/core/location.py b/src/cogent3/core/location.py index 563529a8e..1578615e6 100644 --- a/src/cogent3/core/location.py +++ b/src/cogent3/core/location.py @@ -1363,7 +1363,7 @@ def _(self, item: slice): start = start if start >= 0 else len(self) + start stop = stop if stop >= 0 else len(self) + stop if min((start, stop)) < 0: - raise IndexError(f"item.start or item.stop is out of range") + raise IndexError("item.start or item.stop is out of range") if start >= stop: # standard slice behaviour without negative step diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 132a9453c..b6aa01c6e 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -689,7 +689,7 @@ def test_indelmap_slice_zero(): expect, s = DNA.make_seq(raw[start:end]).parse_out_gaps() imap, _ = DNA.make_seq(raw).parse_out_gaps() got = imap[start:end] - got.parent_length == len(s) + assert got.parent_length == len(s) assert got.gap_pos.tolist() == expect.gap_pos.tolist() assert got.cum_gap_lengths.tolist() == expect.cum_gap_lengths.tolist() From 3ce4a1f44d11d61caf7c5c4ae123a7e17c0a1209 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 12:55:17 +1000 Subject: [PATCH 60/62] TST: improve testing of the IndelMap.nucleic_reversed() method --- tests/test_core/test_location.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index b6aa01c6e..014d0fdbb 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -620,19 +620,20 @@ def test_indelmap_to_feature_map(): assert mm.get_coordinates() == im.get_coordinates() -def test_indelmap_nucleic_reversed(): - # 01 2345 - # --AC--GGGG-- - orig, _ = DNA.make_seq("--AC--GGGG--").parse_out_gaps() - plus_coords = [(0, 2), (2, 6)] - assert orig.get_coordinates() == plus_coords - rev = orig.nucleic_reversed() - 
assert rev.num_gaps == orig.num_gaps - minus_coords = [(0, 4), (4, 6)] - assert rev.get_coordinates() == minus_coords - # reversing again returns original - back = rev.nucleic_reversed() - assert orig.get_coordinates() == back.get_coordinates() +@pytest.mark.parametrize("raw", ("--AC--GGGG--", "A-A-A", "-A-AA----A")) +def test_indelmap_nucleic_reversed(raw): + from cogent3.core.alignment import Aligned + + plus = DNA.make_seq(raw) + minus = plus.rc() + plus_imap, plus_seq = DNA.make_seq(raw).parse_out_gaps() + minus_imap, minus_seq = minus.parse_out_gaps() + got = plus_imap.nucleic_reversed() + assert got.get_coordinates() == minus_imap.get_coordinates() + assert (got.gap_pos == minus_imap.gap_pos).all() + assert (got.cum_gap_lengths == minus_imap.cum_gap_lengths).all() + assert got.parent_length == minus_imap.parent_length + assert str(Aligned(got, minus_seq)) == str(minus) def test_get_coords(): From 5f467b946dd8d2854666afbf014f4016b54a1ad7 Mon Sep 17 00:00:00 2001 From: Gavin Huttley Date: Thu, 2 May 2024 12:59:27 +1000 Subject: [PATCH 61/62] MAINT: delete unused variable --- tests/test_core/test_location.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core/test_location.py b/tests/test_core/test_location.py index 014d0fdbb..55eea4e75 100644 --- a/tests/test_core/test_location.py +++ b/tests/test_core/test_location.py @@ -626,7 +626,7 @@ def test_indelmap_nucleic_reversed(raw): plus = DNA.make_seq(raw) minus = plus.rc() - plus_imap, plus_seq = DNA.make_seq(raw).parse_out_gaps() + plus_imap, _ = DNA.make_seq(raw).parse_out_gaps() minus_imap, minus_seq = minus.parse_out_gaps() got = plus_imap.nucleic_reversed() assert got.get_coordinates() == minus_imap.get_coordinates() From 3cf391088e97321df14a1e7c3f64d16da5021ed5 Mon Sep 17 00:00:00 2001 From: GavinHuttley Date: Thu, 2 May 2024 04:15:07 +0000 Subject: [PATCH 62/62] STY: pre-commit linting with black and isort --- tests/test_core/test_alignment.py | 1 - 1 file changed, 1 
deletion(-) diff --git a/tests/test_core/test_alignment.py b/tests/test_core/test_alignment.py index 02fbd459a..5000b7c56 100644 --- a/tests/test_core/test_alignment.py +++ b/tests/test_core/test_alignment.py @@ -3743,4 +3743,3 @@ def test_sequence_collection_repr(): data = {} seqs = SequenceCollection(data=data, moltype=BYTES) assert repr(seqs) == "0x () bytes seqcollection" -