In [19]:
from flukit import NumberedResidue
from flukit.utils import ALLOWED_LETTERS
import Bio.Seq as bs
from typing import Dict, List, Union

In [83]:
["A", "B"]

['A', 'B']

In [86]:
'asdsad'.replace('a', '')

'sdsd'

In [81]:
set('ACSADW').issubset(ALLOWED_LETTERS)

True

In [13]:
class NumberedProtein(bs.MutableSeq):
    """Mutable sequence type built directly on ``bs.MutableSeq``.

    Construction is forwarded unchanged to the parent class; no extra state
    or validation is added yet.
    """

    def __init__(self, *seq_args, **seq_kwargs):
        # TODO: limit the amino acid letters
        super().__init__(*seq_args, **seq_kwargs)

In [14]:
# Build a 4-residue example instance of the MutableSeq-based NumberedProtein.
# NOTE(review): `np` is the conventional alias for numpy; numpy is not imported
# in the cells shown, but a more descriptive name would avoid confusion.
np = NumberedProtein('ATKC')

In [15]:
dir(np)

['__abstractmethods__',
 '__add__',
 '__array_ufunc__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_data',
 'append',
 'back_transcribe',
 'complement',
 'complement_rna',
 'count',
 'count_overlap',
 'defined',
 'defined_ranges',
 'endswith',
 'extend',
 'find',
 'index',
 'insert',
 'islower',
 'isupper',
 'join',
 'lower',
 'lstrip',
 'pop',
 'remove',
 'removeprefix',
 'removesuffix',
 'replace',
 'reverse',
 'reverse_complement',
 'reverse_complement_rna',
 'rfind',
 'rindex',
 

In [17]:
# `Seq` was never imported by name; it is reachable through the `bs` alias
# created by `import Bio.Seq as bs`.
bs.Seq('asdsad')

Seq('asdsad')

In [8]:
np._data

b'ATKC'

In [5]:
np

NumberedProtein('ATKC')

In [6]:
np

NumberedProtein('ATKC')

In [63]:


class NumberedProtein:
    """Protein sequence addressable through several residue-numbering schemas.

    The instance keeps an immutable copy of the sequence it was built from, a
    mutable working copy, and a table mapping schema names to per-residue
    position labels. The built-in ``"orig"`` schema numbers residues
    ``"1".."N"``. Point substitutions applied through :meth:`replace_residue`
    are tracked relative to the original sequence.
    """

    def __init__(
        self,
        sequence: str,
        numbering_schemas: Dict[str, List[str]] = None,
        default_schema: str = "orig",
    ) -> None:
        """Create a numbered protein.

        Args:
            sequence: Residue letters of the protein.
            numbering_schemas: Optional mapping of schema name to a list of
                position labels, one label per residue. Labels are copied and
                converted to ``str`` so integer labels can be looked up too.
            default_schema: Schema whose labels are used when formatting
                mutation strings.

        Raises:
            ValueError: If a schema's length differs from the sequence length
                or ``default_schema`` is not a known schema.
        """
        # Validate everything before assigning instance state so a bad
        # argument never leaves a half-initialised object behind.
        schemas = {"orig": [str(i + 1) for i in range(len(sequence))]}
        if numbering_schemas:
            for schema_name, numbers in numbering_schemas.items():
                if len(numbers) != len(sequence):
                    raise ValueError(
                        f"Schema {schema_name} length doesn't match sequence length"
                    )
                # Copy + stringify: lookups always compare against str(position),
                # and later edits to the caller's list must not leak in here.
                schemas[schema_name] = [str(number) for number in numbers]

        if default_schema not in schemas:
            raise ValueError(f"Unknown schema: {default_schema}")

        self._raw_seq = bs.Seq(sequence)
        self._current_seq = bs.MutableSeq(sequence)
        self._numbering_schemas = schemas
        # 0-based sequence index -> mutation string. Keyed by index (not by
        # label) so duplicate labels or a later change of ``default_schema``
        # cannot orphan or overwrite a record.
        self._mutations = {}
        self.default_schema = default_schema

    @property
    def raw_sequence(self) -> str:
        """The sequence the object was created with, as a plain string."""
        return str(self._raw_seq)

    @property
    def current_sequence(self) -> str:
        """The working sequence, including all applied substitutions."""
        return str(self._current_seq)

    @property
    def mutations(self) -> List[str]:
        """Get the list of mutations in the format 'X#Y' where:
        X is the residue of the original sequence at that position
        # is the position in default_schema (as of when it was recorded)
        Y is the residue currently at that position

        Successive substitutions at one position collapse into a single entry
        relative to the original sequence (K2P then P2A is reported as K2A),
        and a substitution that restores the original residue removes the
        entry altogether.
        """
        return list(self._mutations.values())

    def _resolve_index(self, schema: str, position: Union[str, int]) -> int:
        """Translate a (schema, position) pair into a 0-based sequence index.

        Raises:
            ValueError: If the schema is unknown or the position cannot be
                found in it.
        """
        if schema not in self._numbering_schemas:
            raise ValueError(f"Unknown schema: {schema}")

        # Fast path: an integer in the "orig" schema is a 1-based offset.
        if schema == "orig" and isinstance(position, int):
            if position < 1 or position > len(self._current_seq):
                raise ValueError(f"Position {position} out of range")
            return position - 1

        # Everything else is matched by its string label.
        label = str(position)
        try:
            return self._numbering_schemas[schema].index(label)
        except ValueError:
            # ``from None`` hides the uninformative "x is not in list" chain.
            raise ValueError(
                f"Position {label} not found in schema {schema}"
            ) from None

    def get_residue(
        self, schema: str = "orig", position: Union[str, int] = None
    ) -> "NumberedResidue":
        """Return the current residue at ``position`` of ``schema``.

        The returned ``NumberedResidue`` is built from the residue letter and
        a dict holding that residue's label in every known schema.

        Raises:
            ValueError: If the schema is unknown or the position is not found.
        """
        idx = self._resolve_index(schema, position)
        number_dict = {
            schema_name: numbers[idx]
            for schema_name, numbers in self._numbering_schemas.items()
        }
        return NumberedResidue(residue=self._current_seq[idx], number_dict=number_dict)

    def replace_residue(
        self,
        schema: str = "orig",
        position: Union[str, int] = None,
        new_residue: str = None,
    ) -> None:
        """Substitute the residue at ``position`` of ``schema``.

        Args:
            schema: Numbering schema ``position`` refers to.
            position: Position label (or 1-based int for ``"orig"``).
            new_residue: Single letter contained in ``ALLOWED_LETTERS``.

        Raises:
            ValueError: If the schema or position is unknown, the residue is
                not a single allowed letter, or ``default_schema`` no longer
                names a known schema.
        """
        idx = self._resolve_index(schema, position)

        if (
            not isinstance(new_residue, str)
            or len(new_residue) != 1
            or new_residue not in ALLOWED_LETTERS
        ):
            raise ValueError("Invalid residue")

        # ``default_schema`` is a public attribute and may have been changed
        # since construction; fail with a clear error instead of a KeyError.
        default_numbers = self._numbering_schemas.get(self.default_schema)
        if default_numbers is None:
            raise ValueError(f"Unknown schema: {self.default_schema}")

        orig_residue = self._raw_seq[idx]
        if new_residue == orig_residue:
            # Back to the original letter: nothing left to report here.
            self._mutations.pop(idx, None)
        else:
            # Always describe the change relative to the ORIGINAL residue so
            # chained substitutions do not yield labels such as "P2A" for a
            # position that started out as K.
            self._mutations[idx] = f"{orig_residue}{default_numbers[idx]}{new_residue}"

        self._current_seq[idx] = new_residue

    def __len__(self) -> int:
        """Return the length of the sequence."""
        return len(self._current_seq)

    def __str__(self) -> str:
        """Return the current sequence as a string."""
        return self.current_sequence

    def __repr__(self) -> str:
        """Return a detailed representation including schemas."""
        return f"NumberedProtein(sequence='{self.current_sequence}', schemas={list(self._numbering_schemas.keys())})"


In [64]:
# Five-residue example with two extra numbering schemas, 'H3' and 'H5'. Each
# list supplies one position label per residue (H3 skips '2', H5 skips '5').
np = NumberedProtein('MKTCY', {'H3':['1', '3', '4', '5', '6'], 'H5': ['1', '2', '3', '4', '6']})

In [47]:
np

NumberedProtein(sequence='MKTCY', schemas=['orig', 'H3', 'H5'])

In [57]:
np.get_residue('orig', 5)

    |Y|
orig|5|
H3  |6|
H5  |6|

In [49]:
np.get_residue('H5', 6)

    |Y|
orig|5|
H3  |6|
H5  |6|

In [65]:
# H3 label '5' is the 4th residue (C); with the default 'orig' schema the
# substitution is recorded as 'C4K'.
np.replace_residue('H3', '5', 'K')

In [66]:
# H3 label '3' is the 2nd residue (K); with the default 'orig' schema the
# substitution is recorded as 'K2C'.
np.replace_residue('H3', '3', 'C')

In [68]:
np.mutations

['C4K', 'K2C']

In [61]:
np

NumberedProtein(sequence='MKTCY', schemas=['orig', 'H3', 'H5'])

In [62]:
np

NumberedProtein(sequence='MKTCY', schemas=['orig', 'H3', 'H5'])