Merge d7bc00c into 618cfa6

rigdenlab · Mar 18, 2018 · 7d08471 · 7d08471
2 parents 618cfa6 + d7bc00c
commit 7d08471
Show file tree

Hide file tree

Showing 7 changed files with 190 additions and 122 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -16,13 +16,18 @@ Added
 - ``ContactMap.singletons`` returns a copy of the contact map with singleton contacts, i.e. ones without neighbors
 - ``Sequence.seq_encoded`` to allow turning a sequence into an encoded list
 - ``Sequence.encoded_matrix`` to give the entire alignment as encoded matrix
+- ``SequenceFile.filter_gapped`` to filter sequences with a certain threshold of gaps
 Changed
 ~~~~~~~
 - Changed API interface for ``conkit.plot`` in accordance to necessary changes for above
 - ``ContactMapFigure`` now accepts ``lim`` parameters for axes limits
 - ``ContactMapFigure`` and ``ContacctMapChordFigure`` improved to better space marker size
 - Typos corrected in documentation 
 - ``THREE_TO_ONE`` and ``ONE_TO_THREE`` dictionaries modified to ``Enum`` objects
+- ``SequeneFile.neff`` renamed to ``SequenceFile.meff``
+Fixed
+~~~~~
+- ``A3mParser`` keyword argument mismatch sorted
 
 [0.8.4]
 -------

diff --git a/CONTRIB.rst b/CONTRIB.rst
@@ -5,7 +5,7 @@ Contributors
 This is a list of people who have made contributions to ConKit.
 
 - `Daniel Rigden <https://github.com/DanielRigden>`_
-- `Adam Simpkin <https://github.com/hlasimpk`_
+- `Adam Simpkin <https://github.com/hlasimpk>`_
 - `Felix Simkovic <https://github.com/fsimkovic>`_
 - `Jens Thomas <https://github.com/DanielRigden>`_
 - `Stefan Seemayer <https://github.com/sseemayer>`_
diff --git a/conkit/core/contactmap.py b/conkit/core/contactmap.py
@@ -317,16 +317,13 @@ def singletons(self):
         :obj:`ContactMap <conkit.core.contactmap.ContactMap>`
 
         """
-        maxd = 2
-        contacts = np.asarray(self.as_list())
-        iterator = np.arange(self.ncontacts)
+        contacts = np.array(self.as_list(), dtype=np.int64)
         removables = set()
-        for i in iterator:
-            for j in iterator[i + 1:]:
+        for i in np.arange(contacts.shape[0]):
+            for j in np.arange(i + 1, contacts.shape[0]):
                 dist = np.abs(contacts[i] - contacts[j])
-                if dist[0] <= maxd and dist[1] <= maxd:
-                    removables.add(tuple(contacts[i]))
-                    removables.add(tuple(contacts[j]))
+                if np.all(dist <= 2):
+                    removables.update({tuple(contacts[i]), tuple(contacts[j])})
         singletons = self.deepcopy()
         [singletons.remove(i) for i in removables]
         return singletons

diff --git a/conkit/core/sequencefile.py b/conkit/core/sequencefile.py
@@ -45,8 +45,7 @@
     from itertools import izip as zip
 
 from conkit.core._entity import _Entity
-from conkit.core.mappings import SequenceAlignmentState
-
+from conkit.core.mappings import AminoAcidMapping, SequenceAlignmentState
 
 class SequenceFile(_Entity):
     """A sequence file object representing a single sequence file
@@ -134,7 +133,7 @@ def diversity(self):
         """The diversity of an alignment defined by :math:`\sqrt{N}/L`. 
         
         ``N`` equals the number of sequences in
-        thethe alignment and ``L`` the sequence length
+        the alignment and ``L`` the sequence length
         
         """
         if self.empty:
@@ -151,6 +150,13 @@ def empty(self):
 
     @property
     def neff(self):
+        """The number of effective sequences"""
+        import warnings
+        warnings.warn("This function will be deprecated in a future release! Use meff instead!")
+        return self.meff
+
+    @property
+    def meff(self):
         """The number of effective sequences"""
         return int(sum(self.calculate_weights()))
 
@@ -220,7 +226,7 @@ def calculate_meff(self, identity=0.8):
 
         See Also
         --------
-        neff
+        meff
 
         """
         import warnings
@@ -235,8 +241,23 @@ def calculate_neff_with_identity(self, identity):
         neff, calculate_weights
 
         """
+        import warnings
+        warnings.warn("This function will be deprecated in a future release! Use calculate_meff_with_identity instead!")
+        return self.calculate_neff_with_identity(identity) 
+
+    def calculate_meff_with_identity(self, identity):
+        """Calculate the number of effective sequences with specified sequence identity
+        
+        See Also
+        --------
+        neff, calculate_weights
+
+        """
+        import warnings
+        warnings.warn("This function will be deprecated in a future release! Use calculate_meff_with_identity instead!")
         return int(sum(self.calculate_weights(identity=identity)))
 
+
     def calculate_weights(self, identity=0.8):
         """Calculate the sequence weights
 
@@ -261,43 +282,39 @@ def calculate_weights(self, identity=0.8):
 
         Raises
         ------
-        MemoryError
-           Too many sequences in the alignment for Hamming distance calculation
-        RuntimeError
-           SciPy package not installed
+        ImportError
+           Cannot find SciPy package
         ValueError
            :obj:`SequenceFile <conkit.core.sequencefile.SequenceFile>` is not an alignment
         ValueError
            Sequence Identity needs to be between 0 and 1
 
         """
+        # http://stackoverflow.com/a/41090953/3046533
         try:
             import scipy.spatial
         except ImportError:
-            raise RuntimeError('Cannot find SciPy package')
-
-        if not self.is_alignment:
-            raise ValueError('This is not an alignment')
+            raise ImportError('Cannot find SciPy package')
 
         if identity < 0 or identity > 1:
             raise ValueError("Sequence Identity needs to be between 0 and 1")
-
-        msa_mat = np.array(self.ascii_matrix)
-        n = msa_mat.shape[0]  # size of the data
-        batch_size = min(n, 250)  # size of the batches
-        hamming = np.zeros(n, dtype=np.int)  # storage for data
-        # Separate the distance calculations into batches to avoid MemoryError exceptions.
-        # This answer was provided by a StackOverflow user. The corresponding suggestion by
-        # user @WarrenWeckesser: http://stackoverflow.com/a/41090953/3046533
-        num_full_batches, last_batch = divmod(n, batch_size)
-        batches = [batch_size] * num_full_batches
-        if last_batch != 0:
-            batches.append(last_batch)
-        for k, batch in enumerate(batches):
-            i = batch_size * k
-            dists = scipy.spatial.distance.cdist(msa_mat[i:i + batch], msa_mat, metric='hamming')
-            hamming[i:i + batch] = (dists < (1 - identity)).sum(axis=1)
-        return (1. / hamming).tolist()
+        
+        if self.is_alignment:
+            msa_mat = np.array(self.ascii_matrix)
+            M = msa_mat.shape[0]  # size of the data
+            batch_size = min(M, 250)  # size of the batches
+            hamming = np.zeros(M, dtype=np.int)  # storage for data
+            num_full_batches, last_batch = divmod(M, batch_size)
+            batches = [batch_size] * num_full_batches
+            if last_batch != 0:
+                batches.append(last_batch)
+            for k, batch in enumerate(batches):
+                i = batch_size * k
+                dists = scipy.spatial.distance.cdist(msa_mat[i:i + batch], msa_mat, metric='hamming')
+                hamming[i:i + batch] = (dists < (1 - identity)).sum(axis=1)
+            return (1. / hamming).tolist()
+        else:
+            raise ValueError('This is not an alignment')
 
     def calculate_freq(self):
         """Calculate the gap frequency in each alignment column
@@ -308,8 +325,7 @@ def calculate_freq(self):
         Returns
         -------
         list
-           A list containing the per alignment-column amino acid
-           frequency count
+           A list containing the per alignment-column amino acid frequency count
 
         Raises
         ------
@@ -320,26 +336,24 @@ def calculate_freq(self):
 
         """
         if self.is_alignment:
-            msa_mat = np.array(self.ascii_matrix)
-            aa_frequencies = np.where(msa_mat != 45, 1, 0)
-            aa_counts = np.sum(aa_frequencies, axis=0)
-            return (aa_counts / len(msa_mat[:, 0])).tolist()
+            msa_mat = np.array(self.encoded_matrix, dtype=np.int64)
+            return 1.0 - (msa_mat == AminoAcidMapping["X"].value).sum(axis=0) / self.nseq
         else:
             raise ValueError('This is not an alignment')
 
     def filter(self, min_id=0.3, max_id=0.9, inplace=False):
-        """Filter an alignment
+        """Filter sequences from an alignment according to the minimum and maximum identity
+        between the sequences
 
         Parameters
         ----------
         min_id : float, optional
-
+           Minimum sequence identity
         max_id : float, optional
-
+           Maximum sequence identity
         inplace : bool, optional
            Replace the saved order of sequences [default: False]
 
-
         Returns
         -------
         obj
@@ -354,9 +368,9 @@ def filter(self, min_id=0.3, max_id=0.9, inplace=False):
         ValueError
            :obj:`SequenceFile <conkit.core.sequencefile.SequenceFile>` is not an alignment
         ValueError
-           Minimum sequence Identity needs to be between 0 and 1
+           Minimum sequence identity needs to be between 0 and 1
         ValueError
-           Maximum sequence Identity needs to be between 0 and 1
+           Maximum sequence identity needs to be between 0 and 1
 
         """
         try:
@@ -372,18 +386,43 @@ def filter(self, min_id=0.3, max_id=0.9, inplace=False):
         elif 0 > max_id > 1:
             raise ValueError("Maximum sequence Identity needs to be between 0 and 1")
 
-        # Alignment to ASCII matrix
         msa_mat = np.array(self.ascii_matrix)
-        # Find all throwable sequences
         throw = set()
         for i in np.arange(len(self)):
             ident = 1 - scipy.spatial.distance.cdist([msa_mat[i]], msa_mat[i + 1:], metric='hamming')[0]
             throw.update((1 + i + np.argwhere((ident < min_id) | (ident > max_id)).flatten()).tolist())
-        # Throw the previously selected sequences
         sequence_file = self._inplace(inplace)
         for i in reversed(list(throw)):
             sequence_file.remove(self[i].id)
+        return sequence_file
 
+    def filter_gapped(self, min_prop=0.0, max_prop=0.9, inplace=True):
+        """Filter all sequences a gap proportion greater than the limit
+        
+        Parameters
+        ----------
+        min_prop : float, optional
+           Minimum allowed gap proportion [default: 0.0]
+        max_prop : float, optional
+           Maximum allowed gap proportion [default: 0.9]
+        inplace : bool, optional
+           Replace the saved order of sequences [default: False]
+
+        Returns
+        -------
+        obj
+           The reference to the :obj:`SequenceFile <conkit.core.sequencefile.SequenceFile>`, regardless of inplace
+
+        """
+        msa_mat = np.array(self.encoded_matrix)
+        throw = set()
+        for i in np.arange(len(self)):
+            gap_prop = (msa_mat[i] == AminoAcidMapping["X"].value).sum() / float(msa_mat[i].shape[0])
+            if gap_prop < min_prop or gap_prop > max_prop:
+                throw.add(self._child_list[i].id)
+        sequence_file = self._inplace(inplace)
+        for id_ in throw:
+            sequence_file.remove(id_)
         return sequence_file
 
     def sort(self, kword, reverse=False, inplace=False):