Skip to content

Commit

Permalink
Merge d7bc00c into 618cfa6
Browse files Browse the repository at this point in the history
  • Loading branch information
Felix Simkovic committed Mar 18, 2018
2 parents 618cfa6 + d7bc00c commit 7d08471
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 122 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,18 @@ Added
- ``ContactMap.singletons`` returns a copy of the contact map with singleton contacts, i.e. ones without neighbors
- ``Sequence.seq_encoded`` to allow turning a sequence into an encoded list
- ``Sequence.encoded_matrix`` to give the entire alignment as encoded matrix
- ``SequenceFile.filter_gapped`` to filter sequences with a certain threshold of gaps
Changed
~~~~~~~
- Changed API interface for ``conkit.plot`` in accordance to necessary changes for above
- ``ContactMapFigure`` now accepts ``lim`` parameters for axes limits
- ``ContactMapFigure`` and ``ContacctMapChordFigure`` improved to better space marker size
- Typos corrected in documentation
- ``THREE_TO_ONE`` and ``ONE_TO_THREE`` dictionaries modified to ``Enum`` objects
- ``SequeneFile.neff`` renamed to ``SequenceFile.meff``
Fixed
~~~~~
- ``A3mParser`` keyword argument mismatch sorted

[0.8.4]
-------
Expand Down
2 changes: 1 addition & 1 deletion CONTRIB.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Contributors
This is a list of people who have made contributions to ConKit.

- `Daniel Rigden <https://github.com/DanielRigden>`_
- `Adam Simpkin <https://github.com/hlasimpk`_
- `Adam Simpkin <https://github.com/hlasimpk>`_
- `Felix Simkovic <https://github.com/fsimkovic>`_
- `Jens Thomas <https://github.com/DanielRigden>`_
- `Stefan Seemayer <https://github.com/sseemayer>`_
13 changes: 5 additions & 8 deletions conkit/core/contactmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,16 +317,13 @@ def singletons(self):
:obj:`ContactMap <conkit.core.contactmap.ContactMap>`
"""
maxd = 2
contacts = np.asarray(self.as_list())
iterator = np.arange(self.ncontacts)
contacts = np.array(self.as_list(), dtype=np.int64)
removables = set()
for i in iterator:
for j in iterator[i + 1:]:
for i in np.arange(contacts.shape[0]):
for j in np.arange(i + 1, contacts.shape[0]):
dist = np.abs(contacts[i] - contacts[j])
if dist[0] <= maxd and dist[1] <= maxd:
removables.add(tuple(contacts[i]))
removables.add(tuple(contacts[j]))
if np.all(dist <= 2):
removables.update({tuple(contacts[i]), tuple(contacts[j])})
singletons = self.deepcopy()
[singletons.remove(i) for i in removables]
return singletons
Expand Down
127 changes: 83 additions & 44 deletions conkit/core/sequencefile.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@
from itertools import izip as zip

from conkit.core._entity import _Entity
from conkit.core.mappings import SequenceAlignmentState

from conkit.core.mappings import AminoAcidMapping, SequenceAlignmentState

class SequenceFile(_Entity):
"""A sequence file object representing a single sequence file
Expand Down Expand Up @@ -134,7 +133,7 @@ def diversity(self):
"""The diversity of an alignment defined by :math:`\sqrt{N}/L`.
``N`` equals the number of sequences in
thethe alignment and ``L`` the sequence length
the alignment and ``L`` the sequence length
"""
if self.empty:
Expand All @@ -151,6 +150,13 @@ def empty(self):

@property
def neff(self):
"""The number of effective sequences"""
import warnings
warnings.warn("This function will be deprecated in a future release! Use meff instead!")
return self.meff

@property
def meff(self):
"""The number of effective sequences"""
return int(sum(self.calculate_weights()))

Expand Down Expand Up @@ -220,7 +226,7 @@ def calculate_meff(self, identity=0.8):
See Also
--------
neff
meff
"""
import warnings
Expand All @@ -235,8 +241,23 @@ def calculate_neff_with_identity(self, identity):
neff, calculate_weights
"""
import warnings
warnings.warn("This function will be deprecated in a future release! Use calculate_meff_with_identity instead!")
return self.calculate_neff_with_identity(identity)

def calculate_meff_with_identity(self, identity):
"""Calculate the number of effective sequences with specified sequence identity
See Also
--------
neff, calculate_weights
"""
import warnings
warnings.warn("This function will be deprecated in a future release! Use calculate_meff_with_identity instead!")
return int(sum(self.calculate_weights(identity=identity)))


def calculate_weights(self, identity=0.8):
"""Calculate the sequence weights
Expand All @@ -261,43 +282,39 @@ def calculate_weights(self, identity=0.8):
Raises
------
MemoryError
Too many sequences in the alignment for Hamming distance calculation
RuntimeError
SciPy package not installed
ImportError
Cannot find SciPy package
ValueError
:obj:`SequenceFile <conkit.core.sequencefile.SequenceFile>` is not an alignment
ValueError
Sequence Identity needs to be between 0 and 1
"""
# http://stackoverflow.com/a/41090953/3046533
try:
import scipy.spatial
except ImportError:
raise RuntimeError('Cannot find SciPy package')

if not self.is_alignment:
raise ValueError('This is not an alignment')
raise ImportError('Cannot find SciPy package')

if identity < 0 or identity > 1:
raise ValueError("Sequence Identity needs to be between 0 and 1")

msa_mat = np.array(self.ascii_matrix)
n = msa_mat.shape[0] # size of the data
batch_size = min(n, 250) # size of the batches
hamming = np.zeros(n, dtype=np.int) # storage for data
# Separate the distance calculations into batches to avoid MemoryError exceptions.
# This answer was provided by a StackOverflow user. The corresponding suggestion by
# user @WarrenWeckesser: http://stackoverflow.com/a/41090953/3046533
num_full_batches, last_batch = divmod(n, batch_size)
batches = [batch_size] * num_full_batches
if last_batch != 0:
batches.append(last_batch)
for k, batch in enumerate(batches):
i = batch_size * k
dists = scipy.spatial.distance.cdist(msa_mat[i:i + batch], msa_mat, metric='hamming')
hamming[i:i + batch] = (dists < (1 - identity)).sum(axis=1)
return (1. / hamming).tolist()
if self.is_alignment:
msa_mat = np.array(self.ascii_matrix)
M = msa_mat.shape[0] # size of the data
batch_size = min(M, 250) # size of the batches
hamming = np.zeros(M, dtype=np.int) # storage for data
num_full_batches, last_batch = divmod(M, batch_size)
batches = [batch_size] * num_full_batches
if last_batch != 0:
batches.append(last_batch)
for k, batch in enumerate(batches):
i = batch_size * k
dists = scipy.spatial.distance.cdist(msa_mat[i:i + batch], msa_mat, metric='hamming')
hamming[i:i + batch] = (dists < (1 - identity)).sum(axis=1)
return (1. / hamming).tolist()
else:
raise ValueError('This is not an alignment')

def calculate_freq(self):
"""Calculate the gap frequency in each alignment column
Expand All @@ -308,8 +325,7 @@ def calculate_freq(self):
Returns
-------
list
A list containing the per alignment-column amino acid
frequency count
A list containing the per alignment-column amino acid frequency count
Raises
------
Expand All @@ -320,26 +336,24 @@ def calculate_freq(self):
"""
if self.is_alignment:
msa_mat = np.array(self.ascii_matrix)
aa_frequencies = np.where(msa_mat != 45, 1, 0)
aa_counts = np.sum(aa_frequencies, axis=0)
return (aa_counts / len(msa_mat[:, 0])).tolist()
msa_mat = np.array(self.encoded_matrix, dtype=np.int64)
return 1.0 - (msa_mat == AminoAcidMapping["X"].value).sum(axis=0) / self.nseq
else:
raise ValueError('This is not an alignment')

def filter(self, min_id=0.3, max_id=0.9, inplace=False):
"""Filter an alignment
"""Filter sequences from an alignment according to the minimum and maximum identity
between the sequences
Parameters
----------
min_id : float, optional
Minimum sequence identity
max_id : float, optional
Maximum sequence identity
inplace : bool, optional
Replace the saved order of sequences [default: False]
Returns
-------
obj
Expand All @@ -354,9 +368,9 @@ def filter(self, min_id=0.3, max_id=0.9, inplace=False):
ValueError
:obj:`SequenceFile <conkit.core.sequencefile.SequenceFile>` is not an alignment
ValueError
Minimum sequence Identity needs to be between 0 and 1
Minimum sequence identity needs to be between 0 and 1
ValueError
Maximum sequence Identity needs to be between 0 and 1
Maximum sequence identity needs to be between 0 and 1
"""
try:
Expand All @@ -372,18 +386,43 @@ def filter(self, min_id=0.3, max_id=0.9, inplace=False):
elif 0 > max_id > 1:
raise ValueError("Maximum sequence Identity needs to be between 0 and 1")

# Alignment to ASCII matrix
msa_mat = np.array(self.ascii_matrix)
# Find all throwable sequences
throw = set()
for i in np.arange(len(self)):
ident = 1 - scipy.spatial.distance.cdist([msa_mat[i]], msa_mat[i + 1:], metric='hamming')[0]
throw.update((1 + i + np.argwhere((ident < min_id) | (ident > max_id)).flatten()).tolist())
# Throw the previously selected sequences
sequence_file = self._inplace(inplace)
for i in reversed(list(throw)):
sequence_file.remove(self[i].id)
return sequence_file

def filter_gapped(self, min_prop=0.0, max_prop=0.9, inplace=True):
"""Filter all sequences a gap proportion greater than the limit
Parameters
----------
min_prop : float, optional
Minimum allowed gap proportion [default: 0.0]
max_prop : float, optional
Maximum allowed gap proportion [default: 0.9]
inplace : bool, optional
Replace the saved order of sequences [default: False]
Returns
-------
obj
The reference to the :obj:`SequenceFile <conkit.core.sequencefile.SequenceFile>`, regardless of inplace
"""
msa_mat = np.array(self.encoded_matrix)
throw = set()
for i in np.arange(len(self)):
gap_prop = (msa_mat[i] == AminoAcidMapping["X"].value).sum() / float(msa_mat[i].shape[0])
if gap_prop < min_prop or gap_prop > max_prop:
throw.add(self._child_list[i].id)
sequence_file = self._inplace(inplace)
for id_ in throw:
sequence_file.remove(id_)
return sequence_file

def sort(self, kword, reverse=False, inplace=False):
Expand Down
Loading

0 comments on commit 7d08471

Please sign in to comment.