diff --git a/CHANGELOG.rst b/CHANGELOG.rst index bffea9e2..e59e20e4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,6 +16,7 @@ Added - ``ContactMap.singletons`` returns a copy of the contact map with singleton contacts, i.e. ones without neighbors - ``Sequence.seq_encoded`` to allow turning a sequence into an encoded list - ``Sequence.encoded_matrix`` to give the entire alignment as encoded matrix +- ``SequenceFile.filter_gapped`` to filter sequences with a certain threshold of gaps Changed ~~~~~~~ - Changed API interface for ``conkit.plot`` in accordance to necessary changes for above @@ -23,6 +24,10 @@ Changed - ``ContactMapFigure`` and ``ContacctMapChordFigure`` improved to better space marker size - Typos corrected in documentation - ``THREE_TO_ONE`` and ``ONE_TO_THREE`` dictionaries modified to ``Enum`` objects +- ``SequeneFile.neff`` renamed to ``SequenceFile.meff`` +Fixed +~~~~~ +- ``A3mParser`` keyword argument mismatch sorted [0.8.4] ------- diff --git a/CONTRIB.rst b/CONTRIB.rst index 661c497a..3106c7af 100644 --- a/CONTRIB.rst +++ b/CONTRIB.rst @@ -5,7 +5,7 @@ Contributors This is a list of people who have made contributions to ConKit. - `Daniel Rigden `_ -- `Adam Simpkin `_ - `Felix Simkovic `_ - `Jens Thomas `_ - `Stefan Seemayer `_ diff --git a/conkit/core/contactmap.py b/conkit/core/contactmap.py index 0c1b56c2..8b30eba6 100644 --- a/conkit/core/contactmap.py +++ b/conkit/core/contactmap.py @@ -317,16 +317,13 @@ def singletons(self): :obj:`ContactMap ` """ - maxd = 2 - contacts = np.asarray(self.as_list()) - iterator = np.arange(self.ncontacts) + contacts = np.array(self.as_list(), dtype=np.int64) removables = set() - for i in iterator: - for j in iterator[i + 1:]: + for i in np.arange(contacts.shape[0]): + for j in np.arange(i + 1, contacts.shape[0]): dist = np.abs(contacts[i] - contacts[j]) - if dist[0] <= maxd and dist[1] <= maxd: - removables.add(tuple(contacts[i])) - removables.add(tuple(contacts[j])) + if np.all(dist <= 2): + removables.update({tuple(contacts[i]), tuple(contacts[j])}) singletons = self.deepcopy() [singletons.remove(i) for i in removables] return singletons diff --git a/conkit/core/sequencefile.py b/conkit/core/sequencefile.py index 33f734d1..0df5d242 100644 --- a/conkit/core/sequencefile.py +++ b/conkit/core/sequencefile.py @@ -45,8 +45,7 @@ from itertools import izip as zip from conkit.core._entity import _Entity -from conkit.core.mappings import SequenceAlignmentState - +from conkit.core.mappings import AminoAcidMapping, SequenceAlignmentState class SequenceFile(_Entity): """A sequence file object representing a single sequence file @@ -134,7 +133,7 @@ def diversity(self): """The diversity of an alignment defined by :math:`\sqrt{N}/L`. ``N`` equals the number of sequences in - thethe alignment and ``L`` the sequence length + the alignment and ``L`` the sequence length """ if self.empty: @@ -151,6 +150,13 @@ def empty(self): @property def neff(self): + """The number of effective sequences""" + import warnings + warnings.warn("This function will be deprecated in a future release! Use meff instead!") + return self.meff + + @property + def meff(self): """The number of effective sequences""" return int(sum(self.calculate_weights())) @@ -220,7 +226,7 @@ def calculate_meff(self, identity=0.8): See Also -------- - neff + meff """ import warnings @@ -235,8 +241,23 @@ def calculate_neff_with_identity(self, identity): neff, calculate_weights """ + import warnings + warnings.warn("This function will be deprecated in a future release! Use calculate_meff_with_identity instead!") + return self.calculate_neff_with_identity(identity) + + def calculate_meff_with_identity(self, identity): + """Calculate the number of effective sequences with specified sequence identity + + See Also + -------- + neff, calculate_weights + + """ + import warnings + warnings.warn("This function will be deprecated in a future release! Use calculate_meff_with_identity instead!") return int(sum(self.calculate_weights(identity=identity))) + def calculate_weights(self, identity=0.8): """Calculate the sequence weights @@ -261,43 +282,39 @@ def calculate_weights(self, identity=0.8): Raises ------ - MemoryError - Too many sequences in the alignment for Hamming distance calculation - RuntimeError - SciPy package not installed + ImportError + Cannot find SciPy package ValueError :obj:`SequenceFile ` is not an alignment ValueError Sequence Identity needs to be between 0 and 1 """ + # http://stackoverflow.com/a/41090953/3046533 try: import scipy.spatial except ImportError: - raise RuntimeError('Cannot find SciPy package') - - if not self.is_alignment: - raise ValueError('This is not an alignment') + raise ImportError('Cannot find SciPy package') if identity < 0 or identity > 1: raise ValueError("Sequence Identity needs to be between 0 and 1") - - msa_mat = np.array(self.ascii_matrix) - n = msa_mat.shape[0] # size of the data - batch_size = min(n, 250) # size of the batches - hamming = np.zeros(n, dtype=np.int) # storage for data - # Separate the distance calculations into batches to avoid MemoryError exceptions. - # This answer was provided by a StackOverflow user. The corresponding suggestion by - # user @WarrenWeckesser: http://stackoverflow.com/a/41090953/3046533 - num_full_batches, last_batch = divmod(n, batch_size) - batches = [batch_size] * num_full_batches - if last_batch != 0: - batches.append(last_batch) - for k, batch in enumerate(batches): - i = batch_size * k - dists = scipy.spatial.distance.cdist(msa_mat[i:i + batch], msa_mat, metric='hamming') - hamming[i:i + batch] = (dists < (1 - identity)).sum(axis=1) - return (1. / hamming).tolist() + + if self.is_alignment: + msa_mat = np.array(self.ascii_matrix) + M = msa_mat.shape[0] # size of the data + batch_size = min(M, 250) # size of the batches + hamming = np.zeros(M, dtype=np.int) # storage for data + num_full_batches, last_batch = divmod(M, batch_size) + batches = [batch_size] * num_full_batches + if last_batch != 0: + batches.append(last_batch) + for k, batch in enumerate(batches): + i = batch_size * k + dists = scipy.spatial.distance.cdist(msa_mat[i:i + batch], msa_mat, metric='hamming') + hamming[i:i + batch] = (dists < (1 - identity)).sum(axis=1) + return (1. / hamming).tolist() + else: + raise ValueError('This is not an alignment') def calculate_freq(self): """Calculate the gap frequency in each alignment column @@ -308,8 +325,7 @@ def calculate_freq(self): Returns ------- list - A list containing the per alignment-column amino acid - frequency count + A list containing the per alignment-column amino acid frequency count Raises ------ @@ -320,26 +336,24 @@ def calculate_freq(self): """ if self.is_alignment: - msa_mat = np.array(self.ascii_matrix) - aa_frequencies = np.where(msa_mat != 45, 1, 0) - aa_counts = np.sum(aa_frequencies, axis=0) - return (aa_counts / len(msa_mat[:, 0])).tolist() + msa_mat = np.array(self.encoded_matrix, dtype=np.int64) + return 1.0 - (msa_mat == AminoAcidMapping["X"].value).sum(axis=0) / self.nseq else: raise ValueError('This is not an alignment') def filter(self, min_id=0.3, max_id=0.9, inplace=False): - """Filter an alignment + """Filter sequences from an alignment according to the minimum and maximum identity + between the sequences Parameters ---------- min_id : float, optional - + Minimum sequence identity max_id : float, optional - + Maximum sequence identity inplace : bool, optional Replace the saved order of sequences [default: False] - Returns ------- obj @@ -354,9 +368,9 @@ def filter(self, min_id=0.3, max_id=0.9, inplace=False): ValueError :obj:`SequenceFile ` is not an alignment ValueError - Minimum sequence Identity needs to be between 0 and 1 + Minimum sequence identity needs to be between 0 and 1 ValueError - Maximum sequence Identity needs to be between 0 and 1 + Maximum sequence identity needs to be between 0 and 1 """ try: @@ -372,18 +386,43 @@ def filter(self, min_id=0.3, max_id=0.9, inplace=False): elif 0 > max_id > 1: raise ValueError("Maximum sequence Identity needs to be between 0 and 1") - # Alignment to ASCII matrix msa_mat = np.array(self.ascii_matrix) - # Find all throwable sequences throw = set() for i in np.arange(len(self)): ident = 1 - scipy.spatial.distance.cdist([msa_mat[i]], msa_mat[i + 1:], metric='hamming')[0] throw.update((1 + i + np.argwhere((ident < min_id) | (ident > max_id)).flatten()).tolist()) - # Throw the previously selected sequences sequence_file = self._inplace(inplace) for i in reversed(list(throw)): sequence_file.remove(self[i].id) + return sequence_file + def filter_gapped(self, min_prop=0.0, max_prop=0.9, inplace=True): + """Filter all sequences a gap proportion greater than the limit + + Parameters + ---------- + min_prop : float, optional + Minimum allowed gap proportion [default: 0.0] + max_prop : float, optional + Maximum allowed gap proportion [default: 0.9] + inplace : bool, optional + Replace the saved order of sequences [default: False] + + Returns + ------- + obj + The reference to the :obj:`SequenceFile `, regardless of inplace + + """ + msa_mat = np.array(self.encoded_matrix) + throw = set() + for i in np.arange(len(self)): + gap_prop = (msa_mat[i] == AminoAcidMapping["X"].value).sum() / float(msa_mat[i].shape[0]) + if gap_prop < min_prop or gap_prop > max_prop: + throw.add(self._child_list[i].id) + sequence_file = self._inplace(inplace) + for id_ in throw: + sequence_file.remove(id_) return sequence_file def sort(self, kword, reverse=False, inplace=False): diff --git a/conkit/core/tests/test_sequencefile.py b/conkit/core/tests/test_sequencefile.py index 93f3bbfe..014ced92 100644 --- a/conkit/core/tests/test_sequencefile.py +++ b/conkit/core/tests/test_sequencefile.py @@ -25,23 +25,23 @@ def skipUnless(condition): class TestSequenceFile(unittest.TestCase): def test_ascii_matrix_1(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', '-CC-C-'), Sequence('doe', 'BBBBBB')]: + for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', '-CC-C-'), Sequence('doe', 'DDDDDD')]: sequence_file.add(seq) matrix = sequence_file.ascii_matrix self.assertEqual([65, 65, 65, 65, 65, 65], list(matrix)[0]) self.assertEqual([45, 67, 67, 45, 67, 45], list(matrix)[1]) - self.assertEqual([66, 66, 66, 66, 66, 66], list(matrix)[2]) + self.assertEqual([68, 68, 68, 68, 68, 68], list(matrix)[2]) def test_is_alignment_1(self): sequence_file = SequenceFile('test') sequence_file.add(Sequence('foo', 'AAAAA')) - sequence_file.add(Sequence('bar', 'BBBBB')) + sequence_file.add(Sequence('bar', 'CCCCC')) self.assertTrue(sequence_file.is_alignment) def test_is_alignment_2(self): sequence_file = SequenceFile('test') sequence_file.add(Sequence('foo', 'AAAAA')) - sequence_file.add(Sequence('bar', 'BBBB')) + sequence_file.add(Sequence('bar', 'CCCC')) self.assertFalse(sequence_file.is_alignment) def test_empty_1(self): @@ -65,7 +65,7 @@ def test_nseq_2(self): def test_nseq_3(self): sequence_file = SequenceFile('test') sequence_file.add(Sequence('foo', 'AAAAA')) - sequence_file.add(Sequence('bar', 'BBBBB')) + sequence_file.add(Sequence('bar', 'CCCCC')) self.assertEqual(2, sequence_file.nseq) def test_remark_1(self): @@ -113,7 +113,7 @@ def test_top_sequence_2(self): def test_top_sequence_3(self): sequence_file = SequenceFile('test') sequence1 = Sequence('foo', 'AAAAA') - sequence2 = Sequence('bar', 'BBBBB') + sequence2 = Sequence('bar', 'CCCCC') sequence_file.add(sequence1) sequence_file.add(sequence2) self.assertEqual(sequence1, sequence_file.top_sequence) @@ -138,7 +138,7 @@ def test_calculate_weights_2(self): Sequence('foo', 'AAAAAAA'), Sequence('bar', 'AAAAAAA'), Sequence('cho', 'AAAAAAA'), - Sequence('baz', 'BBBBBBB') + Sequence('baz', 'CCCCCCC') ]: sequence_file.add(s) weights = sequence_file.calculate_weights(identity=0.7) @@ -150,8 +150,8 @@ def test_calculate_weights_3(self): for s in [ Sequence('foo', 'AAAAAAA'), Sequence('bar', 'A-AABA-'), - Sequence('cho', 'B-BAA--'), - Sequence('baz', 'BBBBBBB') + Sequence('cho', 'C-CAA--'), + Sequence('baz', 'CCCCCCC') ]: sequence_file.add(s) weights = sequence_file.calculate_weights(identity=0.7) @@ -163,8 +163,8 @@ def test_calculate_weights_4(self): for s in [ Sequence('foo', 'AAAAAAA'), Sequence('bar', 'AAAABA-'), - Sequence('cho', 'B-BAA--'), - Sequence('baz', 'BBBBBBB') + Sequence('cho', 'C-CAA--'), + Sequence('baz', 'CCCCCCC') ]: sequence_file.add(s) weights = sequence_file.calculate_weights(identity=0.7) @@ -175,9 +175,9 @@ def test_calculate_weights_5(self): sequence_file = SequenceFile('test') for s in [ Sequence('foo', 'AAAAAAA'), - Sequence('bar', 'AA-ABA-'), - Sequence('cho', 'B-BAA--'), - Sequence('baz', 'BBBBBBB') + Sequence('bar', 'AA-ACA-'), + Sequence('cho', 'C-CAA--'), + Sequence('baz', 'CCCCCCC') ]: sequence_file.add(s) weights = sequence_file.calculate_weights(identity=0.6) @@ -188,10 +188,10 @@ def test_calculate_weights_6(self): sequence_file = SequenceFile('test') for s in [ Sequence('foo', 'AAAAAAA'), - Sequence('bar', 'AA-ABA-'), - Sequence('cho', 'AAACBAA'), - Sequence('doo', 'B-BAA--'), - Sequence('miu', 'BBBBBBB'), + Sequence('bar', 'AA-ACA-'), + Sequence('cho', 'AAADCAA'), + Sequence('doo', 'C-CAA--'), + Sequence('miu', 'CCCCCCC'), Sequence('nop', 'AAAAAAB') ]: sequence_file.add(s) @@ -203,14 +203,14 @@ def test_calculate_weights_6(self): sequence_file = SequenceFile('test') for s in [ Sequence('foo', 'AAAAAAA'), - Sequence('bar', 'AA-ABA-'), - Sequence('cho', 'AAACBAA'), - Sequence('doo', 'B-BAA--'), - Sequence('miu', 'BBBBBBB'), + Sequence('bar', 'AA-ACA-'), + Sequence('cho', 'AAADCAA'), + Sequence('doo', 'C-CAA--'), + Sequence('miu', 'CCCCCCC'), Sequence('nop', 'AAAAAAB') ]: sequence_file.add(s) - self.assertEqual(5, sequence_file.neff) + self.assertEqual(5, sequence_file.meff) def test_calculate_freq_1(self): sequence_file = SequenceFile('test') @@ -235,74 +235,74 @@ def test_calculate_freq_3(self): def test_sort_1(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'BBBBB'), Sequence('doe', 'CCCCC')]: + for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'CCCCC'), Sequence('doe', 'DDDDD')]: sequence_file.add(seq) sequence_file_sorted = sequence_file.sort('id', reverse=False, inplace=False) self.assertEqual(['bar', 'doe', 'foo'], [s.id for s in sequence_file_sorted]) - self.assertEqual(['BBBBB', 'CCCCC', 'AAAAA'], [s.seq for s in sequence_file_sorted]) + self.assertEqual(['CCCCC', 'DDDDD', 'AAAAA'], [s.seq for s in sequence_file_sorted]) self.assertNotEqual(sequence_file, sequence_file_sorted) def test_sort_2(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'BBBBB'), Sequence('doe', 'CCCCC')]: + for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'CCCCC'), Sequence('doe', 'DDDDD')]: sequence_file.add(seq) sequence_file_sorted = sequence_file.sort('id', reverse=True, inplace=False) self.assertEqual(['foo', 'doe', 'bar'], [s.id for s in sequence_file_sorted]) - self.assertEqual(['AAAAA', 'CCCCC', 'BBBBB'], [s.seq for s in sequence_file_sorted]) + self.assertEqual(['AAAAA', 'DDDDD', 'CCCCC'], [s.seq for s in sequence_file_sorted]) self.assertNotEqual(sequence_file, sequence_file_sorted) def test_sort_3(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'BBBBB'), Sequence('doe', 'CCCCC')]: + for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'CCCCC'), Sequence('doe', 'DDDDD')]: sequence_file.add(seq) sequence_file_sorted = sequence_file.sort('seq', reverse=False, inplace=True) self.assertEqual(['foo', 'bar', 'doe'], [s.id for s in sequence_file_sorted]) - self.assertEqual(['AAAAA', 'BBBBB', 'CCCCC'], [s.seq for s in sequence_file_sorted]) + self.assertEqual(['AAAAA', 'CCCCC', 'DDDDD'], [s.seq for s in sequence_file_sorted]) self.assertEqual(sequence_file, sequence_file_sorted) def test_sort_4(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'BBBBB'), Sequence('doe', 'CCCCC')]: + for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'DDDDD'), Sequence('doe', 'CCCCC')]: sequence_file.add(seq) sequence_file_sorted = sequence_file.sort('seq', reverse=True, inplace=True) - self.assertEqual(['doe', 'bar', 'foo'], [s.id for s in sequence_file_sorted]) - self.assertEqual(['CCCCC', 'BBBBB', 'AAAAA'], [s.seq for s in sequence_file_sorted]) + self.assertEqual(['bar', 'doe', 'foo'], [s.id for s in sequence_file_sorted]) + self.assertEqual(['DDDDD', 'CCCCC', 'AAAAA'], [s.seq for s in sequence_file_sorted]) self.assertEqual(sequence_file, sequence_file_sorted) def test_trim_1(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'BBBBB'), Sequence('doe', 'CCCCC')]: + for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'CCCCC'), Sequence('doe', 'DDDDD')]: sequence_file.add(seq) sequence_file_trimmed = sequence_file.trim(1, 5) self.assertEqual(['foo', 'bar', 'doe'], [s.id for s in sequence_file_trimmed]) - self.assertEqual(['AAAAA', 'BBBBB', 'CCCCC'], [s.seq for s in sequence_file_trimmed]) + self.assertEqual(['AAAAA', 'CCCCC', 'DDDDD'], [s.seq for s in sequence_file_trimmed]) self.assertNotEqual(sequence_file, sequence_file_trimmed) def test_trim_2(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'BBBBB'), Sequence('doe', 'CCCCC')]: + for seq in [Sequence('foo', 'AAAAA'), Sequence('bar', 'CCCCC'), Sequence('doe', 'DDDDD')]: sequence_file.add(seq) sequence_file_trimmed = sequence_file.trim(3, 5) self.assertEqual(['foo', 'bar', 'doe'], [s.id for s in sequence_file_trimmed]) - self.assertEqual(['AAA', 'BBB', 'CCC'], [s.seq for s in sequence_file_trimmed]) + self.assertEqual(['AAA', 'CCC', 'DDD'], [s.seq for s in sequence_file_trimmed]) self.assertNotEqual(sequence_file, sequence_file_trimmed) def test_trim_3(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'ABCDE'), Sequence('bar', 'BCDEF'), Sequence('doe', 'CDEFG')]: + for seq in [Sequence('foo', 'ACDEF'), Sequence('bar', 'CDEFG'), Sequence('doe', 'DEFGH')]: sequence_file.add(seq) sequence_file_trimmed = sequence_file.trim(1, 3) self.assertEqual(['foo', 'bar', 'doe'], [s.id for s in sequence_file_trimmed]) - self.assertEqual(['ABC', 'BCD', 'CDE'], [s.seq for s in sequence_file_trimmed]) + self.assertEqual(['ACD', 'CDE', 'DEF'], [s.seq for s in sequence_file_trimmed]) self.assertNotEqual(sequence_file, sequence_file_trimmed) def test_trim_4(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'ABCDE'), Sequence('bar', 'BCDEF'), Sequence('doe', 'CDEFG')]: + for seq in [Sequence('foo', 'ACDEF'), Sequence('bar', 'CDEFG'), Sequence('doe', 'DEFGH')]: sequence_file.add(seq) sequence_file_trimmed = sequence_file.trim(2, 3) self.assertEqual(['foo', 'bar', 'doe'], [s.id for s in sequence_file_trimmed]) - self.assertEqual(['BC', 'CD', 'DE'], [s.seq for s in sequence_file_trimmed]) + self.assertEqual(['CD', 'DE', 'EF'], [s.seq for s in sequence_file_trimmed]) self.assertNotEqual(sequence_file, sequence_file_trimmed) @skipUnless(SCIPY) @@ -316,7 +316,7 @@ def test_filter_1(self): @skipUnless(SCIPY) def test_filter_2(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'AAAABB'), Sequence('doe', 'AAAAAA')]: + for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'AAAACC'), Sequence('doe', 'AAAAAA')]: sequence_file.add(seq) filtered = sequence_file.filter(min_id=0.0, max_id=0.9) self.assertEqual(['foo', 'bar'], [s.id for s in filtered]) @@ -324,7 +324,7 @@ def test_filter_2(self): @skipUnless(SCIPY) def test_filter_3(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'AAAAAA'), Sequence('doe', 'BBBBBB')]: + for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'AAAAAA'), Sequence('doe', 'CCCCCC')]: sequence_file.add(seq) filtered = sequence_file.filter(min_id=0.0, max_id=0.9) self.assertEqual(['foo', 'doe'], [s.id for s in filtered]) @@ -332,7 +332,7 @@ def test_filter_3(self): @skipUnless(SCIPY) def test_filter_4(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'CCCCCC'), Sequence('doe', 'BBBBBB')]: + for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'CCCCCC'), Sequence('doe', 'DDDDDD')]: sequence_file.add(seq) filtered = sequence_file.filter(min_id=0.0, max_id=0.9) self.assertEqual(['foo', 'bar', 'doe'], [s.id for s in filtered]) @@ -340,14 +340,52 @@ def test_filter_4(self): @skipUnless(SCIPY) def test_filter_5(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'CCCCCC'), Sequence('doe', 'BBBBBB')]: + for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'CCCCCC'), Sequence('doe', 'DDDDDD')]: sequence_file.add(seq) filtered = sequence_file.filter(min_id=0.1, max_id=0.9) self.assertEqual(['foo'], [s.id for s in filtered]) + @skipUnless(SCIPY) + def test_filter_6(self): + sequence_file = SequenceFile('test') + for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'AACCCC'), Sequence('doe', 'DDDDDD')]: + sequence_file.add(seq) + filtered = sequence_file.filter(min_id=0.1, max_id=0.9) + self.assertEqual(['foo', 'bar'], [s.id for s in filtered]) + + def test_filter_gapped_1(self): + sequence_file = SequenceFile('test') + sequence_file.add(Sequence('foo', '-----')) + filtered = sequence_file.filter_gapped(min_prop=0.0, max_prop=1.0) + self.assertEqual(['foo'], [s.id for s in filtered]) + + def test_filter_gapped_2(self): + sequence_file = SequenceFile('test') + sequence_file.add(Sequence('foo', 'AAA--')) + filtered = sequence_file.filter_gapped(min_prop=0.0, max_prop=0.6) + self.assertEqual(['foo'], [s.id for s in filtered]) + + def test_filter_gapped_3(self): + sequence_file = SequenceFile('test') + sequence_file.add(Sequence('foo', 'AAA--')) + filtered = sequence_file.filter_gapped(min_prop=0.0, max_prop=0.599999999) + self.assertEqual(['foo'], [s.id for s in filtered]) + + def test_filter_gapped_4(self): + sequence_file = SequenceFile('test') + sequence_file.add(Sequence('foo', 'AAAA-')) + filtered = sequence_file.filter_gapped(min_prop=0.2, max_prop=1.0) + self.assertEqual(['foo'], [s.id for s in filtered]) + + def test_filter_gapped_5(self): + sequence_file = SequenceFile('test') + sequence_file.add(Sequence('foo', 'AAAAA')) + filtered = sequence_file.filter_gapped(min_prop=0.199999999, max_prop=1.0) + self.assertEqual([], [s.id for s in filtered]) + def test_diversity_1(self): sequence_file = SequenceFile('test') - for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'CCCCCC'), Sequence('doe', 'BBBBBB')]: + for seq in [Sequence('foo', 'AAAAAA'), Sequence('bar', 'CCCCCC'), Sequence('doe', 'DDDDDD')]: sequence_file.add(seq) self.assertEqual(0.289, sequence_file.diversity) diff --git a/conkit/io/a3m.py b/conkit/io/a3m.py index 5040f63f..f5edbbb2 100644 --- a/conkit/io/a3m.py +++ b/conkit/io/a3m.py @@ -56,7 +56,7 @@ class A3mParser(SequenceFileParser): def __init__(self): super(A3mParser, self).__init__() - def read(self, f_handle, f_id='a3m', remove_insert=True): + def read(self, f_handle, f_id='a3m', remove_inserts=True): """Read a sequence file Parameters @@ -65,7 +65,7 @@ def read(self, f_handle, f_id='a3m', remove_insert=True): Open file handle [read permissions] f_id : str, optional Unique sequence file identifier - remove_insert : bool, optional + remove_inserts : bool, optional Remove insert states [default: True] Returns @@ -88,13 +88,10 @@ def read(self, f_handle, f_id='a3m', remove_insert=True): elif line.startswith('>'): break - # Read the sequence record(s) and store them while True: if not line.startswith('>'): raise ValueError("Fasta record needs to start with '>'") - - id = line[1:] # Header without '>' - + id = line[1:] chunks = [] line = f_handle.readline().rstrip() while True: @@ -104,16 +101,10 @@ def read(self, f_handle, f_id='a3m', remove_insert=True): break chunks.append(line) line = f_handle.readline().rstrip() - seq_string = "".join(chunks) # Sequence from chunks - - # Remove insert states - if remove_insert: - seq_string = self._remove_insert(seq_string) - - # Create the sequence record instance + seq_string = "".join(chunks) + if remove_inserts: + seq_string = self._remove_inserts(seq_string) sequence_entry = Sequence(id, seq_string) - - # Store the sequence in the file try: sequence_file.add(sequence_entry) except ValueError: @@ -125,12 +116,10 @@ def read(self, f_handle, f_id='a3m', remove_insert=True): break sequence_entry.id = new_id sequence_file.add(sequence_entry) - if not line: break - # Match the insert states of the sequence - if not remove_insert: + if not remove_inserts: self._adjust_insert(sequence_file) return sequence_file @@ -173,7 +162,7 @@ def pad(chunk, length, pad_char='-'): in zip(seq, insert_max_lengths)) return - def _remove_insert(self, seq): + def _remove_inserts(self, seq): """Remove insert states""" return "".join([char for char in seq if not char.islower()]) diff --git a/conkit/io/tests/test_a3m.py b/conkit/io/tests/test_a3m.py index 340c1d15..01237186 100644 --- a/conkit/io/tests/test_a3m.py +++ b/conkit/io/tests/test_a3m.py @@ -37,7 +37,7 @@ def test_read_1(self): f_name = create_tmp_f(content=msa) parser = A3mParser() with open(f_name, 'r') as f_in: - sequence_file = parser.read(f_in, remove_insert=True) # <------------ + sequence_file = parser.read(f_in, remove_inserts=True) # <------------ for i, sequence_entry in enumerate(sequence_file): if i == 0: self.assertEqual('d1a1x__ b.63.1.1 (-) p13-MTCP1 {Human (Homo sapiens)}', @@ -116,7 +116,7 @@ def test_read_2(self): f_name = create_tmp_f(content=msa) parser = A3mParser() with open(f_name, 'r') as f_in: - sequence_file = parser.read(f_in, remove_insert=False) # <------------ + sequence_file = parser.read(f_in, remove_inserts=False) # <------------ for i, sequence_entry in enumerate(sequence_file): if i == 0: self.assertEqual('d1a1x__ b.63.1.1 (-) p13-MTCP1 {Human (Homo sapiens)}', @@ -185,7 +185,7 @@ def test_read_3(self): f_name = create_tmp_f(content=msa) parser = A3mParser() with open(f_name, 'r') as f_in: - sequence_file = parser.read(f_in, remove_insert=False) # <------------ + sequence_file = parser.read(f_in, remove_inserts=False) # <------------ for i, sequence_entry in enumerate(sequence_file): if i == 0: self.assertEqual('d1a1x__ b.63.1.1 (-) p13-MTCP1 {Human (Homo sapiens)}', @@ -240,7 +240,7 @@ def test_write_1(self): f_name_out = create_tmp_f() parser = A3mParser() with open(f_name_in, 'r') as f_in, open(f_name_out, 'w') as f_out: - sequence_file = parser.read(f_in, remove_insert=True) + sequence_file = parser.read(f_in, remove_inserts=True) parser.write(f_out, sequence_file) ref = [ ">d1a1x__ b.63.1.1 (-) p13-MTCP1 {Human (Homo sapiens)}", @@ -300,7 +300,7 @@ def test_write_2(self): f_name_out = create_tmp_f() parser = A3mParser() with open(f_name_in, 'r') as f_in, open(f_name_out, 'w') as f_out: - sequence_file = parser.read(f_in, remove_insert=False) + sequence_file = parser.read(f_in, remove_inserts=False) parser.write(f_out, sequence_file) ref = [ ">d1a1x__ b.63.1.1 (-) p13-MTCP1 {Human (Homo sapiens)}",