Skip to content

Commit

Permalink
ENH: updates default gap filter, improves error reporting (#13)
Browse files Browse the repository at this point in the history
This now matches QIIME 1 by disabling filtering of gap positions. We don't have
a good idea of what the default should be, so going with matching QIIME 1
pending an exploration of reasonable defaults.

Fixes #12.
  • Loading branch information
gregcaporaso authored and jairideout committed Oct 1, 2016
1 parent fbba734 commit 3ad93c2
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 19 deletions.
24 changes: 22 additions & 2 deletions q2_alignment/_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def _compute_frequencies(alignment):
for c in alignment.iter_positions()]


def mask(alignment: skbio.TabularMSA, max_gap_frequency: float=0.05,
def mask(alignment: skbio.TabularMSA, max_gap_frequency: float=1.0,
min_conservation: float=0.40) -> skbio.TabularMSA:
# check that parameters are in range
if max_gap_frequency < 0.0 or max_gap_frequency > 1.0:
Expand All @@ -69,6 +69,10 @@ def mask(alignment: skbio.TabularMSA, max_gap_frequency: float=0.05,
if min_conservation < 0.0 or min_conservation > 1.0:
raise ValueError('min_conservation out of range [0.0, 1.0]: %f' %
min_conservation)
# check that input alignment is not empty
if alignment.shape.position == 0:
raise ValueError('Input alignment is empty (i.e., there are zero '
'sequences or positions in the input alignment).')
# compute frequencies of all alphabet characters
frequencies = _compute_frequencies(alignment)
# compute gap and conservation masks, and then combine them
Expand All @@ -79,4 +83,20 @@ def mask(alignment: skbio.TabularMSA, max_gap_frequency: float=0.05,
min_conservation)
combined_mask = gap_mask & conservation_mask
# apply the mask and return the resulting alignment
return _apply_mask(alignment, combined_mask)
result = _apply_mask(alignment, combined_mask)

if result.shape.position == 0:
num_input_positions = alignment.shape.position
frac_passed_gap = (gap_mask.sum() / num_input_positions)
str_passed_gap = '{percent:.2%}'.format(percent=frac_passed_gap)
frac_passed_conservation = \
(conservation_mask.sum() / num_input_positions)
str_passed_conservation = \
'{percent:.2%}'.format(percent=frac_passed_conservation)
raise ValueError("No alignment positions remain after filtering. The "
"filter thresholds will need to be relaxed. %s "
"of positions were retained by the gap filter, and "
"%s of positions were retained by the "
"conservation filter." %
(str_passed_gap, str_passed_conservation))
return result
52 changes: 35 additions & 17 deletions q2_alignment/test/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,22 +82,24 @@ def test_gap_boundaries(self):
skbio.DNA('A', metadata={'id': 'seq2', 'description': ''}),
skbio.DNA('A', metadata={'id': 'seq3', 'description': ''})]
)
empty_alignment = skbio.TabularMSA(
[skbio.DNA('', metadata={'id': 'seq1', 'description': ''}),
skbio.DNA('', metadata={'id': 'seq2', 'description': ''}),
skbio.DNA('', metadata={'id': 'seq3', 'description': ''})]
)

actual = mask(alignment1, max_gap_frequency=1.0, min_conservation=0.0)
self.assertEqual(actual, alignment1)

actual = mask(alignment2, max_gap_frequency=0.0, min_conservation=0.0)
self.assertEqual(actual, alignment2)

eps = np.finfo(float).eps
actual = mask(alignment1, max_gap_frequency=1.0-eps,
min_conservation=0.0)
self.assertEqual(actual, empty_alignment)
def test_error_on_empty_alignment_gap_boundary(self):
alignment1 = skbio.TabularMSA(
[skbio.DNA('A', metadata={'id': 'seq1', 'description': ''}),
skbio.DNA('-', metadata={'id': 'seq2', 'description': ''}),
skbio.DNA('-', metadata={'id': 'seq3', 'description': ''})]
)

self.assertRaisesRegex(ValueError,
" 0.00% of positions were retained by the gap",
mask, alignment1, max_gap_frequency=0.1,
min_conservation=0.0)

def test_conservation_boundaries(self):
alignment1 = skbio.TabularMSA(
Expand All @@ -108,20 +110,23 @@ def test_conservation_boundaries(self):
[skbio.DNA('-', metadata={'id': 'seq1', 'description': ''}),
skbio.DNA('-', metadata={'id': 'seq2', 'description': ''}),
skbio.DNA('-', metadata={'id': 'seq3', 'description': ''})])
empty_alignment = skbio.TabularMSA(
[skbio.DNA('', metadata={'id': 'seq1', 'description': ''}),
skbio.DNA('', metadata={'id': 'seq2', 'description': ''}),
skbio.DNA('', metadata={'id': 'seq3', 'description': ''})])
eps = np.finfo(float).eps

actual = mask(alignment1, max_gap_frequency=1.0, min_conservation=1.0)
self.assertEqual(actual, alignment1)

actual = mask(alignment2, max_gap_frequency=1.0, min_conservation=0.0)
self.assertEqual(actual, alignment2)

actual = mask(alignment2, max_gap_frequency=1.0,
min_conservation=0.0+eps)
self.assertEqual(actual, empty_alignment)
def test_error_on_empty_alignment_conservation_boundary(self):
alignment1 = skbio.TabularMSA(
[skbio.DNA('A', metadata={'id': 'seq1', 'description': ''}),
skbio.DNA('C', metadata={'id': 'seq2', 'description': ''}),
skbio.DNA('G', metadata={'id': 'seq3', 'description': ''})])

self.assertRaisesRegex(ValueError,
" 0.00% of positions were retained by the con",
mask, alignment1, max_gap_frequency=1.0,
min_conservation=0.5)

def test_invalid_gap_threshold(self):
alignment = skbio.TabularMSA(
Expand All @@ -146,3 +151,16 @@ def test_invalid_conservation_threshold(self):
mask(alignment, min_conservation=0.0 - eps)
with self.assertRaises(ValueError):
mask(alignment, min_conservation=1.0 + eps)

def test_empty_input(self):
alignment = skbio.TabularMSA(
[skbio.DNA('', metadata={'id': 'seq1', 'description': ''}),
skbio.DNA('', metadata={'id': 'seq2', 'description': ''}),
skbio.DNA('', metadata={'id': 'seq3', 'description': ''})]
)
with self.assertRaises(ValueError):
mask(alignment)

alignment = skbio.TabularMSA([])
with self.assertRaises(ValueError):
mask(alignment)

0 comments on commit 3ad93c2

Please sign in to comment.