Skip to content

Commit

Permalink
IMP: Implement validate API for TSVTaxonomyFormat. (#221)
Browse files Browse the repository at this point in the history
  • Loading branch information
David-Rod authored and thermokarst committed Oct 29, 2019
1 parent 22aa622 commit 8470cf2
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 16 deletions.
47 changes: 31 additions & 16 deletions q2_types/feature_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,38 +90,53 @@ class TSVTaxonomyFormat(model.TextFileFormat):
Optionally followed by other arbitrary columns.
This format supports comment lines starting with #, and blank lines. The
expected header must be the first non-comment, non-blank line. In addition
to the header, there must be at least one line of data.
This format supports blank lines. The expected header must be the first
non-blank line. In addition to the header, there must be at least one line
of data.
"""
HEADER = ['Feature ID', 'Taxon']

# NOTE(review): this span is a rendered diff hunk. Lines from the old `sniff`
# method and lines from the new `_check_n_records` method are interleaved
# here without +/- markers and with indentation stripped. The side each line
# belongs to is noted below where it can be inferred from the code itself;
# confirm against the repository before relying on it.
#
# Old entry point (presumably the removed side): reported problems by
# returning False instead of raising.
def sniff(self):
# New entry point: walks the file line by line and raises ValidationError
# describing the first problem found. `n` caps how many lines are read;
# n=None reads the whole file.
def _check_n_records(self, n=None):
with self.open() as fh:
data_lines = 0
data_line_count = 0
header = None
# Old loop (presumably removed): read until 10 data lines were seen or EOF.
while data_lines < 10:
line = fh.readline()

if line == '':
# EOF
break
elif line.lstrip(' ') == '\n':
# New loop source: every line when n is None, otherwise at most the first n
# lines. The cap counts each line read (blank lines and the header
# included), not only data records.
file_ = enumerate(fh) if n is None else zip(range(n), fh)

for i, line in file_:
# Tracks line number for error reporting
# (enumerate/range start at 0, so shift to a 1-based line number).
i = i + 1

# Only a run of spaces followed by '\n' counts as blank; tabs are not
# stripped, and a final line without a trailing newline never matches.
if line.lstrip(' ') == '\n':
# Blank line
continue

# The next two lines are the old and new variants of the same statement
# (rstrip vs strip of '\n'); both split the line into tab-separated cells.
cells = line.rstrip('\n').split('\t')
cells = line.strip('\n').split('\t')

# The first non-blank line is taken as the header: its first two cells must
# equal HEADER; any further columns are accepted as-is.
if header is None:
if cells[:2] != self.HEADER:
return False
raise ValidationError(
'%s must be the first two header values. The '
'first two header values provided are: %s (on '
'line %s).' % (self.HEADER, cells[:2], i))
header = cells
else:
# Every later non-blank line must have exactly as many cells as the header
# line that was read from the file.
if len(cells) != len(header):
return False
data_lines += 1
# NOTE(review): the comparison above uses len(header), but the message below
# reports len(self.HEADER), which is always 2. For a header wider than two
# columns the "expected" figure will not match what was actually compared.
# The test in this same commit asserts 'expected 2', so changing the message
# requires updating that test as well.
raise ValidationError(
'Number of values on line %s are not the same as '
'number of header values. Found %s values '
'(%s), expected %s.' % (i, len(cells), cells,
len(self.HEADER)))

data_line_count += 1

# After the loop: a file with no data lines among the lines read is rejected.
if data_line_count == 0:
raise ValidationError('No taxonomy records found, only blank '
'lines and/or a header row.')

# Old return value (presumably removed side): True only when a header and at
# least one data line were seen.
return header is not None and data_lines > 0
def _validate_(self, level):
    """Run the record check with the line cap that matches ``level``.

    ``'min'`` passes ``n=10`` to ``_check_n_records``; ``'max'`` passes
    ``n=None``. Any other ``level`` raises ``KeyError`` from the lookup.
    """
    line_cap_by_level = {'min': 10, 'max': None}
    line_cap = line_cap_by_level[level]
    self._check_n_records(n=line_cap)


TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Feature ID Taxon
seq1 k__Bacteria; p__Proteobacteria -1.0
seq2 k__Bacteria 1.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Feature ID Taxon Confidence Random
seq1 k__Foo; p__Bar -1.0
seq2 k__Foo; p__Baz -42.0
13 changes: 13 additions & 0 deletions q2_types/feature_data/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,19 @@ def test_tsv_taxonomy_directory_format(self):

format.validate()

def test_tsv_taxonomy_format_column_header_lengths(self):
    """Each listed fixture must fail validation with a row-width error.

    The expected message names line 2, reports 3 values found, and states
    that 2 were expected.
    """
    fixture_names = ('greater-column-length.tsv',
                     'greater-header-length.tsv')

    # Resolve every fixture path up front, before any file is validated.
    fixture_paths = []
    for fixture_name in fixture_names:
        relative_path = os.path.join('taxonomy', fixture_name)
        fixture_paths.append(self.get_data_path(relative_path))

    expected_message = 'line 2.*3 values.*expected 2'
    for fixture_path in fixture_paths:
        taxonomy_file = TSVTaxonomyFormat(fixture_path, mode='r')

        with self.assertRaisesRegex(ValidationError, expected_message):
            taxonomy_file.validate()


class TestDNAFASTAFormats(TestPluginBase):
package = 'q2_types.feature_data.tests'
Expand Down

0 comments on commit 8470cf2

Please sign in to comment.