Skip to content

Commit

Permalink
Merge 608dfc6 into cf45830
Browse files Browse the repository at this point in the history
  • Loading branch information
David-Rod committed Aug 9, 2019
2 parents cf45830 + 608dfc6 commit dd06ee8
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 10 deletions.
41 changes: 31 additions & 10 deletions q2_types/feature_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,38 +90,59 @@ class TSVTaxonomyFormat(model.TextFileFormat):
Optionally followed by other arbitrary columns.
This format supports comment lines starting with #, and blank lines. The
expected header must be the first non-comment, non-blank line. In addition
to the header, there must be at least one line of data.
This format supports blank lines. The expected header must be the first
non-blank line. In addition to the header, there must be at least one line
of data.
"""
HEADER = ['Feature ID', 'Taxon']

# NOTE(review): this hunk appears to interleave lines removed from the old
# `sniff` with lines added for the new `_check_n_records`; the diff +/- markers
# and the indentation are not present here, so confirm against the repository
# before reading it as a single function.
def sniff(self):
def _check_n_records(self, n=None):
with self.open() as fh:
data_lines = 0
header = None
while data_lines < 10:
line = fh.readline()

# With n=None every line is enumerated; otherwise zip(range(n), fh) yields at
# most the first n lines of the file, blank lines included.
file_ = enumerate(fh) if n is None else zip(range(n), fh)

for i, line in file_:
# Tracks line count for error reporting
i = i + 1
# NOTE(review): '' is what readline() returns at EOF; iterating the file
# handle presumably never produces it, so this branch looks like a leftover
# from the readline() loop -- confirm.
if line == '':
# EOF
break
# Only leading spaces are stripped before comparing to '\n', so a line
# containing tabs, or a space-only last line with no newline, is not skipped
# as blank.
elif line.lstrip(' ') == '\n':
# Blank line
continue

cells = line.rstrip('\n').split('\t')
cells = line.strip('\n').split('\t')

# While header is still None, the current non-blank line is checked as the
# header: its first two cells must equal HEADER.
if header is None:
if cells[:2] != self.HEADER:
return False
raise ValidationError("['Feature ID' and 'Taxon'] "
"must be the first two header "
"values to be valid.\n\n The "
"first two header values "
"provided are: {}.\nIssue on "
"line {}"
.format(cells[:2], i))
header = cells
else:
# Every later non-blank line must split into as many tab-separated cells as
# the header did.
if len(cells) != len(header):
return False
raise ValidationError("Number of columns are not the "
"same as number of headers in "
"the file. \nHeader values: "
"{} \nColumn values: {}\nIssue "
"on line: {}"
.format(header, cells[:], i))

data_lines += 1

return header is not None and data_lines > 0
# data_lines counts the non-header rows that passed the column-count check
# (presumably tested once the loop has finished -- indentation is lost here).
if data_lines == 0:
raise ValidationError("No feature records found, only blank "
"lines and/or a header row.")

def _validate_(self, level):
self._check_n_records(n={'min': 10, 'max': None}[level])


TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Feature ID Taxon
seq1 k__Bacteria; p__Proteobacteria -1.0
seq2 k__Bacteria 1.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Feature ID Taxon Confidence Random
seq1 k__Foo; p__Bar -1.0
seq2 k__Foo; p__Baz -42.0
12 changes: 12 additions & 0 deletions q2_types/feature_data/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,18 @@ def test_tsv_taxonomy_directory_format(self):

format.validate()

def test_tsv_taxonomy_format_column_header_lengths(self):
    """Validation fails when the fixture's column counts are inconsistent.

    Each fixture under ``taxonomy/`` is opened as a ``TSVTaxonomyFormat``
    and ``validate()`` is expected to raise ``ValidationError`` with a
    message matching ``'Number of columns'``.
    """
    filenames = ['greater-column-length.tsv', 'greater-header-length.tsv']

    for filename in filenames:
        # subTest names the failing fixture in the report and lets the
        # remaining fixture still be checked after a failure.
        with self.subTest(filename=filename):
            filepath = self.get_data_path(
                os.path.join('taxonomy', filename))
            # Named `fmt` so the builtin `format` is not shadowed.
            fmt = TSVTaxonomyFormat(filepath, mode='r')

            with self.assertRaisesRegex(ValidationError,
                                        'Number of columns'):
                fmt.validate()


class TestDNAFASTAFormats(TestPluginBase):
package = 'q2_types.feature_data.tests'
Expand Down

0 comments on commit dd06ee8

Please sign in to comment.