diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 4e38034e..5012388e 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -90,20 +90,23 @@ class TSVTaxonomyFormat(model.TextFileFormat): Optionally followed by other arbitrary columns. - This format supports comment lines starting with #, and blank lines. The - expected header must be the first non-comment, non-blank line. In addition - to the header, there must be at least one line of data. + This format supports blank lines. The expected header must be the first + non-blank line. In addition to the header, there must be at least one line + of data. """ HEADER = ['Feature ID', 'Taxon'] - def sniff(self): + def _check_n_records(self, n=None): with self.open() as fh: data_lines = 0 header = None - while data_lines < 10: - line = fh.readline() + file_ = enumerate(fh) if n is None else zip(range(n), fh) + + for i, line in file_: + # Tracks line count for error reporting + i = i + 1 if line == '': # EOF break @@ -111,17 +114,35 @@ def sniff(self): # Blank line continue - cells = line.rstrip('\n').split('\t') + cells = line.strip('\n').split('\t') + if header is None: if cells[:2] != self.HEADER: - return False + raise ValidationError("['Feature ID' and 'Taxon'] " + "must be the first two header " + "values to be valid.\n\n The " + "first two header values " + "provided are: {}.\nIssue on " + "line {}" + .format(cells[:2], i)) header = cells else: if len(cells) != len(header): - return False + raise ValidationError("Number of columns are not the " + "same as number of headers in " + "the file. \nHeader values: " + "{} \nColumn values: {}\nIssue " + "on line: {}" + .format(header, cells[:], i)) + data_lines += 1 - return header is not None and data_lines > 0 + if data_lines == 0: + raise ValidationError("No feature records found, only blank " + "lines and/or a header row.") + + def _validate_(self, level): + self._check_n_records(n={'min': 10, 'max': None}[level]) TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat( diff --git a/q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv b/q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv new file mode 100644 index 00000000..00040321 --- /dev/null +++ b/q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv @@ -0,0 +1,3 @@ +Feature ID Taxon +seq1 k__Bacteria; p__Proteobacteria -1.0 +seq2 k__Bacteria 1.0 diff --git a/q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv b/q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv new file mode 100644 index 00000000..f53aa340 --- /dev/null +++ b/q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv @@ -0,0 +1,3 @@ +Feature ID Taxon Confidence Random +seq1 k__Foo; p__Bar -1.0 +seq2 k__Foo; p__Baz -42.0 diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py index 4329bd38..9a4b514d 100644 --- a/q2_types/feature_data/tests/test_format.py +++ b/q2_types/feature_data/tests/test_format.py @@ -134,6 +134,18 @@ def test_tsv_taxonomy_directory_format(self): format.validate() + def test_tsv_taxonomy_format_column_header_lengths(self): + filenames = ['greater-column-length.tsv', 'greater-header-length.tsv'] + + filepaths = [self.get_data_path(os.path.join('taxonomy', filename)) + for filename in filenames] + + for filepath in filepaths: + format = TSVTaxonomyFormat(filepath, mode='r') + + with self.assertRaisesRegex(ValidationError, 'Number of columns'): + format.validate() + class TestDNAFASTAFormats(TestPluginBase): package = 'q2_types.feature_data.tests'