From 8470cf2a15f69af3cfba849b057cb2ca44103b2f Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Tue, 29 Oct 2019 13:16:31 -0700 Subject: [PATCH] IMP: Implement validate API for TSVTaxonomyFormat. (#221) --- q2_types/feature_data/_format.py | 47 ++++++++++++------- .../data/taxonomy/greater-column-length.tsv | 3 ++ .../data/taxonomy/greater-header-length.tsv | 3 ++ q2_types/feature_data/tests/test_format.py | 13 +++++ 4 files changed, 50 insertions(+), 16 deletions(-) create mode 100644 q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv create mode 100644 q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index aeafbc21..f43d7073 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -90,38 +90,53 @@ class TSVTaxonomyFormat(model.TextFileFormat): Optionally followed by other arbitrary columns. - This format supports comment lines starting with #, and blank lines. The - expected header must be the first non-comment, non-blank line. In addition - to the header, there must be at least one line of data. + This format supports blank lines. The expected header must be the first + non-blank line. In addition to the header, there must be at least one line + of data. """ HEADER = ['Feature ID', 'Taxon'] - def sniff(self): + def _check_n_records(self, n=None): with self.open() as fh: - data_lines = 0 + data_line_count = 0 header = None - while data_lines < 10: - line = fh.readline() - if line == '': - # EOF - break - elif line.lstrip(' ') == '\n': + file_ = enumerate(fh) if n is None else zip(range(n), fh) + + for i, line in file_: + # Tracks line number for error reporting + i = i + 1 + + if line.lstrip(' ') == '\n': # Blank line continue - cells = line.rstrip('\n').split('\t') + cells = line.strip('\n').split('\t') + if header is None: if cells[:2] != self.HEADER: - return False + raise ValidationError( + '%s must be the first two header values. The ' + 'first two header values provided are: %s (on ' + 'line %s).' % (self.HEADER, cells[:2], i)) header = cells else: if len(cells) != len(header): - return False - data_lines += 1 + raise ValidationError( + 'Number of values on line %s are not the same as ' + 'number of header values. Found %s values ' + '(%s), expected %s.' % (i, len(cells), cells, + len(self.HEADER))) + + data_line_count += 1 + + if data_line_count == 0: + raise ValidationError('No taxonomy records found, only blank ' + 'lines and/or a header row.') - return header is not None and data_lines > 0 + def _validate_(self, level): + self._check_n_records(n={'min': 10, 'max': None}[level]) TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat( diff --git a/q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv b/q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv new file mode 100644 index 00000000..00040321 --- /dev/null +++ b/q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv @@ -0,0 +1,3 @@ +Feature ID Taxon +seq1 k__Bacteria; p__Proteobacteria -1.0 +seq2 k__Bacteria 1.0 diff --git a/q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv b/q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv new file mode 100644 index 00000000..f53aa340 --- /dev/null +++ b/q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv @@ -0,0 +1,3 @@ +Feature ID Taxon Confidence Random +seq1 k__Foo; p__Bar -1.0 +seq2 k__Foo; p__Baz -42.0 diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py index 49a3ed54..3b206176 100644 --- a/q2_types/feature_data/tests/test_format.py +++ b/q2_types/feature_data/tests/test_format.py @@ -134,6 +134,19 @@ def test_tsv_taxonomy_directory_format(self): format.validate() + def test_tsv_taxonomy_format_column_header_lengths(self): + filenames = ['greater-column-length.tsv', 'greater-header-length.tsv'] + + filepaths = [self.get_data_path(os.path.join('taxonomy', filename)) + for filename in filenames] + + for filepath in filepaths: + format = TSVTaxonomyFormat(filepath, mode='r') + + with self.assertRaisesRegex(ValidationError, + 'line 2.*3 values.*expected 2'): + format.validate() + class TestDNAFASTAFormats(TestPluginBase): package = 'q2_types.feature_data.tests'