diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py
index 4e38034e..e4d04dcc 100644
--- a/q2_types/feature_data/_format.py
+++ b/q2_types/feature_data/_format.py
@@ -90,20 +90,30 @@ class TSVTaxonomyFormat(model.TextFileFormat):
 
     Optionally followed by other arbitrary columns.
 
-    This format supports comment lines starting with #, and blank lines. The
-    expected header must be the first non-comment, non-blank line. In addition
-    to the header, there must be at least one line of data.
+    This format supports blank lines. The expected header must be the first
+    non-blank line. In addition to the header, there must be at least one line
+    of data.
 
     """
     HEADER = ['Feature ID', 'Taxon']
 
-    def sniff(self):
+    def _check_n_records(self, n=None):
+        """Validate the header and rows found in the first ``n`` lines.
+
+        ``n`` caps how many physical lines are read; ``None`` reads the whole
+        file. Raises ``ValidationError`` if the header does not start with
+        ``HEADER``, if a row's column count differs from the header's, or if
+        no data rows are found.
+        """
         with self.open() as fh:
             data_lines = 0
             header = None
 
-            while data_lines < 10:
-                line = fh.readline()
+            file_ = enumerate(fh) if n is None else zip(range(n), fh)
+
+            for i, line in file_:
+                # 1-based line number, reported when a row is malformed
+                i = i + 1
                 if line == '':
                     # EOF
                     break
@@ -111,17 +121,35 @@ def sniff(self):
                     # Blank line
                     continue
 
-                cells = line.rstrip('\n').split('\t')
+                cells = line.strip('\n').split('\t')
+
                 if header is None:
                     if cells[:2] != self.HEADER:
-                        return False
+                        raise ValidationError("['Feature ID' and 'Taxon'] "
+                                              "must be the first two header "
+                                              "values to be valid.\n\n The "
+                                              "first two header values "
+                                              "provided are: {}."
+                                              .format(cells[:2]))
                     header = cells
                 else:
                     if len(cells) != len(header):
-                        return False
+                        raise ValidationError('Number of columns are not the '
+                                              'same as number of headers in '
+                                              'the file. \nHeader values: '
+                                              '{} \nColumn values: {} '
+                                              '\nLine number: {}'
+                                              .format(header, cells, i))
+
                     data_lines += 1
 
-            return header is not None and data_lines > 0
+            if data_lines == 0:
+                raise ValidationError("No feature records found, only blank "
+                                      "lines and/or a header row.")
+
+    def _validate_(self, level):
+        # 'min' checks only the first 10 lines; 'max' checks the whole file.
+        self._check_n_records(n={'min': 10, 'max': None}[level])
 
 
 TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat(