From 97f19a7f4375b5d95fbf5a98d4a46cf5a6504b7b Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Thu, 18 Jul 2019 10:31:20 -0700 Subject: [PATCH 01/10] MAINT: Initial commit that attempts to implement validate API. --- q2_types/feature_data/_format.py | 24 ++++++++++++++++++++---- q2_types/feature_table/_format.py | 12 ++++++++++-- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 64cc5c62..52ca2878 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -38,7 +38,8 @@ class TaxonomyFormat(model.TextFileFormat): """ - def sniff(self): + # Was formerly a sniff method that I renamed + def _check_file_format(self, root, n=None): with self.open() as fh: count = 0 while count < 10: @@ -61,6 +62,9 @@ def sniff(self): return False if count == 0 else True + def _validate_(self, level): + self._check_file_format(self, n={'min': 10, 'max': None}[level]) + TaxonomyDirectoryFormat = model.SingleFileDirectoryFormat( 'TaxonomyDirectoryFormat', 'taxonomy.tsv', TaxonomyFormat) @@ -96,7 +100,7 @@ class TSVTaxonomyFormat(model.TextFileFormat): """ HEADER = ['Feature ID', 'Taxon'] - def sniff(self): + def _check_tsv_tax_format(self, root, n=None): with self.open() as fh: data_lines = 0 header = None @@ -125,13 +129,17 @@ def sniff(self): return header is not None and data_lines > 0 + def _validate_(self, level): + self._check_tsv_tax_format(root=str(self.path.parent), + n={'min': 10, 'max': None}[level]) + TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat( 'TSVTaxonomyDirectoryFormat', 'taxonomy.tsv', TSVTaxonomyFormat) class DNAFASTAFormat(model.TextFileFormat): - def sniff(self): + def _check_dna_fasta_format(self, root, n=None): filepath = str(self) sniffer = skbio.io.io_registry.get_sniffer('fasta') if sniffer(filepath)[0]: @@ -149,6 +157,10 @@ def sniff(self): empty_sniffer = skbio.io.io_registry.get_sniffer('') return empty_sniffer(filepath)[0] + def _validate_(self, level): + self._check_dna_fasta_format(root=str(self.path.parent), + n={'min': 10, 'max': None}[level]) + DNASequencesDirectoryFormat = model.SingleFileDirectoryFormat( 'DNASequencesDirectoryFormat', 'dna-sequences.fasta', DNAFASTAFormat) @@ -162,7 +174,7 @@ class PairedDNASequencesDirectoryFormat(model.DirectoryFormat): class AlignedDNAFASTAFormat(model.TextFileFormat): - def sniff(self): + def _check_aligned_dna_fasta_format(self, root, n=None): filepath = str(self) sniffer = skbio.io.io_registry.get_sniffer('fasta') if sniffer(filepath)[0]: @@ -179,6 +191,10 @@ def sniff(self): pass return False + def _validate_(self, level): + self._check_aligned_dna_fasta_format(root=str(self.path.parent), + n={'min': 10, 'max': None}[level]) + AlignedDNASequencesDirectoryFormat = model.SingleFileDirectoryFormat( 'AlignedDNASequencesDirectoryFormat', 'aligned-dna-sequences.fasta', diff --git a/q2_types/feature_table/_format.py b/q2_types/feature_table/_format.py index 0fa76a7f..9a1779d3 100644 --- a/q2_types/feature_table/_format.py +++ b/q2_types/feature_table/_format.py @@ -21,7 +21,7 @@ class BIOMV100Format(model.TextFileFormat): 'shape', 'data', 'comment' } - def sniff(self): + def _check_biomv100_format(self, root, n=None): with self.open() as fh: try: parser = ijson.parse(fh) @@ -36,6 +36,10 @@ def sniff(self): pass return False + def _validate_(self, level): + self._check_biomv100_format(root=str(self.path.parent), + n={'min': 10, 'max': None}[level]) + class BIOMV210Format(model.BinaryFileFormat): # minimum requirements as described by @@ -68,7 +72,7 @@ class BIOMV210Format(model.BinaryFileFormat): def open(self): return h5py.File(str(self), mode=self._mode) - def sniff(self): + def _check_biomv210_format(self, root, n=None): try: with self.open() as fh: for grp in self.groups: @@ -84,6 +88,10 @@ def sniff(self): except Exception: return False + def _validate_(self, level): + self._check_biomv210_format(root=str(self.path.parent), + n={'min': 10, 'max': None}[level]) + BIOMV100DirFmt = model.SingleFileDirectoryFormat('BIOMV100DirFmt', 'feature-table.biom', From 65a69e83cc896272ec510ed2a4a74afc50eeadad Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Fri, 19 Jul 2019 16:20:52 -0700 Subject: [PATCH 02/10] Added validation API to TSVTaxonomyFormat class only. Removed validation from all other classes in repository. --- q2_types/feature_data/_format.py | 45 +++++++++++++++++-------------- q2_types/feature_table/_format.py | 12 ++------- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index e5cf65d6..1bbbd895 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -42,8 +42,7 @@ class TaxonomyFormat(model.TextFileFormat): """ - # Was formerly a sniff method that I renamed - def _check_file_format(self, root, n=None): + def sniff(self): with self.open() as fh: count = 0 while count < 10: @@ -66,9 +65,6 @@ def _check_file_format(self, root, n=None): return False if count == 0 else True - def _validate_(self, level): - self._check_file_format(self, n={'min': 10, 'max': None}[level]) - TaxonomyDirectoryFormat = model.SingleFileDirectoryFormat( 'TaxonomyDirectoryFormat', 'taxonomy.tsv', TaxonomyFormat) @@ -108,34 +104,47 @@ def _check_tsv_tax_format(self, root, n=None): with self.open() as fh: data_lines = 0 header = None - while data_lines < 10: - line = fh.readline() + + file_ = enumerate(fh) if n is None else zip(range(n), fh) + + for iter, line in file_: + iter = iter + 1 if line == '': # EOF break - elif line.lstrip(' ') == '\n': - # Blank line + elif line.strip(' ') == '\n': continue elif line.startswith('#'): # Comment line continue - cells = line.rstrip('\n').split('\t') + cells = line.strip('\n').split('\t') + if header is None: if cells[:2] != self.HEADER: - return False + raise ValidationError("Anthony TSVTaxonomy") header = cells else: if len(cells) != len(header): - return False + raise ValidationError("Number of headers are not the " + "same as number of colums in " + "the file.") data_lines += 1 - return header is not None and data_lines > 0 + if header is None: + raise ValidationError("This file must contain 'Feature ID' " + "and 'Taxon' as header values to " + "meet formatting requirements.") + + if data_lines == 0: + raise ValidationError("No sample records found in manifest, " + "only observed comments, blank lines, " + "and/or a header row.") def _validate_(self, level): - self._check_tsv_tax_format(root=str(self.path.parent), - n={'min': 10, 'max': None}[level]) + self._check_tsv_tax_format(root=str(self.path.parent), n={'min': 1, + 'max': None}[level]) TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat( @@ -198,7 +207,7 @@ class PairedDNASequencesDirectoryFormat(model.DirectoryFormat): class AlignedDNAFASTAFormat(model.TextFileFormat): - def _check_aligned_dna_fasta_format(self, root, n=None): + def sniff(self): filepath = str(self) sniffer = skbio.io.io_registry.get_sniffer('fasta') if sniffer(filepath)[0]: @@ -215,10 +224,6 @@ def _check_aligned_dna_fasta_format(self, root, n=None): pass return False - def _validate_(self, level): - self._check_aligned_dna_fasta_format(root=str(self.path.parent), - n={'min': 10, 'max': None}[level]) - AlignedDNASequencesDirectoryFormat = model.SingleFileDirectoryFormat( 'AlignedDNASequencesDirectoryFormat', 'aligned-dna-sequences.fasta', diff --git a/q2_types/feature_table/_format.py b/q2_types/feature_table/_format.py index 9a1779d3..0fa76a7f 100644 --- a/q2_types/feature_table/_format.py +++ b/q2_types/feature_table/_format.py @@ -21,7 +21,7 @@ class BIOMV100Format(model.TextFileFormat): 'shape', 'data', 'comment' } - def _check_biomv100_format(self, root, n=None): + def sniff(self): with self.open() as fh: try: parser = ijson.parse(fh) @@ -36,10 +36,6 @@ def _check_biomv100_format(self, root, n=None): pass return False - def _validate_(self, level): - self._check_biomv100_format(root=str(self.path.parent), - n={'min': 10, 'max': None}[level]) - class BIOMV210Format(model.BinaryFileFormat): # minimum requirements as described by @@ -72,7 +68,7 @@ class BIOMV210Format(model.BinaryFileFormat): def open(self): return h5py.File(str(self), mode=self._mode) - def _check_biomv210_format(self, root, n=None): + def sniff(self): try: with self.open() as fh: for grp in self.groups: @@ -88,10 +84,6 @@ def _check_biomv210_format(self, root, n=None): except Exception: return False - def _validate_(self, level): - self._check_biomv210_format(root=str(self.path.parent), - n={'min': 10, 'max': None}[level]) - BIOMV100DirFmt = model.SingleFileDirectoryFormat('BIOMV100DirFmt', 'feature-table.biom', From 747d0aec59e5a023440a48d329607c9185414827 Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Thu, 25 Jul 2019 10:14:25 -0700 Subject: [PATCH 03/10] MAINT: Changed minimum number of lines to validate file. Improved validation error messages for users. --- q2_types/feature_data/_format.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 1bbbd895..170a9789 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -100,15 +100,15 @@ class TSVTaxonomyFormat(model.TextFileFormat): """ HEADER = ['Feature ID', 'Taxon'] - def _check_tsv_tax_format(self, root, n=None): + def _check_tsv_tax_format(self, n=None): with self.open() as fh: data_lines = 0 header = None file_ = enumerate(fh) if n is None else zip(range(n), fh) - for iter, line in file_: - iter = iter + 1 + for i, line in file_: + i = i + 1 if line == '': # EOF @@ -123,19 +123,22 @@ def _check_tsv_tax_format(self, root, n=None): if header is None: if cells[:2] != self.HEADER: - raise ValidationError("Anthony TSVTaxonomy") + raise ValidationError("'Feature ID' and 'Taxon' must" + " be included as headers to be" + " a valid TSV file. Please check" + " header values in your file.") header = cells else: if len(cells) != len(header): - raise ValidationError("Number of headers are not the " - "same as number of colums in " - "the file.") - data_lines += 1 + raise ValidationError('Number of headers are not the ' + 'same as number of columns in ' + 'the file. \nNumber of headers: ' + '{} \nNumber of columns: {} ' + '\nIssue on line: {}' + .format(len(header), len(cells), + i)) - if header is None: - raise ValidationError("This file must contain 'Feature ID' " - "and 'Taxon' as header values to " - "meet formatting requirements.") + data_lines += 1 if data_lines == 0: raise ValidationError("No sample records found in manifest, " @@ -143,8 +146,7 @@ def _check_tsv_tax_format(self, root, n=None): "and/or a header row.") def _validate_(self, level): - self._check_tsv_tax_format(root=str(self.path.parent), n={'min': 1, - 'max': None}[level]) + self._check_tsv_tax_format(n={'min': 10, 'max': None}[level]) TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat( From 58899781d68fbbde033ec193448f59c356f68c6d Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Thu, 25 Jul 2019 13:28:15 -0700 Subject: [PATCH 04/10] MAINT: Substituted index values for counts pertaining to header and column arrays in error message. --- q2_types/feature_data/_format.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 170a9789..0ef46a68 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -7,6 +7,7 @@ # ---------------------------------------------------------------------------- import re +import textwrap import skbio.io import qiime2.plugin.model as model @@ -132,11 +133,10 @@ def _check_tsv_tax_format(self, n=None): if len(cells) != len(header): raise ValidationError('Number of headers are not the ' 'same as number of columns in ' - 'the file. \nNumber of headers: ' - '{} \nNumber of columns: {} ' + 'the file. \nHeader values: ' + '{} \nColumn values: {} ' '\nIssue on line: {}' - .format(len(header), len(cells), - i)) + .format(header, cells[:], i)) data_lines += 1 From 8ef5a2ccc2278ac1334a0d0ebc2949257a32ed13 Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Thu, 25 Jul 2019 13:31:35 -0700 Subject: [PATCH 05/10] Removed import of textwrap --- q2_types/feature_data/_format.py | 1 - 1 file changed, 1 deletion(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 0ef46a68..fdd7fd60 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -7,7 +7,6 @@ # ---------------------------------------------------------------------------- import re -import textwrap import skbio.io import qiime2.plugin.model as model From b60f0c2ff4a9fda93bd0df6850a1f2694def3628 Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Mon, 29 Jul 2019 14:58:00 -0700 Subject: [PATCH 06/10] MAINT: Changed method name which validates taxonomy files. Updated raised error messages. --- q2_types/feature_data/_format.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index fdd7fd60..7035e55c 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -100,7 +100,7 @@ class TSVTaxonomyFormat(model.TextFileFormat): """ HEADER = ['Feature ID', 'Taxon'] - def _check_tsv_tax_format(self, n=None): + def _check_n_records(self, n=None): with self.open() as fh: data_lines = 0 header = None @@ -123,10 +123,12 @@ def _check_tsv_tax_format(self, n=None): if header is None: if cells[:2] != self.HEADER: - raise ValidationError("'Feature ID' and 'Taxon' must" - " be included as headers to be" - " a valid TSV file. Please check" - " header values in your file.") + raise ValidationError("['Feature ID' and 'Taxon'] " + "must be the first two header " + "values to be a valid axonomy " + "file.\n\nThe first two header " + "values provided are: {}." + .format(cells[:2])) header = cells else: if len(cells) != len(header): @@ -134,18 +136,17 @@ def _check_tsv_tax_format(self, n=None): 'same as number of columns in ' 'the file. \nHeader values: ' '{} \nColumn values: {} ' - '\nIssue on line: {}' .format(header, cells[:], i)) data_lines += 1 if data_lines == 0: - raise ValidationError("No sample records found in manifest, " + raise ValidationError("No feature records found in manifest, " "only observed comments, blank lines, " "and/or a header row.") def _validate_(self, level): - self._check_tsv_tax_format(n={'min': 10, 'max': None}[level]) + self._check_n_records(n={'min': 10, 'max': None}[level]) TSVTaxonomyDirectoryFormat = model.SingleFileDirectoryFormat( From eae936705c810fd78f2cf5cb70596ac042ddecdb Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Mon, 5 Aug 2019 12:15:29 -0700 Subject: [PATCH 07/10] SQUASH: Corrected misspelling of taxonomy --- q2_types/feature_data/_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 7035e55c..78fcfebf 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -125,7 +125,7 @@ def _check_n_records(self, n=None): if cells[:2] != self.HEADER: raise ValidationError("['Feature ID' and 'Taxon'] " "must be the first two header " - "values to be a valid axonomy " + "values to be a valid taxonomy " "file.\n\nThe first two header " "values provided are: {}." .format(cells[:2])) From 0206b83211de38db15a0ecd88d2202601746c086 Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Thu, 8 Aug 2019 13:14:22 -0700 Subject: [PATCH 08/10] SQUASH: Edited error messages, added comments, updated class description --- q2_types/feature_data/_format.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 78fcfebf..1885272c 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -93,9 +93,9 @@ class TSVTaxonomyFormat(model.TextFileFormat): Optionally followed by other arbitrary columns. - This format supports comment lines starting with #, and blank lines. The - expected header must be the first non-comment, non-blank line. In addition - to the header, there must be at least one line of data. + This format supports blank lines. The expected header must be the first + non-blank line. In addition to the header, there must be at least one line + of data. """ HEADER = ['Feature ID', 'Taxon'] @@ -109,11 +109,12 @@ def _check_n_records(self, n=None): for i, line in file_: i = i + 1 - + # Checks rows in the file, excludes header row if line == '': # EOF break - elif line.strip(' ') == '\n': + elif line.lstrip(' ') == '\n': + # Blank line continue elif line.startswith('#'): # Comment line @@ -125,15 +126,15 @@ def _check_n_records(self, n=None): if cells[:2] != self.HEADER: raise ValidationError("['Feature ID' and 'Taxon'] " "must be the first two header " - "values to be a valid taxonomy " - "file.\n\nThe first two header " - "values provided are: {}." + "values to be valid.\n\n The " + "first two header values " + "provided are: {}." .format(cells[:2])) header = cells else: if len(cells) != len(header): - raise ValidationError('Number of headers are not the ' - 'same as number of columns in ' + raise ValidationError('Number of columns are not the ' + 'same as number of headers in ' 'the file. \nHeader values: ' '{} \nColumn values: {} ' .format(header, cells[:], i)) @@ -141,9 +142,8 @@ def _check_n_records(self, n=None): data_lines += 1 if data_lines == 0: - raise ValidationError("No feature records found in manifest, " - "only observed comments, blank lines, " - "and/or a header row.") + raise ValidationError("No feature records found, only blank " + "lines and/or a header row.") def _validate_(self, level): self._check_n_records(n={'min': 10, 'max': None}[level]) From 51cbb8fedf7873c6a5f675188d0f0c121ce0c0e2 Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Thu, 8 Aug 2019 13:18:37 -0700 Subject: [PATCH 09/10] SQUASH: Merged feature branch with upstream --- q2_types/feature_data/_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 4237adb4..e4d04dcc 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -105,8 +105,8 @@ def _check_n_records(self, n=None): file_ = enumerate(fh) if n is None else zip(range(n), fh) for i, line in file_: - i = i + 1 # Checks rows in the file, excludes header row + i = i + 1 if line == '': # EOF break From 608dfc6788610f1845a4438c44b3972432aee04a Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Fri, 9 Aug 2019 11:16:44 -0700 Subject: [PATCH 10/10] SQUASH: Added test .tsv files added a test method --- q2_types/feature_data/_format.py | 16 +++++++++------- .../data/taxonomy/greater-column-length.tsv | 3 +++ .../data/taxonomy/greater-header-length.tsv | 3 +++ q2_types/feature_data/tests/test_format.py | 12 ++++++++++++ 4 files changed, 27 insertions(+), 7 deletions(-) create mode 100644 q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv create mode 100644 q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index e4d04dcc..5012388e 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -105,7 +105,7 @@ def _check_n_records(self, n=None): file_ = enumerate(fh) if n is None else zip(range(n), fh) for i, line in file_: - # Checks rows in the file, excludes header row + # Tracks line count for error reporting i = i + 1 if line == '': # EOF @@ -122,15 +122,17 @@ def _check_n_records(self, n=None): "must be the first two header " "values to be valid.\n\n The " "first two header values " - "provided are: {}." - .format(cells[:2])) + "provided are: {}.\nIssue on " + "line {}" + .format(cells[:2], i)) header = cells else: if len(cells) != len(header): - raise ValidationError('Number of columns are not the ' - 'same as number of headers in ' - 'the file. \nHeader values: ' - '{} \nColumn values: {} ' + raise ValidationError("Number of columns are not the " + "same as number of headers in " + "the file. \nHeader values: " + "{} \nColumn values: {}\nIssue " + "on line: {}" .format(header, cells[:], i)) data_lines += 1 diff --git a/q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv b/q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv new file mode 100644 index 00000000..00040321 --- /dev/null +++ b/q2_types/feature_data/tests/data/taxonomy/greater-column-length.tsv @@ -0,0 +1,3 @@ +Feature ID Taxon +seq1 k__Bacteria; p__Proteobacteria -1.0 +seq2 k__Bacteria 1.0 diff --git a/q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv b/q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv new file mode 100644 index 00000000..f53aa340 --- /dev/null +++ b/q2_types/feature_data/tests/data/taxonomy/greater-header-length.tsv @@ -0,0 +1,3 @@ +Feature ID Taxon Confidence Random +seq1 k__Foo; p__Bar -1.0 +seq2 k__Foo; p__Baz -42.0 diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py index 4329bd38..9a4b514d 100644 --- a/q2_types/feature_data/tests/test_format.py +++ b/q2_types/feature_data/tests/test_format.py @@ -134,6 +134,18 @@ def test_tsv_taxonomy_directory_format(self): format.validate() + def test_tsv_taxonomy_format_column_header_lengths(self): + filenames = ['greater-column-length.tsv', 'greater-header-length.tsv'] + + filepaths = [self.get_data_path(os.path.join('taxonomy', filename)) + for filename in filenames] + + for filepath in filepaths: + format = TSVTaxonomyFormat(filepath, mode='r') + + with self.assertRaisesRegex(ValidationError, 'Number of columns'): + format.validate() + class TestDNAFASTAFormats(TestPluginBase): package = 'q2_types.feature_data.tests'