From 6d7a4e3dd9a1f551a6c242f18f324d10ca05cb3e Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Mon, 15 Jul 2019 14:50:07 -0700 Subject: [PATCH 1/6] MAINT: Addressed comment issue which removed data that followed a '#' or ' #' in tsv files. --- q2_types/feature_data/_transformer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py index e370894d..b3d551f6 100644 --- a/q2_types/feature_data/_transformer.py +++ b/q2_types/feature_data/_transformer.py @@ -47,7 +47,7 @@ def _taxonomy_formats_to_dataframe(filepath, has_header=None): """ # Using `dtype=object` and `set_index()` to avoid type casting/inference of # any columns or the index. - df = pd.read_csv(filepath, sep='\t', comment='#', skip_blank_lines=True, + df = pd.read_csv(filepath, sep='\t', skip_blank_lines=True, header=None, dtype=object) if len(df.columns) < 2: @@ -191,6 +191,9 @@ def _23(ff: TSVTaxonomyFormat) -> pd.Series: @plugin.register_transformer def _29(ff: TSVTaxonomyFormat) -> qiime2.Metadata: df = _taxonomy_formats_to_dataframe(str(ff), has_header=True) + taxCol = df['Taxon'] + for index in range(len(taxCol)): + taxCol[index].strip() return qiime2.Metadata(df) From 0b85873fff5a7bff2954b57a2e4b3c5d06013320 Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Tue, 16 Jul 2019 16:14:34 -0700 Subject: [PATCH 2/6] MAINT: Corrected strip function usage and adjusted test files to pass Travis tests. --- q2_types/feature_data/_transformer.py | 5 +-- .../tests/data/taxonomy/blanks-and-comments | 10 ++--- .../tests/data/taxonomy/header-only.tsv | 6 +-- .../data/taxonomy/leading_space_taxon.tsv | 2 + .../data/taxonomy/start_end_space_taxon.tsv | 2 + .../data/taxonomy/trailing_space_taxon.tsv | 2 + .../tests/data/taxonomy/valid-but-messy.tsv | 20 +++++----- .../feature_data/tests/test_transformer.py | 39 +++++++++++++++++++ 8 files changed, 65 insertions(+), 21 deletions(-) create mode 100644 q2_types/feature_data/tests/data/taxonomy/leading_space_taxon.tsv create mode 100644 q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv create mode 100644 q2_types/feature_data/tests/data/taxonomy/trailing_space_taxon.tsv diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py index b3d551f6..5645f62d 100644 --- a/q2_types/feature_data/_transformer.py +++ b/q2_types/feature_data/_transformer.py @@ -191,9 +191,8 @@ def _23(ff: TSVTaxonomyFormat) -> pd.Series: @plugin.register_transformer def _29(ff: TSVTaxonomyFormat) -> qiime2.Metadata: df = _taxonomy_formats_to_dataframe(str(ff), has_header=True) - taxCol = df['Taxon'] - for index in range(len(taxCol)): - taxCol[index].strip() + for index in range(len(df['Taxon'])): + df['Taxon'][index] = df['Taxon'][index].strip() return qiime2.Metadata(df) diff --git a/q2_types/feature_data/tests/data/taxonomy/blanks-and-comments b/q2_types/feature_data/tests/data/taxonomy/blanks-and-comments index c0f07076..e5abfeb2 100644 --- a/q2_types/feature_data/tests/data/taxonomy/blanks-and-comments +++ b/q2_types/feature_data/tests/data/taxonomy/blanks-and-comments @@ -1,17 +1,17 @@ -# hello -# world # + + -# hello, peanut -# -# + + + diff --git a/q2_types/feature_data/tests/data/taxonomy/header-only.tsv b/q2_types/feature_data/tests/data/taxonomy/header-only.tsv index 9a329a73..73265f07 100644 --- a/q2_types/feature_data/tests/data/taxonomy/header-only.tsv +++ b/q2_types/feature_data/tests/data/taxonomy/header-only.tsv @@ -1,7 +1,7 @@ -# This file -# only has a -# header! + + + Feature ID Taxon diff --git a/q2_types/feature_data/tests/data/taxonomy/leading_space_taxon.tsv b/q2_types/feature_data/tests/data/taxonomy/leading_space_taxon.tsv new file mode 100644 index 00000000..ed6b2740 --- /dev/null +++ b/q2_types/feature_data/tests/data/taxonomy/leading_space_taxon.tsv @@ -0,0 +1,2 @@ +Feature ID Taxon Confidence +seq1 k__Foo; p__Bar -1.0 diff --git a/q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv b/q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv new file mode 100644 index 00000000..9c96da59 --- /dev/null +++ b/q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv @@ -0,0 +1,2 @@ +Feature ID Taxon Confidence +seq1 k__Foo; p__Bar -1.0 diff --git a/q2_types/feature_data/tests/data/taxonomy/trailing_space_taxon.tsv b/q2_types/feature_data/tests/data/taxonomy/trailing_space_taxon.tsv new file mode 100644 index 00000000..e254ffbc --- /dev/null +++ b/q2_types/feature_data/tests/data/taxonomy/trailing_space_taxon.tsv @@ -0,0 +1,2 @@ +Feature ID Taxon Confidence +seq1 k__Foo; p__Bar -1.0 diff --git a/q2_types/feature_data/tests/data/taxonomy/valid-but-messy.tsv b/q2_types/feature_data/tests/data/taxonomy/valid-but-messy.tsv index 834ec5a8..0afeee9e 100644 --- a/q2_types/feature_data/tests/data/taxonomy/valid-but-messy.tsv +++ b/q2_types/feature_data/tests/data/taxonomy/valid-but-messy.tsv @@ -1,11 +1,11 @@ -# There's some important whitespace in this file for testing, take care not to -# remove :) -# hello -# world # + + + + @@ -15,20 +15,20 @@ -# comment + Feature ID Taxon Extra Column -# hello, peanut -# -# + + + SEQUENCE1 k__Bar; p__Baz foo -# GWAR + seq2 some; taxonomy; for; ya bar baz -# FOOTER + diff --git a/q2_types/feature_data/tests/test_transformer.py b/q2_types/feature_data/tests/test_transformer.py index ba0dc2a3..ff790b71 100644 --- a/q2_types/feature_data/tests/test_transformer.py +++ b/q2_types/feature_data/tests/test_transformer.py @@ -264,6 +264,45 @@ def test_tsv_taxonomy_format_to_metadata(self): self.assertEqual(exp, obs) + def test_tsv_taxonomy_to_metadata_trailing_whitespace_taxon(self): + _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata, + os.path.join( + 'taxonomy', + 'trailing_space_taxon.tsv')) + + index = pd.Index(['seq1'], name='Feature ID', dtype=object) + exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index, + columns=['Taxon', 'Confidence'], dtype=object) + exp = qiime2.Metadata(exp_df) + + self.assertEqual(exp, obs) + + def test_tsv_taxonomy_to_metadata_leading_whitespace_taxon(self): + _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata, + os.path.join( + 'taxonomy', + 'leading_space_taxon.tsv')) + + index = pd.Index(['seq1'], name='Feature ID', dtype=object) + exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index, + columns=['Taxon', 'Confidence'], dtype=object) + exp = qiime2.Metadata(exp_df) + + self.assertEqual(exp, obs) + + def test_tsv_taxonomy_to_metadata_trailing_leading_whitespace_taxon(self): + _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata, + os.path.join( + 'taxonomy', + 'start_end_space_taxon.tsv')) + + index = pd.Index(['seq1'], name='Feature ID', dtype=object) + exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0']], index=index, + columns=['Taxon', 'Confidence'], dtype=object) + exp = qiime2.Metadata(exp_df) + + self.assertEqual(exp, obs) + # In-depth testing of the `_taxonomy_formats_to_dataframe` helper function, # which does the heavy lifting for the transformers. From e8089789aa389d49122b0c653ac9e488c574bbe7 Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Tue, 6 Aug 2019 09:11:46 -0700 Subject: [PATCH 3/6] SQUASH: Stripped 'Taxon' column in transformer and edited test files --- q2_types/feature_data/_transformer.py | 4 ++-- .../tests/data/taxonomy/{blanks-and-comments => blanks} | 0 q2_types/feature_data/tests/data/taxonomy/header-only.tsv | 6 ------ 3 files changed, 2 insertions(+), 8 deletions(-) rename q2_types/feature_data/tests/data/taxonomy/{blanks-and-comments => blanks} (100%) diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py index 5645f62d..edf366de 100644 --- a/q2_types/feature_data/_transformer.py +++ b/q2_types/feature_data/_transformer.py @@ -168,6 +168,7 @@ def _6(ff: TaxonomyFormat) -> pd.Series: @plugin.register_transformer def _28(ff: TaxonomyFormat) -> qiime2.Metadata: df = _taxonomy_formats_to_dataframe(str(ff), has_header=None) + df['Taxon'] = df['Taxon'].str.strip() return qiime2.Metadata(df) @@ -191,8 +192,7 @@ def _23(ff: TSVTaxonomyFormat) -> pd.Series: @plugin.register_transformer def _29(ff: TSVTaxonomyFormat) -> qiime2.Metadata: df = _taxonomy_formats_to_dataframe(str(ff), has_header=True) - for index in range(len(df['Taxon'])): - df['Taxon'][index] = df['Taxon'][index].strip() + df['Taxon'] = df['Taxon'].str.strip() return qiime2.Metadata(df) diff --git a/q2_types/feature_data/tests/data/taxonomy/blanks-and-comments b/q2_types/feature_data/tests/data/taxonomy/blanks similarity index 100% rename from q2_types/feature_data/tests/data/taxonomy/blanks-and-comments rename to q2_types/feature_data/tests/data/taxonomy/blanks diff --git a/q2_types/feature_data/tests/data/taxonomy/header-only.tsv b/q2_types/feature_data/tests/data/taxonomy/header-only.tsv index 73265f07..f7c0e0d8 100644 --- a/q2_types/feature_data/tests/data/taxonomy/header-only.tsv +++ b/q2_types/feature_data/tests/data/taxonomy/header-only.tsv @@ -1,7 +1 @@ - - - - - - Feature ID Taxon From 5cae31774f089ab06bbf64ec73079a6e6b8c96eb Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Tue, 6 Aug 2019 11:00:17 -0700 Subject: [PATCH 4/6] SQUASH: Updated feature branch with changes from upstream. Updated test methods. --- q2_types/feature_data/_format.py | 6 ------ q2_types/feature_data/tests/test_format.py | 6 +++--- q2_types/feature_data/tests/test_transformer.py | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 69171ccb..4e38034e 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -54,9 +54,6 @@ def sniff(self): elif line.lstrip(' ') == '\n': # Blank line continue - elif line.startswith('#'): - # Comment line - continue else: cells = line.split('\t') if len(cells) < 2: @@ -113,9 +110,6 @@ def sniff(self): elif line.lstrip(' ') == '\n': # Blank line continue - elif line.startswith('#'): - # Comment line - continue cells = line.rstrip('\n').split('\t') if header is None: diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py index 9f735247..4329bd38 100644 --- a/q2_types/feature_data/tests/test_format.py +++ b/q2_types/feature_data/tests/test_format.py @@ -37,7 +37,7 @@ def test_taxonomy_format_validate_positive(self): format.validate() def test_taxonomy_format_validate_negative(self): - filenames = ['empty', 'blanks-and-comments', '1-column.tsv'] + filenames = ['empty', 'blanks', '1-column.tsv'] filepaths = [self.get_data_path(os.path.join('taxonomy', filename)) for filename in filenames] @@ -78,7 +78,7 @@ def test_headerless_tsv_taxonomy_format_validate_positive(self): format.validate() def test_headerless_tsv_taxonomy_format_validate_negative(self): - filenames = ['empty', 'blanks-and-comments', '1-column.tsv'] + filenames = ['empty', 'blanks', '1-column.tsv'] filepaths = [self.get_data_path(os.path.join('taxonomy', filename)) for filename in filenames] @@ -113,7 +113,7 @@ def test_tsv_taxonomy_format_validate_positive(self): format.validate() def test_tsv_taxonomy_format_validate_negative(self): - filenames = ['empty', 'blanks-and-comments', '1-column.tsv', + filenames = ['empty', 'blanks', '1-column.tsv', 'headerless.tsv', 'header-only.tsv', 'jagged.tsv'] filepaths = [self.get_data_path(os.path.join('taxonomy', filename)) for filename in filenames] diff --git a/q2_types/feature_data/tests/test_transformer.py b/q2_types/feature_data/tests/test_transformer.py index abe8d3df..dbd33263 100644 --- a/q2_types/feature_data/tests/test_transformer.py +++ b/q2_types/feature_data/tests/test_transformer.py @@ -314,11 +314,11 @@ def test_one_column(self): _taxonomy_formats_to_dataframe( self.get_data_path(os.path.join('taxonomy', '1-column.tsv'))) - def test_blanks_and_comments(self): + def test_blanks(self): with self.assertRaises(pandas.io.common.EmptyDataError): _taxonomy_formats_to_dataframe( self.get_data_path(os.path.join('taxonomy', - 'blanks-and-comments'))) + 'blanks'))) def test_empty(self): with self.assertRaises(pandas.io.common.EmptyDataError): From 2aaaa5001e2b01d286ca73d384bf709e89741cc7 Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Wed, 7 Aug 2019 09:06:38 -0700 Subject: [PATCH 5/6] SQUASH: Strip whitespace from Taxon column in utility function --- q2_types/feature_data/_transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py index ed5698d3..7b699775 100644 --- a/q2_types/feature_data/_transformer.py +++ b/q2_types/feature_data/_transformer.py @@ -88,6 +88,7 @@ def _taxonomy_formats_to_dataframe(filepath, has_header=None): "column names are duplicated: %s" % ', '.join(df.columns.get_duplicates())) + df['Taxon'] = df['Taxon'].str.strip() return df @@ -168,7 +169,6 @@ def _6(ff: TaxonomyFormat) -> pd.Series: @plugin.register_transformer def _28(ff: TaxonomyFormat) -> qiime2.Metadata: df = _taxonomy_formats_to_dataframe(str(ff), has_header=None) - df['Taxon'] = df['Taxon'].str.strip() return qiime2.Metadata(df) @@ -192,7 +192,6 @@ def _23(ff: TSVTaxonomyFormat) -> pd.Series: @plugin.register_transformer def _29(ff: TSVTaxonomyFormat) -> qiime2.Metadata: df = _taxonomy_formats_to_dataframe(str(ff), has_header=True) - df['Taxon'] = df['Taxon'].str.strip() return qiime2.Metadata(df) From 06e4766a437660a7b2cb2c5a35763a8f4d1da19f Mon Sep 17 00:00:00 2001 From: David Rodriguez Date: Thu, 8 Aug 2019 09:53:35 -0700 Subject: [PATCH 6/6] SQUASH: Added leading white space to test file --- .../feature_data/tests/data/taxonomy/start_end_space_taxon.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv b/q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv index 9c96da59..e254ffbc 100644 --- a/q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv +++ b/q2_types/feature_data/tests/data/taxonomy/start_end_space_taxon.tsv @@ -1,2 +1,2 @@ Feature ID Taxon Confidence -seq1 k__Foo; p__Bar -1.0 +seq1 k__Foo; p__Bar -1.0