From dd65a21b8ec57d099a9b7d4ce2bddb7dd61ee73b Mon Sep 17 00:00:00 2001 From: Joshua Shorenstein Date: Wed, 20 May 2015 11:13:03 -0700 Subject: [PATCH 1/5] check for non-utf8 characters. Fix #1197 --- qiita_db/metadata_template/util.py | 43 +++++++++++++++++++----------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py index 863e7bbfa..a1ed58bc7 100644 --- a/qiita_db/metadata_template/util.py +++ b/qiita_db/metadata_template/util.py @@ -9,6 +9,7 @@ from __future__ import division from future.utils import PY3 from future.utils.six import StringIO +from csv import reader import pandas as pd import numpy as np @@ -215,21 +216,33 @@ def load_template_to_dataframe(fn, strip_whitespace=True): # comment: # using the tab character as "comment" we remove rows that are # constituted only by delimiters i. e. empty rows. - template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t', - infer_datetime_format=True, - keep_default_na=False, na_values=[''], - parse_dates=True, index_col=False, comment='\t', - mangle_dupe_cols=False, converters={ - 'sample_name': lambda x: str(x).strip(), - # required sample template information - 'physical_location': str, - 'sample_type': str, - # collection_timestamp is not added here - 'host_subject_id': str, - 'description': str, - # common prep template information - 'center_name': str, - 'center_projct_name': str}) + try: + template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t', + encoding='utf-8', infer_datetime_format=True, + keep_default_na=False, na_values=[''], + parse_dates=True, index_col=False, comment='\t', + mangle_dupe_cols=False, converters={ + 'sample_name': lambda x: str(x).strip(), + # required sample template information + 'physical_location': str, + 'sample_type': str, + # collection_timestamp is not added here + 'host_subject_id': str, + 'description': str, + # common prep template information + 'center_name': str, + 
'center_projct_name': str}) + except UnicodeDecodeError: + # Find row number and col number for utf-8 encoding errors + errors = [] + for row, line in enumerate(holdfile): + for col, cell in enumerate(line.split('\t')): + try: + cell.encode('utf-8') + except UnicodeError: + errors.append('Non-unicode value in cell at ' + 'row %d col %d' % (row+1, col+1)) + raise ValueError(', '.join(errors)) # let pandas infer the dtypes of these columns, if the inference is # not correct, then we have to raise an error From 4c91caeae36975c17991f06bdd42297170ff67d0 Mon Sep 17 00:00:00 2001 From: Joshua Shorenstein Date: Wed, 20 May 2015 11:17:00 -0700 Subject: [PATCH 2/5] remove unneeded import --- qiita_db/metadata_template/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py index a1ed58bc7..51c0585a3 100644 --- a/qiita_db/metadata_template/util.py +++ b/qiita_db/metadata_template/util.py @@ -9,7 +9,6 @@ from __future__ import division from future.utils import PY3 from future.utils.six import StringIO -from csv import reader import pandas as pd import numpy as np From c415e74aef082ba02947a364bd75218fe2b8b28d Mon Sep 17 00:00:00 2001 From: Joshua Shorenstein Date: Wed, 20 May 2015 11:50:31 -0700 Subject: [PATCH 3/5] add suggestions --- qiita_db/metadata_template/test/test_util.py | 8 +++++++- qiita_db/metadata_template/util.py | 14 +++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py index 02716a4cf..199269495 100644 --- a/qiita_db/metadata_template/test/test_util.py +++ b/qiita_db/metadata_template/test/test_util.py @@ -13,7 +13,8 @@ import pandas as pd from pandas.util.testing import assert_frame_equal -from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning +from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning, + QiitaDBError) from qiita_db.metadata_template.util 
import ( get_datatypes, as_python_types, prefix_sample_names_with_id, load_template_to_dataframe, get_invalid_sample_names) @@ -153,6 +154,11 @@ def test_load_template_to_dataframe_lowercase(self): exp.rename(columns={"str_column": "str_CoLumn"}, inplace=True) assert_frame_equal(obs, exp) + def test_load_template_to_dataframe_non_utf8(self): + bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962') + with self.assertRaises(QiitaDBError): + load_template_to_dataframe(StringIO(bad)) + def test_load_template_to_dataframe_typechecking(self): obs = load_template_to_dataframe( StringIO(EXP_SAMPLE_TEMPLATE_LAT_ALL_INT)) diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py index 51c0585a3..b6a6521fc 100644 --- a/qiita_db/metadata_template/util.py +++ b/qiita_db/metadata_template/util.py @@ -15,7 +15,8 @@ import warnings from skbio.io.util import open_file -from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning +from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning, + QiitaDBError) from .constants import CONTROLLED_COLS if PY3: @@ -147,6 +148,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True): to the needed type. QiitaDBWarning When columns are dropped because they have no content for any sample. + QiitaDBError + When non UTF-8 characters are found in the file. 
Notes ----- @@ -233,15 +236,16 @@ def load_template_to_dataframe(fn, strip_whitespace=True): 'center_projct_name': str}) except UnicodeDecodeError: # Find row number and col number for utf-8 encoding errors + headers = holdfile[0].strip().split('\t') errors = [] - for row, line in enumerate(holdfile): + for row, line in enumerate(holdfile, 1): for col, cell in enumerate(line.split('\t')): try: cell.encode('utf-8') except UnicodeError: - errors.append('Non-unicode value in cell at ' - 'row %d col %d' % (row+1, col+1)) - raise ValueError(', '.join(errors)) + errors.append('row %d, header %s' % (row, headers[col])) + raise QiitaDBError('Non UTF-8 characters found at ' + + '; '.join(errors)) # let pandas infer the dtypes of these columns, if the inference is # not correct, then we have to raise an error From 5211e8911cdef73481d10f563f2507b0a7c0e8cb Mon Sep 17 00:00:00 2001 From: Joshua Shorenstein Date: Wed, 20 May 2015 12:24:34 -0700 Subject: [PATCH 4/5] revert to ValueError --- qiita_db/metadata_template/util.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py index b6a6521fc..0ddd92733 100644 --- a/qiita_db/metadata_template/util.py +++ b/qiita_db/metadata_template/util.py @@ -141,15 +141,13 @@ def load_template_to_dataframe(fn, strip_whitespace=True): Raises ------ ValueError - Empty file passed + Empty file passed, or non UTF-8 characters are found in the file. QiitaDBColumnError If the sample_name column is not present in the template. If there's a value in one of the reserved columns that cannot be cast to the needed type. QiitaDBWarning When columns are dropped because they have no content for any sample. - QiitaDBError - When non UTF-8 characters are found in the file. 
Notes ----- @@ -244,8 +242,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True): cell.encode('utf-8') except UnicodeError: errors.append('row %d, header %s' % (row, headers[col])) - raise QiitaDBError('Non UTF-8 characters found at ' + - '; '.join(errors)) + raise ValueError('Non UTF-8 characters found at ' + + '; '.join(errors)) # let pandas infer the dtypes of these columns, if the inference is # not correct, then we have to raise an error From 2494f33484c5a0635863d8f236253e0d9054ba27 Mon Sep 17 00:00:00 2001 From: Joshua Shorenstein Date: Wed, 20 May 2015 12:46:17 -0700 Subject: [PATCH 5/5] add QiitaDBError as displayed error --- qiita_db/metadata_template/util.py | 8 +++++--- qiita_pet/handlers/study_handlers/description_handlers.py | 8 ++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py index 0ddd92733..b6a6521fc 100644 --- a/qiita_db/metadata_template/util.py +++ b/qiita_db/metadata_template/util.py @@ -141,13 +141,15 @@ def load_template_to_dataframe(fn, strip_whitespace=True): Raises ------ ValueError - Empty file passed, or non UTF-8 characters are found in the file. + Empty file passed QiitaDBColumnError If the sample_name column is not present in the template. If there's a value in one of the reserved columns that cannot be cast to the needed type. QiitaDBWarning When columns are dropped because they have no content for any sample. + QiitaDBError + When non UTF-8 characters are found in the file. 
Notes ----- @@ -242,8 +244,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True): cell.encode('utf-8') except UnicodeError: errors.append('row %d, header %s' % (row, headers[col])) - raise ValueError('Non UTF-8 characters found at ' + - '; '.join(errors)) + raise QiitaDBError('Non UTF-8 characters found at ' + + '; '.join(errors)) # let pandas infer the dtypes of these columns, if the inference is # not correct, then we have to raise an error diff --git a/qiita_pet/handlers/study_handlers/description_handlers.py b/qiita_pet/handlers/study_handlers/description_handlers.py index 0f68c63e8..4f3295c79 100644 --- a/qiita_pet/handlers/study_handlers/description_handlers.py +++ b/qiita_pet/handlers/study_handlers/description_handlers.py @@ -190,7 +190,7 @@ def process_sample_template(self, study, user, callback): except (TypeError, QiitaDBColumnError, QiitaDBExecutionError, QiitaDBDuplicateError, IOError, ValueError, KeyError, - CParserError, QiitaDBDuplicateHeaderError) as e: + CParserError, QiitaDBDuplicateHeaderError, QiitaDBError) as e: # Some error occurred while processing the sample template # Show the error to the user so they can fix the template msg = html_error_message % ('parsing the sample template:', @@ -422,9 +422,9 @@ def add_prep_template(self, study, user, callback): if warns: msg = '; '.join([str(w.message) for w in warns]) msg_level = 'warning' - except (TypeError, QiitaDBColumnError, QiitaDBExecutionError, - QiitaDBDuplicateError, IOError, ValueError, - CParserError) as e: + except (TypeError, QiitaDBError, QiitaDBColumnError, + QiitaDBExecutionError, QiitaDBDuplicateError, IOError, + ValueError, CParserError) as e: pt_id = None # Some error occurred while processing the prep template # Show the error to the user so he can fix the template