diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py index 02716a4cf..199269495 100644 --- a/qiita_db/metadata_template/test/test_util.py +++ b/qiita_db/metadata_template/test/test_util.py @@ -13,7 +13,8 @@ import pandas as pd from pandas.util.testing import assert_frame_equal -from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning +from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning, + QiitaDBError) from qiita_db.metadata_template.util import ( get_datatypes, as_python_types, prefix_sample_names_with_id, load_template_to_dataframe, get_invalid_sample_names) @@ -153,6 +154,11 @@ def test_load_template_to_dataframe_lowercase(self): exp.rename(columns={"str_column": "str_CoLumn"}, inplace=True) assert_frame_equal(obs, exp) + def test_load_template_to_dataframe_non_utf8(self): + bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962') + with self.assertRaises(QiitaDBError): + load_template_to_dataframe(StringIO(bad)) + def test_load_template_to_dataframe_typechecking(self): obs = load_template_to_dataframe( StringIO(EXP_SAMPLE_TEMPLATE_LAT_ALL_INT)) diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py index 863e7bbfa..b6a6521fc 100644 --- a/qiita_db/metadata_template/util.py +++ b/qiita_db/metadata_template/util.py @@ -15,7 +15,8 @@ import warnings from skbio.io.util import open_file -from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning +from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning, + QiitaDBError) from .constants import CONTROLLED_COLS if PY3: @@ -147,6 +148,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True): to the needed type. QiitaDBWarning When columns are dropped because they have no content for any sample. + QiitaDBError + When non-UTF-8 characters are found in the file. 
Notes ----- @@ -215,21 +218,34 @@ def load_template_to_dataframe(fn, strip_whitespace=True): # comment: # using the tab character as "comment" we remove rows that are # constituted only by delimiters i. e. empty rows. - template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t', - infer_datetime_format=True, - keep_default_na=False, na_values=[''], - parse_dates=True, index_col=False, comment='\t', - mangle_dupe_cols=False, converters={ - 'sample_name': lambda x: str(x).strip(), - # required sample template information - 'physical_location': str, - 'sample_type': str, - # collection_timestamp is not added here - 'host_subject_id': str, - 'description': str, - # common prep template information - 'center_name': str, - 'center_projct_name': str}) + try: + template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t', + encoding='utf-8', infer_datetime_format=True, + keep_default_na=False, na_values=[''], + parse_dates=True, index_col=False, comment='\t', + mangle_dupe_cols=False, converters={ + 'sample_name': lambda x: str(x).strip(), + # required sample template information + 'physical_location': str, + 'sample_type': str, + # collection_timestamp is not added here + 'host_subject_id': str, + 'description': str, + # common prep template information + 'center_name': str, + 'center_projct_name': str}) + except UnicodeDecodeError: + # Find row number and col number for utf-8 encoding errors + headers = holdfile[0].strip().split('\t') + errors = [] + for row, line in enumerate(holdfile, 1): + for col, cell in enumerate(line.split('\t')): + try: + cell.encode('utf-8') + except UnicodeError: + errors.append('row %d, header %s' % (row, headers[col])) + raise QiitaDBError('Non UTF-8 characters found at ' + + '; '.join(errors)) # let pandas infer the dtypes of these columns, if the inference is # not correct, then we have to raise an error diff --git a/qiita_pet/handlers/study_handlers/description_handlers.py b/qiita_pet/handlers/study_handlers/description_handlers.py index 
0f68c63e8..4f3295c79 100644 --- a/qiita_pet/handlers/study_handlers/description_handlers.py +++ b/qiita_pet/handlers/study_handlers/description_handlers.py @@ -190,7 +190,7 @@ def process_sample_template(self, study, user, callback): except (TypeError, QiitaDBColumnError, QiitaDBExecutionError, QiitaDBDuplicateError, IOError, ValueError, KeyError, - CParserError, QiitaDBDuplicateHeaderError) as e: + CParserError, QiitaDBDuplicateHeaderError, QiitaDBError) as e: # Some error occurred while processing the sample template # Show the error to the user so they can fix the template msg = html_error_message % ('parsing the sample template:', @@ -422,9 +422,9 @@ def add_prep_template(self, study, user, callback): if warns: msg = '; '.join([str(w.message) for w in warns]) msg_level = 'warning' - except (TypeError, QiitaDBColumnError, QiitaDBExecutionError, - QiitaDBDuplicateError, IOError, ValueError, - CParserError) as e: + except (TypeError, QiitaDBError, QiitaDBColumnError, + QiitaDBExecutionError, QiitaDBDuplicateError, IOError, + ValueError, CParserError) as e: pt_id = None # Some error occurred while processing the prep template # Show the error to the user so he can fix the template