Skip to content

Commit 6819e8a

Browse files
committed
Merge pull request #1198 from squirrelo/utf8-issue
Check for non-utf8 characters. Fix #1197
2 parents e492531 + 2494f33 commit 6819e8a

File tree

3 files changed

+43
-21
lines changed

3 files changed

+43
-21
lines changed

qiita_db/metadata_template/test/test_util.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
import pandas as pd
1414
from pandas.util.testing import assert_frame_equal
1515

16-
from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning
16+
from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning,
17+
QiitaDBError)
1718
from qiita_db.metadata_template.util import (
1819
get_datatypes, as_python_types, prefix_sample_names_with_id,
1920
load_template_to_dataframe, get_invalid_sample_names)
@@ -153,6 +154,11 @@ def test_load_template_to_dataframe_lowercase(self):
153154
exp.rename(columns={"str_column": "str_CoLumn"}, inplace=True)
154155
assert_frame_equal(obs, exp)
155156

157+
def test_load_template_to_dataframe_non_utf8(self):
158+
bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962')
159+
with self.assertRaises(QiitaDBError):
160+
load_template_to_dataframe(StringIO(bad))
161+
156162
def test_load_template_to_dataframe_typechecking(self):
157163
obs = load_template_to_dataframe(
158164
StringIO(EXP_SAMPLE_TEMPLATE_LAT_ALL_INT))

qiita_db/metadata_template/util.py

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
import warnings
1616
from skbio.io.util import open_file
1717

18-
from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning
18+
from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning,
19+
QiitaDBError)
1920
from .constants import CONTROLLED_COLS
2021

2122
if PY3:
@@ -147,6 +148,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
147148
to the needed type.
148149
QiitaDBWarning
149150
When columns are dropped because they have no content for any sample.
151+
QiitaDBError
152+
When non UTF-8 characters are found in the file.
150153
151154
Notes
152155
-----
@@ -215,21 +218,34 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
215218
# comment:
216219
# using the tab character as "comment" we remove rows that are
217220
# constituted only by delimiters, i.e. empty rows.
218-
template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
219-
infer_datetime_format=True,
220-
keep_default_na=False, na_values=[''],
221-
parse_dates=True, index_col=False, comment='\t',
222-
mangle_dupe_cols=False, converters={
223-
'sample_name': lambda x: str(x).strip(),
224-
# required sample template information
225-
'physical_location': str,
226-
'sample_type': str,
227-
# collection_timestamp is not added here
228-
'host_subject_id': str,
229-
'description': str,
230-
# common prep template information
231-
'center_name': str,
232-
'center_projct_name': str})
221+
try:
222+
template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
223+
encoding='utf-8', infer_datetime_format=True,
224+
keep_default_na=False, na_values=[''],
225+
parse_dates=True, index_col=False, comment='\t',
226+
mangle_dupe_cols=False, converters={
227+
'sample_name': lambda x: str(x).strip(),
228+
# required sample template information
229+
'physical_location': str,
230+
'sample_type': str,
231+
# collection_timestamp is not added here
232+
'host_subject_id': str,
233+
'description': str,
234+
# common prep template information
235+
'center_name': str,
236+
'center_projct_name': str})
237+
except UnicodeDecodeError:
238+
# Find row number and col number for utf-8 encoding errors
239+
headers = holdfile[0].strip().split('\t')
240+
errors = []
241+
for row, line in enumerate(holdfile, 1):
242+
for col, cell in enumerate(line.split('\t')):
243+
try:
244+
cell.encode('utf-8')
245+
except UnicodeError:
246+
errors.append('row %d, header %s' % (row, headers[col]))
247+
raise QiitaDBError('Non UTF-8 characters found at ' +
248+
'; '.join(errors))
233249

234250
# let pandas infer the dtypes of these columns, if the inference is
235251
# not correct, then we have to raise an error

qiita_pet/handlers/study_handlers/description_handlers.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def process_sample_template(self, study, user, callback):
190190

191191
except (TypeError, QiitaDBColumnError, QiitaDBExecutionError,
192192
QiitaDBDuplicateError, IOError, ValueError, KeyError,
193-
CParserError, QiitaDBDuplicateHeaderError) as e:
193+
CParserError, QiitaDBDuplicateHeaderError, QiitaDBError) as e:
194194
# Some error occurred while processing the sample template
195195
# Show the error to the user so they can fix the template
196196
msg = html_error_message % ('parsing the sample template:',
@@ -422,9 +422,9 @@ def add_prep_template(self, study, user, callback):
422422
if warns:
423423
msg = '; '.join([str(w.message) for w in warns])
424424
msg_level = 'warning'
425-
except (TypeError, QiitaDBColumnError, QiitaDBExecutionError,
426-
QiitaDBDuplicateError, IOError, ValueError,
427-
CParserError) as e:
425+
except (TypeError, QiitaDBError, QiitaDBColumnError,
426+
QiitaDBExecutionError, QiitaDBDuplicateError, IOError,
427+
ValueError, CParserError) as e:
428428
pt_id = None
429429
# Some error occurred while processing the prep template
430430
# Show the error to the user so they can fix the template

0 commit comments

Comments
 (0)