Skip to content

Commit dd65a21

Browse files
committed
check for non-utf8 characters. Fix #1197
1 parent e492531 commit dd65a21

File tree

1 file changed

+28
-15
lines changed

1 file changed

+28
-15
lines changed

qiita_db/metadata_template/util.py

Lines changed: 28 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
from __future__ import division
1010
from future.utils import PY3
1111
from future.utils.six import StringIO
12+
from csv import reader
1213

1314
import pandas as pd
1415
import numpy as np
@@ -215,21 +216,33 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
215216
# comment:
216217
# using the tab character as "comment" we remove rows that are
217218
# constituted only by delimiters, i.e. empty rows.
218-
template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
219-
infer_datetime_format=True,
220-
keep_default_na=False, na_values=[''],
221-
parse_dates=True, index_col=False, comment='\t',
222-
mangle_dupe_cols=False, converters={
223-
'sample_name': lambda x: str(x).strip(),
224-
# required sample template information
225-
'physical_location': str,
226-
'sample_type': str,
227-
# collection_timestamp is not added here
228-
'host_subject_id': str,
229-
'description': str,
230-
# common prep template information
231-
'center_name': str,
232-
'center_projct_name': str})
219+
try:
220+
template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
221+
encoding='utf-8', infer_datetime_format=True,
222+
keep_default_na=False, na_values=[''],
223+
parse_dates=True, index_col=False, comment='\t',
224+
mangle_dupe_cols=False, converters={
225+
'sample_name': lambda x: str(x).strip(),
226+
# required sample template information
227+
'physical_location': str,
228+
'sample_type': str,
229+
# collection_timestamp is not added here
230+
'host_subject_id': str,
231+
'description': str,
232+
# common prep template information
233+
'center_name': str,
234+
'center_projct_name': str})
235+
except UnicodeDecodeError:
236+
# Find row number and col number for utf-8 encoding errors
237+
errors = []
238+
for row, line in enumerate(holdfile):
239+
for col, cell in enumerate(line.split('\t')):
240+
try:
241+
cell.encode('utf-8')
242+
except UnicodeError:
243+
errors.append('Non-unicode value in cell at '
244+
'row %d col %d' % (row+1, col+1))
245+
raise ValueError(', '.join(errors))
233246

234247
# let pandas infer the dtypes of these columns, if the inference is
235248
# not correct, then we have to raise an error

0 commit comments

Comments
 (0)