15 | 15 | import warnings |
16 | 16 | from skbio.io.util import open_file |
17 | 17 |
18 | | -from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning |
| 18 | +from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning, |
| 19 | + QiitaDBError) |
19 | 20 | from .constants import CONTROLLED_COLS |
20 | 21 |
21 | 22 | if PY3: |
@@ -147,6 +148,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True): |
147 | 148 | to the needed type. |
148 | 149 | QiitaDBWarning |
149 | 150 | When columns are dropped because they have no content for any sample. |
| 151 | + QiitaDBError |
| 152 | + When non-UTF-8 characters are found in the file. |
150 | 153 |
151 | 154 | Notes |
152 | 155 | ----- |
@@ -215,21 +218,34 @@ def load_template_to_dataframe(fn, strip_whitespace=True): |
215 | 218 | # comment: |
216 | 219 | # using the tab character as "comment" we remove rows that are |
217 | 220 | # constituted only by delimiters i. e. empty rows. |
218 | | - template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t', |
219 | | - infer_datetime_format=True, |
220 | | - keep_default_na=False, na_values=[''], |
221 | | - parse_dates=True, index_col=False, comment='\t', |
222 | | - mangle_dupe_cols=False, converters={ |
223 | | - 'sample_name': lambda x: str(x).strip(), |
224 | | - # required sample template information |
225 | | - 'physical_location': str, |
226 | | - 'sample_type': str, |
227 | | - # collection_timestamp is not added here |
228 | | - 'host_subject_id': str, |
229 | | - 'description': str, |
230 | | - # common prep template information |
231 | | - 'center_name': str, |
232 | | - 'center_projct_name': str}) |
| 221 | + try: |
| 222 | + template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t', |
| 223 | + encoding='utf-8', infer_datetime_format=True, |
| 224 | + keep_default_na=False, na_values=[''], |
| 225 | + parse_dates=True, index_col=False, comment='\t', |
| 226 | + mangle_dupe_cols=False, converters={ |
| 227 | + 'sample_name': lambda x: str(x).strip(), |
| 228 | + # required sample template information |
| 229 | + 'physical_location': str, |
| 230 | + 'sample_type': str, |
| 231 | + # collection_timestamp is not added here |
| 232 | + 'host_subject_id': str, |
| 233 | + 'description': str, |
| 234 | + # common prep template information |
| 235 | + 'center_name': str, |
| 236 | + 'center_projct_name': str}) |
| 237 | + except UnicodeDecodeError: |
| 238 | + # Find row number and col number for utf-8 encoding errors |
| 239 | + headers = holdfile[0].strip().split('\t') |
| 240 | + errors = [] |
| 241 | + for row, line in enumerate(holdfile, 1): |
| 242 | + for col, cell in enumerate(line.split('\t')): |
| 243 | + try: |
| 244 | + cell.encode('utf-8') |
| 245 | + except UnicodeError: |
| 246 | + errors.append('row %d, header %s' % (row, headers[col])) |
| 247 | + raise QiitaDBError('Non UTF-8 characters found at ' + |
| 248 | + '; '.join(errors)) |
233 | 249 |
234 | 250 | # let pandas infer the dtypes of these columns, if the inference is |
235 | 251 | # not correct, then we have to raise an error |
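A note on the `comment='\t'` idiom that the in-code comment above describes: with pandas' C parser, a line whose first character is the comment character is skipped outright, while the same character later in a line still acts as the field delimiter, so rows made up only of tabs disappear without truncating real data rows. The sketch below illustrates that behaviour with a made-up three-column template; treat it as a sketch, since the semantics of `comment` equal to `sep` are worth re-checking against the pandas version the project actually pins.

```python
import pandas as pd
from io import StringIO

# Made-up template text: the third line consists solely of tab delimiters,
# i.e. an "empty" row that should be dropped rather than parsed.
toy = (u"sample_name\tphysical_location\tsample_type\n"
       u"sample.1 \tlab A\tstool\n"
       u"\t\t\n"
       u"sample.2\tlab B\tsoil\n")

df = pd.read_csv(StringIO(toy), sep='\t', comment='\t', index_col=False,
                 keep_default_na=False, na_values=[''],
                 converters={'sample_name': lambda x: str(x).strip()})

# Expected output: 2 ['sample.1', 'sample.2']. The delimiter-only line is
# skipped as a comment line, and the converter strips the trailing space
# from 'sample.1 ', mirroring the call in the diff.
print(len(df), list(df['sample_name']))
```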
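On the new `except UnicodeDecodeError` branch: rather than letting the read fail opaquely, the loop reports the row number and the column header of every offending cell. Below is a minimal standalone sketch of the same idea; `find_non_utf8_cells` is a hypothetical helper, not part of qiita_db, and it uses an explicit `bytes.decode('utf-8')` check, whereas the diff calls `cell.encode('utf-8')`, which on Python 2 byte strings first decodes with the default ASCII codec and so trips over non-ASCII bytes.

```python
def find_non_utf8_cells(lines):
    """Return 'row N, header X' entries for cells that are not valid UTF-8.

    `lines` is a list of byte strings, one per line of the template file,
    mimicking the `holdfile` list used in load_template_to_dataframe.
    """
    headers = lines[0].strip().split(b'\t')
    errors = []
    for row, line in enumerate(lines, 1):
        for col, cell in enumerate(line.split(b'\t')):
            try:
                cell.decode('utf-8')
            except UnicodeDecodeError:
                errors.append('row %d, header %s'
                              % (row, headers[col].decode('ascii')))
    return errors


# 0xff can never occur in valid UTF-8, so the description cell of the
# second row is reported.
lines = [b'sample_name\tdescription\n',
         b'sample.1\tcaf\xff\n']
print('; '.join(find_non_utf8_cells(lines)))  # row 2, header description
```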