|
9 | 9 | from __future__ import division |
10 | 10 | from future.utils import PY3 |
11 | 11 | from future.utils.six import StringIO |
| 12 | +from csv import reader |
12 | 13 |
|
13 | 14 | import pandas as pd |
14 | 15 | import numpy as np |
@@ -215,21 +216,33 @@ def load_template_to_dataframe(fn, strip_whitespace=True): |
215 | 216 | # comment: |
216 | 217 | # using the tab character as "comment" we remove rows that are |
217 | 218 | # constituted only by delimiters, i.e. empty rows. |
218 | | - template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t', |
219 | | - infer_datetime_format=True, |
220 | | - keep_default_na=False, na_values=[''], |
221 | | - parse_dates=True, index_col=False, comment='\t', |
222 | | - mangle_dupe_cols=False, converters={ |
223 | | - 'sample_name': lambda x: str(x).strip(), |
224 | | - # required sample template information |
225 | | - 'physical_location': str, |
226 | | - 'sample_type': str, |
227 | | - # collection_timestamp is not added here |
228 | | - 'host_subject_id': str, |
229 | | - 'description': str, |
230 | | - # common prep template information |
231 | | - 'center_name': str, |
232 | | - 'center_projct_name': str}) |
| 219 | + try: |
| 220 | + template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t', |
| 221 | + encoding='utf-8', infer_datetime_format=True, |
| 222 | + keep_default_na=False, na_values=[''], |
| 223 | + parse_dates=True, index_col=False, comment='\t', |
| 224 | + mangle_dupe_cols=False, converters={ |
| 225 | + 'sample_name': lambda x: str(x).strip(), |
| 226 | + # required sample template information |
| 227 | + 'physical_location': str, |
| 228 | + 'sample_type': str, |
| 229 | + # collection_timestamp is not added here |
| 230 | + 'host_subject_id': str, |
| 231 | + 'description': str, |
| 232 | + # common prep template information |
| 233 | + 'center_name': str, |
| 234 | + 'center_projct_name': str}) |
| 235 | + except UnicodeDecodeError: |
| 236 | + # Find row number and col number for utf-8 encoding errors |
| 237 | + errors = [] |
| 238 | + for row, line in enumerate(holdfile): |
| 239 | + for col, cell in enumerate(line.split('\t')): |
| 240 | + try: |
| 241 | + cell.encode('utf-8') |
| 242 | + except UnicodeError: |
| 243 | + errors.append('Non-unicode value in cell at ' |
| 244 | + 'row %d col %d' % (row+1, col+1)) |
| 245 | + raise ValueError(', '.join(errors)) |
233 | 246 |
|
234 | 247 | # let pandas infer the dtypes of these columns, if the inference is |
235 | 248 | # not correct, then we have to raise an error |
|
0 commit comments