Merge pull request #1198 from squirrelo/utf8-issue

wasade · wasade · commit 6819e8ad6255 · 2015-05-20T13:42:17.000-07:00
Check for non-utf8 characters. Fix #1197
diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py
@@ -13,7 +13,8 @@
 import pandas as pd
 from pandas.util.testing import assert_frame_equal
 
-from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning
+from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning,
+                                 QiitaDBError)
 from qiita_db.metadata_template.util import (
     get_datatypes, as_python_types, prefix_sample_names_with_id,
     load_template_to_dataframe, get_invalid_sample_names)
@@ -153,6 +154,11 @@ def test_load_template_to_dataframe_lowercase(self):
         exp.rename(columns={"str_column": "str_CoLumn"}, inplace=True)
         assert_frame_equal(obs, exp)
 
+    def test_load_template_to_dataframe_non_utf8(self):
+        bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962')
+        with self.assertRaises(QiitaDBError):
+            load_template_to_dataframe(StringIO(bad))
+
     def test_load_template_to_dataframe_typechecking(self):
         obs = load_template_to_dataframe(
             StringIO(EXP_SAMPLE_TEMPLATE_LAT_ALL_INT))
diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
@@ -15,7 +15,8 @@
 import warnings
 from skbio.io.util import open_file
 
-from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning
+from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning,
+                                 QiitaDBError)
 from .constants import CONTROLLED_COLS
 
 if PY3:
@@ -147,6 +148,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
         to the needed type.
     QiitaDBWarning
         When columns are dropped because they have no content for any sample.
+    QiitaDBError
+        When non UTF-8 characters are found in the file.
 
     Notes
     -----
@@ -215,21 +218,34 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
     # comment:
     #   using the tab character as "comment" we remove rows that are
     #   constituted only by delimiters i. e. empty rows.
-    template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
-                           infer_datetime_format=True,
-                           keep_default_na=False, na_values=[''],
-                           parse_dates=True, index_col=False, comment='\t',
-                           mangle_dupe_cols=False, converters={
-                               'sample_name': lambda x: str(x).strip(),
-                               # required sample template information
-                               'physical_location': str,
-                               'sample_type': str,
-                               # collection_timestamp is not added here
-                               'host_subject_id': str,
-                               'description': str,
-                               # common prep template information
-                               'center_name': str,
-                               'center_projct_name': str})
+    try:
+        template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
+                               encoding='utf-8', infer_datetime_format=True,
+                               keep_default_na=False, na_values=[''],
+                               parse_dates=True, index_col=False, comment='\t',
+                               mangle_dupe_cols=False, converters={
+                                   'sample_name': lambda x: str(x).strip(),
+                                   # required sample template information
+                                   'physical_location': str,
+                                   'sample_type': str,
+                                   # collection_timestamp is not added here
+                                   'host_subject_id': str,
+                                   'description': str,
+                                   # common prep template information
+                                   'center_name': str,
+                                   'center_projct_name': str})
+    except UnicodeDecodeError:
+        # Report the row number and column header of each non UTF-8 cell
+        headers = holdfile[0].strip().split('\t')
+        errors = []
+        for row, line in enumerate(holdfile, 1):
+            for col, cell in enumerate(line.split('\t')):
+                try:
+                    cell.encode('utf-8')
+                except UnicodeError:
+                    errors.append('row %d, header %s' % (row, headers[col]))
+        raise QiitaDBError('Non UTF-8 characters found at ' +
+                           '; '.join(errors))
 
     # let pandas infer the dtypes of these columns, if the inference is
     # not correct, then we have to raise an error
diff --git a/qiita_pet/handlers/study_handlers/description_handlers.py b/qiita_pet/handlers/study_handlers/description_handlers.py
@@ -190,7 +190,7 @@ def process_sample_template(self, study, user, callback):
 
         except (TypeError, QiitaDBColumnError, QiitaDBExecutionError,
                 QiitaDBDuplicateError, IOError, ValueError, KeyError,
-                CParserError, QiitaDBDuplicateHeaderError) as e:
+                CParserError, QiitaDBDuplicateHeaderError, QiitaDBError) as e:
             # Some error occurred while processing the sample template
             # Show the error to the user so they can fix the template
             msg = html_error_message % ('parsing the sample template:',
@@ -422,9 +422,9 @@ def add_prep_template(self, study, user, callback):
                 if warns:
                     msg = '; '.join([str(w.message) for w in warns])
                     msg_level = 'warning'
-        except (TypeError, QiitaDBColumnError, QiitaDBExecutionError,
-                QiitaDBDuplicateError, IOError, ValueError,
-                CParserError) as e:
+        except (TypeError, QiitaDBError, QiitaDBColumnError,
+                QiitaDBExecutionError, QiitaDBDuplicateError, IOError,
+                ValueError, CParserError) as e:
             pt_id = None
             # Some error occurred while processing the prep template
             # Show the error to the user so he can fix the template