From dd65a21b8ec57d099a9b7d4ce2bddb7dd61ee73b Mon Sep 17 00:00:00 2001
From: Joshua Shorenstein <squirrelo@gmail.com>
Date: Wed, 20 May 2015 11:13:03 -0700
Subject: [PATCH 1/5] check for non-utf8 characters. Fix #1197

---
 qiita_db/metadata_template/util.py | 43 +++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
index 863e7bbfa..a1ed58bc7 100644
--- a/qiita_db/metadata_template/util.py
+++ b/qiita_db/metadata_template/util.py
@@ -9,6 +9,7 @@
 from __future__ import division
 from future.utils import PY3
 from future.utils.six import StringIO
+from csv import reader
 
 import pandas as pd
 import numpy as np
@@ -215,21 +216,33 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
     # comment:
     #   using the tab character as "comment" we remove rows that are
     #   constituted only by delimiters i. e. empty rows.
-    template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
-                           infer_datetime_format=True,
-                           keep_default_na=False, na_values=[''],
-                           parse_dates=True, index_col=False, comment='\t',
-                           mangle_dupe_cols=False, converters={
-                               'sample_name': lambda x: str(x).strip(),
-                               # required sample template information
-                               'physical_location': str,
-                               'sample_type': str,
-                               # collection_timestamp is not added here
-                               'host_subject_id': str,
-                               'description': str,
-                               # common prep template information
-                               'center_name': str,
-                               'center_projct_name': str})
+    try:
+        template = pd.read_csv(StringIO(''.join(holdfile)), sep='\t',
+                               encoding='utf-8', infer_datetime_format=True,
+                               keep_default_na=False, na_values=[''],
+                               parse_dates=True, index_col=False, comment='\t',
+                               mangle_dupe_cols=False, converters={
+                                   'sample_name': lambda x: str(x).strip(),
+                                   # required sample template information
+                                   'physical_location': str,
+                                   'sample_type': str,
+                                   # collection_timestamp is not added here
+                                   'host_subject_id': str,
+                                   'description': str,
+                                   # common prep template information
+                                   'center_name': str,
+                                   'center_projct_name': str})
+    except UnicodeDecodeError:
+        # Find row number and col number for utf-8 encoding errors
+        errors = []
+        for row, line in enumerate(holdfile):
+            for col, cell in enumerate(line.split('\t')):
+                try:
+                    cell.encode('utf-8')
+                except UnicodeError:
+                    errors.append('Non-unicode value in cell at '
+                                  'row %d col %d' % (row+1, col+1))
+        raise ValueError(', '.join(errors))
 
     # let pandas infer the dtypes of these columns, if the inference is
     # not correct, then we have to raise an error

From 4c91caeae36975c17991f06bdd42297170ff67d0 Mon Sep 17 00:00:00 2001
From: Joshua Shorenstein <squirrelo@gmail.com>
Date: Wed, 20 May 2015 11:17:00 -0700
Subject: [PATCH 2/5] remove unneeded import

---
 qiita_db/metadata_template/util.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
index a1ed58bc7..51c0585a3 100644
--- a/qiita_db/metadata_template/util.py
+++ b/qiita_db/metadata_template/util.py
@@ -9,7 +9,6 @@
 from __future__ import division
 from future.utils import PY3
 from future.utils.six import StringIO
-from csv import reader
 
 import pandas as pd
 import numpy as np

From c415e74aef082ba02947a364bd75218fe2b8b28d Mon Sep 17 00:00:00 2001
From: Joshua Shorenstein <squirrelo@gmail.com>
Date: Wed, 20 May 2015 11:50:31 -0700
Subject: [PATCH 3/5] add suggestions

---
 qiita_db/metadata_template/test/test_util.py |  8 +++++++-
 qiita_db/metadata_template/util.py           | 14 +++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py
index 02716a4cf..199269495 100644
--- a/qiita_db/metadata_template/test/test_util.py
+++ b/qiita_db/metadata_template/test/test_util.py
@@ -13,7 +13,8 @@
 import pandas as pd
 from pandas.util.testing import assert_frame_equal
 
-from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning
+from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning,
+                                 QiitaDBError)
 from qiita_db.metadata_template.util import (
     get_datatypes, as_python_types, prefix_sample_names_with_id,
     load_template_to_dataframe, get_invalid_sample_names)
@@ -153,6 +154,11 @@ def test_load_template_to_dataframe_lowercase(self):
         exp.rename(columns={"str_column": "str_CoLumn"}, inplace=True)
         assert_frame_equal(obs, exp)
 
+    def test_load_template_to_dataframe_non_utf8(self):
+        bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962')
+        with self.assertRaises(QiitaDBError):
+            load_template_to_dataframe(StringIO(bad))
+
     def test_load_template_to_dataframe_typechecking(self):
         obs = load_template_to_dataframe(
             StringIO(EXP_SAMPLE_TEMPLATE_LAT_ALL_INT))
diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
index 51c0585a3..b6a6521fc 100644
--- a/qiita_db/metadata_template/util.py
+++ b/qiita_db/metadata_template/util.py
@@ -15,7 +15,8 @@
 import warnings
 from skbio.io.util import open_file
 
-from qiita_db.exceptions import QiitaDBColumnError, QiitaDBWarning
+from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBWarning,
+                                 QiitaDBError)
 from .constants import CONTROLLED_COLS
 
 if PY3:
@@ -147,6 +148,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
         to the needed type.
     QiitaDBWarning
         When columns are dropped because they have no content for any sample.
+    QiitaDBError
+        When non UTF-8 characters are found in the file.
 
     Notes
     -----
@@ -233,15 +236,16 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
                                    'center_projct_name': str})
     except UnicodeDecodeError:
         # Find row number and col number for utf-8 encoding errors
+        headers = holdfile[0].strip().split('\t')
         errors = []
-        for row, line in enumerate(holdfile):
+        for row, line in enumerate(holdfile, 1):
             for col, cell in enumerate(line.split('\t')):
                 try:
                     cell.encode('utf-8')
                 except UnicodeError:
-                    errors.append('Non-unicode value in cell at '
-                                  'row %d col %d' % (row+1, col+1))
-        raise ValueError(', '.join(errors))
+                    errors.append('row %d, header %s' % (row, headers[col]))
+        raise QiitaDBError('Non UTF-8 characters found at ' +
+                           '; '.join(errors))
 
     # let pandas infer the dtypes of these columns, if the inference is
     # not correct, then we have to raise an error

From 5211e8911cdef73481d10f563f2507b0a7c0e8cb Mon Sep 17 00:00:00 2001
From: Joshua Shorenstein <squirrelo@gmail.com>
Date: Wed, 20 May 2015 12:24:34 -0700
Subject: [PATCH 4/5] revert to ValueError

---
 qiita_db/metadata_template/util.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
index b6a6521fc..0ddd92733 100644
--- a/qiita_db/metadata_template/util.py
+++ b/qiita_db/metadata_template/util.py
@@ -141,15 +141,13 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
     Raises
     ------
     ValueError
-        Empty file passed
+        Empty file passed, or non UTF-8 characters are found in the file.
     QiitaDBColumnError
         If the sample_name column is not present in the template.
         If there's a value in one of the reserved columns that cannot be cast
         to the needed type.
     QiitaDBWarning
         When columns are dropped because they have no content for any sample.
-    QiitaDBError
-        When non UTF-8 characters are found in the file.
 
     Notes
     -----
@@ -244,8 +242,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
                     cell.encode('utf-8')
                 except UnicodeError:
                     errors.append('row %d, header %s' % (row, headers[col]))
-        raise QiitaDBError('Non UTF-8 characters found at ' +
-                           '; '.join(errors))
+        raise ValueError('Non UTF-8 characters found at ' +
+                         '; '.join(errors))
 
     # let pandas infer the dtypes of these columns, if the inference is
     # not correct, then we have to raise an error

From 2494f33484c5a0635863d8f236253e0d9054ba27 Mon Sep 17 00:00:00 2001
From: Joshua Shorenstein <squirrelo@gmail.com>
Date: Wed, 20 May 2015 12:46:17 -0700
Subject: [PATCH 5/5] add QiitaDBError as displayed error

---
 qiita_db/metadata_template/util.py                        | 8 +++++---
 qiita_pet/handlers/study_handlers/description_handlers.py | 8 ++++----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/qiita_db/metadata_template/util.py b/qiita_db/metadata_template/util.py
index 0ddd92733..b6a6521fc 100644
--- a/qiita_db/metadata_template/util.py
+++ b/qiita_db/metadata_template/util.py
@@ -141,13 +141,15 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
     Raises
     ------
     ValueError
-        Empty file passed, or non UTF-8 characters are found in the file.
+        Empty file passed
     QiitaDBColumnError
         If the sample_name column is not present in the template.
         If there's a value in one of the reserved columns that cannot be cast
         to the needed type.
     QiitaDBWarning
         When columns are dropped because they have no content for any sample.
+    QiitaDBError
+        When non UTF-8 characters are found in the file.
 
     Notes
     -----
@@ -242,8 +244,8 @@ def load_template_to_dataframe(fn, strip_whitespace=True):
                     cell.encode('utf-8')
                 except UnicodeError:
                     errors.append('row %d, header %s' % (row, headers[col]))
-        raise ValueError('Non UTF-8 characters found at ' +
-                         '; '.join(errors))
+        raise QiitaDBError('Non UTF-8 characters found at ' +
+                           '; '.join(errors))
 
     # let pandas infer the dtypes of these columns, if the inference is
     # not correct, then we have to raise an error
diff --git a/qiita_pet/handlers/study_handlers/description_handlers.py b/qiita_pet/handlers/study_handlers/description_handlers.py
index 0f68c63e8..4f3295c79 100644
--- a/qiita_pet/handlers/study_handlers/description_handlers.py
+++ b/qiita_pet/handlers/study_handlers/description_handlers.py
@@ -190,7 +190,7 @@ def process_sample_template(self, study, user, callback):
 
         except (TypeError, QiitaDBColumnError, QiitaDBExecutionError,
                 QiitaDBDuplicateError, IOError, ValueError, KeyError,
-                CParserError, QiitaDBDuplicateHeaderError) as e:
+                CParserError, QiitaDBDuplicateHeaderError, QiitaDBError) as e:
             # Some error occurred while processing the sample template
             # Show the error to the user so they can fix the template
             msg = html_error_message % ('parsing the sample template:',
@@ -422,9 +422,9 @@ def add_prep_template(self, study, user, callback):
                 if warns:
                     msg = '; '.join([str(w.message) for w in warns])
                     msg_level = 'warning'
-        except (TypeError, QiitaDBColumnError, QiitaDBExecutionError,
-                QiitaDBDuplicateError, IOError, ValueError,
-                CParserError) as e:
+        except (TypeError, QiitaDBError, QiitaDBColumnError,
+                QiitaDBExecutionError, QiitaDBDuplicateError, IOError,
+                ValueError, CParserError) as e:
             pt_id = None
             # Some error occurred while processing the prep template
             # Show the error to the user so he can fix the template