From f64ed9c7a9e2d09793fa116fafc739b91f8cde47 Mon Sep 17 00:00:00 2001
From: Anthony <oddant1@hotmail.com>
Date: Thu, 6 Jun 2019 14:01:14 -0700
Subject: [PATCH] BUG: Properly validate FASTA files closes#187

---
 q2_types/feature_data/_format.py              | 96 +++++++++++++++----
 .../data/dna-sequences-consecutive-ids.fasta  |  2 +
 .../dna-sequences-corrupt-characters.fasta    |  2 +
 .../dna-sequences-first-line-not-id.fasta     |  1 +
 q2_types/feature_data/tests/test_format.py    | 21 ++++
 5 files changed, 106 insertions(+), 16 deletions(-)
 create mode 100644 q2_types/feature_data/tests/data/dna-sequences-consecutive-ids.fasta
 create mode 100644 q2_types/feature_data/tests/data/dna-sequences-corrupt-characters.fasta
 create mode 100644 q2_types/feature_data/tests/data/dna-sequences-first-line-not-id.fasta
diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py
index 64cc5c62..fbee8773 100644
--- a/q2_types/feature_data/_format.py
+++ b/q2_types/feature_data/_format.py
@@ -6,8 +6,13 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
+import re
+import io
+import math
+
 import skbio.io
 import qiime2.plugin.model as model
+from qiime2.plugin import ValidationError
 
 from ..plugin_setup import plugin
 
@@ -131,23 +136,82 @@ def sniff(self):
 
 
 class DNAFASTAFormat(model.TextFileFormat):
-    def sniff(self):
-        filepath = str(self)
-        sniffer = skbio.io.io_registry.get_sniffer('fasta')
-        if sniffer(filepath)[0]:
-            generator = skbio.io.read(filepath, constructor=skbio.DNA,
-                                      format='fasta', verify=False)
-            try:
-                for seq, _ in zip(generator, range(5)):
-                    pass
-                return True
-            # ValueError raised by skbio if there are invalid DNA chars.
-            except ValueError:
-                pass
+    def _check(self, level):
+        FASTADNAValidator = re.compile(r'[ACGTURYKMSWBDHVN\-\.]+\n?')
+        last_line_was_ID = False
 
-        # Empty files are ok also
-        empty_sniffer = skbio.io.io_registry.get_sniffer('<emptyfile>')
-        return empty_sniffer(filepath)[0]
+        with open(str(self)) as fh:
+            try:
+                line_number = 1
+                # If we read 2 and only get one \n we have only one \n in the
+                # file
+                first = fh.read(2)
+                # Empty files should pass
+                if first == '\n' or first == '':
+                    return
+                if first[0] != '>':
+                    raise ValidationError(
+                        "First line of file is not a valid FASTA ID. FASTA "
+                        "IDs must start with '>'")
+                fh.seek(0)
+                line = fh.readline()
+                while line != '' and line_number < level:
+                    if line.startswith('>'):
+                        if last_line_was_ID:
+                            raise ValidationError(
+                                'Multiple consecutive IDs starting on line '
+                                f'{line_number-1!r}')
+                        last_line_was_ID = True
+                    elif re.fullmatch(FASTADNAValidator, line):
+                        last_line_was_ID = False
+                    else:
+                        raise ValidationError(
+                            f'Invalid sequence on line {line_number}')
+                    line_number += 1
+                    line = fh.readline()
+            except UnicodeDecodeError as e:
+                # We tell() on the buffer because we want the actual buffer the
+                # error occured in, tell() on the file handle will report being
+                # at the end of the last buffer when the error occured well
+                # into the next buffer
+                buffer = fh.buffer
+                pos = buffer.tell()
+                # We want to start our read from the beginning of the buffer we
+                # encountered the error on because we have yet to count any
+                # lines in that buffer
+                pos = (math.ceil(pos / io.DEFAULT_BUFFER_SIZE) - 1) * \
+                    io.DEFAULT_BUFFER_SIZE
+                buffer.seek(pos)
+                # e.start reports the position of the bad byte in the current
+                # buffer not the file, this is the cause of all the pain
+                # determining which buffer in the file the error occured in so
+                # we can find the bad byte's position in the file not just the
+                # buffer. If the bad byte is the final byte in a buffer it will
+                # report it as being the first byte in the next buffer. This is
+                # likely due to the fact that e.start reports the buffer
+                # position of the bad byte, and e.end reports the buffer
+                # position of the next good byte. If the last byte in a buffer
+                # is bad then e.start should be 8192 and e.end should be byte 0
+                # of the next buffer, but end being less than start doesn't
+                # really make sense, so this is handled by bumping e.start to 0
+                # and e.end to 1
+                fh_error = io.TextIOWrapper(io.BytesIO(buffer.read(e.start)),
+                                            errors='ignore')
+                # We need to count the lines in the buffer the error occured in
+                # only counting newline terminated lines to ensure we do not
+                # double count the line the error occured on, the line count is
+                # initialized to one which accounts for the error line, and the
+                # error line will always terminate with the bad byte not a
+                # newline
+                for line in fh_error:
+                    if line[-1] == '\n':
+                        line_number += 1
+                raise ValidationError('Unicode cannot decode byte on line '
+                                      f'{line_number}') from e
+
+    def _validate_(self, level):
+        level_map = {'min': 100, 'max': float('inf')}
+        self._check(level_map[level])
 
 
 DNASequencesDirectoryFormat = model.SingleFileDirectoryFormat(
diff --git a/q2_types/feature_data/tests/data/dna-sequences-consecutive-ids.fasta b/q2_types/feature_data/tests/data/dna-sequences-consecutive-ids.fasta
new file mode 100644
index 00000000..bead64a8
--- /dev/null
+++ b/q2_types/feature_data/tests/data/dna-sequences-consecutive-ids.fasta
@@ -0,0 +1,2 @@
+>This is an ID
+>This is another ID
diff --git a/q2_types/feature_data/tests/data/dna-sequences-corrupt-characters.fasta b/q2_types/feature_data/tests/data/dna-sequences-corrupt-characters.fasta
new file mode 100644
index 00000000..94224b84
--- /dev/null
+++ b/q2_types/feature_data/tests/data/dna-sequences-corrupt-characters.fasta
@@ -0,0 +1,2 @@
+>This data is corrupt
+åå
\ No newline at end of file
diff --git a/q2_types/feature_data/tests/data/dna-sequences-first-line-not-id.fasta b/q2_types/feature_data/tests/data/dna-sequences-first-line-not-id.fasta
new file mode 100644
index 00000000..8c7e4d62
--- /dev/null
+++ b/q2_types/feature_data/tests/data/dna-sequences-first-line-not-id.fasta
@@ -0,0 +1 @@
+This is not an id
diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py
index a18a29f7..bb5e0cf9 100644
--- a/q2_types/feature_data/tests/test_format.py
+++ b/q2_types/feature_data/tests/test_format.py
@@ -159,6 +159,27 @@ def test_dna_fasta_format_empty_file(self):
 
         format.validate()
 
+    def test_dna_fasta_format_consecutive_IDs(self):
+        filepath = self.get_data_path('dna-sequences-consecutive-ids.fasta')
+        format = DNAFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(ValidationError, 'consecutive IDs.*1'):
+            format.validate()
+
+    def test_dna_fasta_format_missing_initial_ID(self):
+        filepath = self.get_data_path('dna-sequences-first-line-not-id.fasta')
+        format = DNAFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(ValidationError, 'First line'):
+            format.validate()
+
+    def test_dna_fasta_format_corrupt_characters(self):
+        filepath = self.get_data_path('dna-sequences-corrupt-characters.fasta')
+        format = DNAFASTAFormat(filepath, mode='r')
+
+        with self.assertRaisesRegex(ValidationError, 'Unicode.*2'):
+            format.validate()
+
     def test_dna_sequences_directory_format(self):
         filepath = self.get_data_path('dna-sequences.fasta')
         shutil.copy(filepath,