From 572a6d6f3784676578ad9faa1a0df22898de9494 Mon Sep 17 00:00:00 2001
From: Antony Simard <als872@nau.edu>
Date: Thu, 9 Jul 2020 10:20:14 -0700
Subject: [PATCH 1/2] IMP: Implement AlignedDNAFASTAFormat validation

---
 q2_types/feature_data/_format.py | 111 ++++++++++++++++++++++++++-----
 1 file changed, 95 insertions(+), 16 deletions(-)

diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py
index c93cba55..408b1c57 100644
--- a/q2_types/feature_data/_format.py
+++ b/q2_types/feature_data/_format.py
@@ -7,7 +7,6 @@
 # ----------------------------------------------------------------------------
 
 import re
-import skbio.io
 
 import qiime2.plugin.model as model
 from qiime2.plugin import ValidationError
@@ -221,22 +220,102 @@ class PairedDNASequencesDirectoryFormat(model.DirectoryFormat):
 
 
 class AlignedDNAFASTAFormat(model.TextFileFormat):
-    def sniff(self):
-        filepath = str(self)
-        sniffer = skbio.io.io_registry.get_sniffer('fasta')
-        if sniffer(filepath)[0]:
-            generator = skbio.io.read(filepath, constructor=skbio.DNA,
-                                      format='fasta', verify=False)
+    def _validate_lines(self, max_lines):
+        FASTADNAValidator = re.compile(r'[ACGTURYKMSWBDHVN.-]+\r?\n?')
+        ValidationSet = frozenset(('A', 'C', 'G', 'T', 'U', 'R', 'Y', 'K', 'M',
+                                   'S', 'W', 'B', 'D', 'H', 'V', 'N', '.',
+                                   '-'))
+
+        last_line_was_ID = False
+        ids = {}
+
+        seq_len = 0
+        prev_seq_len = 0
+        prev_seq_start_line = 0
+
+        with open(str(self), 'rb') as fh:
             try:
-                initial_length = len(next(generator))
-                for seq, _ in zip(generator, range(4)):
-                    if len(seq) != initial_length:
-                        return False
-                return True
-            # ValueError raised by skbio if there are invalid DNA chars.
-            except (StopIteration, ValueError):
-                pass
-        return False
+                first = fh.read(6)
+                if first[:3] == b'\xEF\xBB\xBF':
+                    first = first[3:]
+
+                # Empty files should validate
+                if first.strip() == b'':
+                    return
+
+                if first[0] != ord(b'>'):
+                    raise ValidationError("First line of file is not a valid "
+                                          "description. Descriptions must "
+                                          "start with '>'")
+                fh.seek(0)
+
+                for line_number, line in enumerate(fh, 1):
+                    if line_number >= max_lines:
+                        return
+                    line = line.decode('utf-8-sig')
+
+                    if line.startswith('>'):
+                        if seq_len == 0:
+                            seq_len = prev_seq_len
+                        elif prev_seq_len != seq_len:
+                            raise ValidationError(
+                                'The sequence starting on line '
+                                f'{prev_seq_start_line} was length '
+                                f'{prev_seq_len}. All previous sequences '
+                                f'were length {seq_len}. All sequences must '
+                                'be the same length for '
+                                'AlignedDNAFASTAFormat.')
+
+                        prev_seq_len = 0
+                        prev_seq_start_line = 0
+
+                        if last_line_was_ID:
+                            raise ValidationError('Multiple consecutive '
+                                                  'descriptions starting on '
+                                                  f'line {line_number-1!r}')
+
+                        line = line.split()
+
+                        if line[0] == '>':
+                            if len(line) == 1:
+                                raise ValidationError(
+                                    f'Description on line {line_number} is '
+                                    'missing an ID.')
+                            else:
+                                raise ValidationError(
+                                    f'ID on line {line_number} starts with a '
+                                    'space. IDs may not start with spaces')
+
+                        if line[0] in ids:
+                            raise ValidationError(
+                                f'ID on line {line_number} is a duplicate of '
+                                f'another ID on line {ids[line[0]]}.')
+
+                        ids[line[0]] = line_number
+                        last_line_was_ID = True
+
+                    elif re.fullmatch(FASTADNAValidator, line):
+                        if prev_seq_start_line == 0:
+                            prev_seq_start_line = line
+
+                        prev_seq_len += len(line)
+                        last_line_was_ID = False
+                    else:
+                        for position, character in enumerate(line):
+                            if character not in ValidationSet:
+                                raise ValidationError(
+                                    f"Invalid character '{character}' at "
+                                    f"position {position} on line "
+                                    f"{line_number} (does not match IUPAC "
+                                    "characters for a DNA sequence).")
+
+            except UnicodeDecodeError as e:
+                raise ValidationError(f'utf-8 cannot decode byte on line '
+                                      f'{line_number}') from e
+
+    def _validate_(self, max_lines):
+        level_map = {'min': 100, 'max': float('inf')}
+        self._validate_lines(level_map[max_lines])
 
 
 AlignedDNASequencesDirectoryFormat = model.SingleFileDirectoryFormat(

From 7c47b297bbb820e68db74ec1159120a8c8876115 Mon Sep 17 00:00:00 2001
From: Antony Simard <als872@nau.edu>
Date: Thu, 9 Jul 2020 10:32:09 -0700
Subject: [PATCH 2/2] SQUASH: Fix typo (store line_number, not line, in prev_seq_start_line)

---
 q2_types/feature_data/_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py
index 408b1c57..bc8a0170 100644
--- a/q2_types/feature_data/_format.py
+++ b/q2_types/feature_data/_format.py
@@ -296,7 +296,7 @@ def _validate_lines(self, max_lines):
 
                     elif re.fullmatch(FASTADNAValidator, line):
                         if prev_seq_start_line == 0:
-                            prev_seq_start_line = line
+                            prev_seq_start_line = line_number
 
                         prev_seq_len += len(line)
                         last_line_was_ID = False