Skip to content

Commit

Permalink
Merge d3c8b99 into 42b97c5
Browse files Browse the repository at this point in the history
  • Loading branch information
Oddant1 committed Jun 19, 2019
2 parents 42b97c5 + d3c8b99 commit 7fa71d4
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 21 deletions.
61 changes: 44 additions & 17 deletions q2_types/feature_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import re
import skbio.io

import qiime2.plugin.model as model
import qiime2
from qiime2.plugin import ValidationError
import qiime2

from ..plugin_setup import plugin


Expand Down Expand Up @@ -132,23 +135,47 @@ def sniff(self):


class DNAFASTAFormat(model.TextFileFormat):
    """Text file format for FASTA files holding DNA sequences.

    Validation is line based: ID lines start with '>', and every other line
    must consist solely of the characters ``ACGTURYKMSWBDHVN`` followed by
    an optional line ending. Empty files are accepted.
    """

    def sniff(self):
        """Return whether the file sniffs as DNA FASTA or as an empty file.

        The first records (at most five) are read with skbio using the
        ``skbio.DNA`` constructor; a ``ValueError`` from skbio is treated as
        "not DNA" and falls through to the empty-file sniffer.

        NOTE(review): presumably superseded by ``_validate_`` below --
        confirm against the framework before removing.
        """
        filepath = str(self)
        sniffer = skbio.io.io_registry.get_sniffer('fasta')
        if sniffer(filepath)[0]:
            generator = skbio.io.read(filepath, constructor=skbio.DNA,
                                      format='fasta', verify=False)
            try:
                for seq, _ in zip(generator, range(5)):
                    pass
                return True
            # ValueError raised by skbio if there are invalid DNA chars.
            except ValueError:
                pass

        # Empty files are ok also
        empty_sniffer = skbio.io.io_registry.get_sniffer('<emptyfile>')
        return empty_sniffer(filepath)[0]

    def _validate_lines(self, max_lines):
        """Validate up to ``max_lines`` lines of the file.

        Parameters
        ----------
        max_lines : int or float
            Maximum number of lines to check; ``float('inf')`` checks the
            whole file.

        Raises
        ------
        ValidationError
            If the first line is not an ID line, if two ID lines follow each
            other, if a sequence line contains characters outside the
            accepted set, or if a line cannot be decoded as UTF-8.
        """
        sequence_line = re.compile(r'[ACGTURYKMSWBDHVN]+\r?\n?')
        last_line_was_ID = False

        with open(str(self), 'rb') as fh:
            # Peek at the start of the file, ignoring a UTF-8 byte order mark.
            first = fh.read(6)
            if first[:3] == b'\xEF\xBB\xBF':
                first = first[3:]
            if first.strip() == b'':
                # Empty files should validate, but a blank prefix alone does
                # not prove the file is empty: the remainder must be blank
                # too, otherwise the file starts with a non-ID line.
                if any(rest.strip() for rest in fh):
                    raise ValidationError("First line of file is not a valid "
                                          "FASTA ID. FASTA IDs must start "
                                          "with '>'")
                return
            if first[0] != ord(b'>'):
                raise ValidationError("First line of file is not a valid "
                                      "FASTA ID. FASTA IDs must start "
                                      "with '>'")
            fh.seek(0)
            for line_number, line in enumerate(fh, 1):
                # Stop only after max_lines lines have actually been checked.
                if line_number > max_lines:
                    return
                # Decoding is the only step that can raise
                # UnicodeDecodeError, so the try is kept that narrow.
                try:
                    line = line.decode('utf-8-sig')
                except UnicodeDecodeError as e:
                    raise ValidationError(f'utf-8 cannot decode byte on line '
                                          f'{line_number}') from e
                if line.startswith('>'):
                    if last_line_was_ID:
                        raise ValidationError('Multiple consecutive IDs '
                                              'starting on line '
                                              f'{line_number-1!r}')
                    last_line_was_ID = True
                elif sequence_line.fullmatch(line):
                    last_line_was_ID = False
                else:
                    raise ValidationError('Invalid characters on line '
                                          f'{line_number} (does not match '
                                          'IUPAC characters for a DNA '
                                          'sequence).')
            # NOTE(review): an ID on the final line with no sequence after it
            # is not reported here -- confirm whether that should be an error.

    def _validate_(self, max_lines):
        """Validate the file at the requested level.

        Parameters
        ----------
        max_lines : str
            Despite its name this is the validation level: 'min' checks the
            first 100 lines, 'max' checks every line.
        """
        level_map = {'min': 100, 'max': float('inf')}
        self._validate_lines(level_map[max_lines])


DNASequencesDirectoryFormat = model.SingleFileDirectoryFormat(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>This is an ID
>This is another ID
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>This data is corrupt
����������
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is not an id
1 change: 1 addition & 0 deletions q2_types/feature_data/tests/data/dna-with-bom-fails.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Not a valid id
2 changes: 2 additions & 0 deletions q2_types/feature_data/tests/data/dna-with-bom-passes.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>Some kinda DNA
ACGTACGTACGT
42 changes: 38 additions & 4 deletions q2_types/feature_data/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,11 @@ def test_dna_fasta_format_validate_positive(self):

format.validate()

def test_dna_fasta_format_bom_passes(self):
    """The 'dna-with-bom-passes.fasta' fixture must validate cleanly."""
    filepath = self.get_data_path('dna-with-bom-passes.fasta')
    fmt = DNAFASTAFormat(filepath, mode='r')

    # No assertRaises here: validate() raising is itself the test failure.
    fmt.validate()

def test_dna_fasta_format_empty_file(self):
filepath = os.path.join(self.temp_dir.name, 'empty')
Expand All @@ -159,6 +158,41 @@ def test_dna_fasta_format_empty_file(self):

format.validate()

def test_dna_fasta_format_validate_negative(self):
    """Validating the 'not-dna-sequences' fixture must raise an error
    whose message mentions 'DNAFASTA'."""
    fixture = self.get_data_path('not-dna-sequences')
    fmt = DNAFASTAFormat(fixture, mode='r')

    with self.assertRaisesRegex(ValidationError, 'DNAFASTA'):
        fmt.validate()

def test_dna_fasta_format_consecutive_IDs(self):
    """Validating the consecutive-IDs fixture must raise an error that
    names consecutive IDs and a line number containing 1."""
    fixture = self.get_data_path('dna-sequences-consecutive-ids.fasta')
    fmt = DNAFASTAFormat(fixture, mode='r')

    with self.assertRaisesRegex(ValidationError, 'consecutive IDs.*1'):
        fmt.validate()

def test_dna_fasta_format_missing_initial_ID(self):
    """Validating the first-line-not-id fixture must raise an error
    whose message mentions 'First line'."""
    fixture = self.get_data_path('dna-sequences-first-line-not-id.fasta')
    fmt = DNAFASTAFormat(fixture, mode='r')

    with self.assertRaisesRegex(ValidationError, 'First line'):
        fmt.validate()

def test_dna_fasta_format_corrupt_characters(self):
    """Validating the corrupt-characters fixture must raise an error
    that mentions utf-8 and a line number containing 2."""
    fixture = self.get_data_path('dna-sequences-corrupt-characters.fasta')
    fmt = DNAFASTAFormat(fixture, mode='r')

    with self.assertRaisesRegex(ValidationError, 'utf-8.*2'):
        fmt.validate()

def test_dna_fasta_format_bom_fails(self):
    """Validating the 'dna-with-bom-fails.fasta' fixture must raise an
    error whose message mentions 'First line'."""
    fixture = self.get_data_path('dna-with-bom-fails.fasta')
    fmt = DNAFASTAFormat(fixture, mode='r')

    with self.assertRaisesRegex(ValidationError, 'First line'):
        fmt.validate()

def test_dna_sequences_directory_format(self):
filepath = self.get_data_path('dna-sequences.fasta')
shutil.copy(filepath,
Expand Down

0 comments on commit 7fa71d4

Please sign in to comment.