Skip to content

Commit

Permalink
Merge 1744618 into cf45830
Browse files Browse the repository at this point in the history
  • Loading branch information
Oddant1 committed Aug 16, 2019
2 parents cf45830 + 1744618 commit f400ed8
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 6 deletions.
26 changes: 21 additions & 5 deletions q2_types/feature_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class DNAFASTAFormat(model.TextFileFormat):
def _validate_lines(self, max_lines):
FASTADNAValidator = re.compile(r'[ACGTURYKMSWBDHVN]+\r?\n?')
last_line_was_ID = False
ids = {}

with open(str(self), 'rb') as fh:
try:
Expand All @@ -143,18 +144,33 @@ def _validate_lines(self, max_lines):
return
if first[0] != ord(b'>'):
raise ValidationError("First line of file is not a valid "
"FASTA ID. FASTA IDs must start "
"with '>'")
"description. Descriptions must "
"start with '>'")
fh.seek(0)
for line_number, line in enumerate(fh, 1):
if line_number >= max_lines:
return
line = line.decode('utf-8-sig')
if line.startswith('>'):
if last_line_was_ID:
raise ValidationError('Multiple consecutive IDs '
'starting on line '
f'{line_number-1!r}')
raise ValidationError('Multiple consecutive '
'descriptions starting on '
f'line {line_number-1!r}')
line = line.split()
if line[0] == '>':
if len(line) == 1:
raise ValidationError(
f'Description on line {line_number} is '
'missing an ID.')
else:
raise ValidationError(
f'ID on line {line_number} starts with a '
'space. IDs may not start with spaces')
if line[0] in ids:
raise ValidationError(
f'ID on line {line_number} is a duplicate of '
f'another ID on line {ids[line[0]]}.')
ids[line[0]] = line_number
last_line_was_ID = True
elif re.fullmatch(FASTADNAValidator, line):
last_line_was_ID = False
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>SEQUENCE1
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
>SEQUENCE1
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
ACGTACGTACGTACGTACGTACGT
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
> this_id_starts_with_a_space
1 change: 1 addition & 0 deletions q2_types/feature_data/tests/data/dna-sequences-no-id.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
>
25 changes: 24 additions & 1 deletion q2_types/feature_data/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,8 @@ def test_dna_fasta_format_consecutive_IDs(self):
filepath = self.get_data_path('dna-sequences-consecutive-ids.fasta')
format = DNAFASTAFormat(filepath, mode='r')

with self.assertRaisesRegex(ValidationError, 'consecutive IDs.*1'):
with self.assertRaisesRegex(
ValidationError, 'consecutive descriptions.*1'):
format.validate()

def test_dna_fasta_format_missing_initial_ID(self):
Expand Down Expand Up @@ -201,6 +202,28 @@ def test_dna_sequences_directory_format(self):

format.validate()

def test_dna_fasta_format_duplicate_ids(self):
    """Validation must reject the duplicate-ID FASTA fixture.

    The raised ``ValidationError`` message has to contain "3", then
    "duplicate", then "1", in that order.
    """
    fasta = DNAFASTAFormat(
        self.get_data_path('dna-sequences-duplicate-id.fasta'), mode='r')

    # Callable form of assertRaisesRegex: validate() is invoked for us and
    # its exception message is searched with the pattern.
    self.assertRaisesRegex(
        ValidationError, '3.*duplicate.*1', fasta.validate)

def test_dna_fasta_format_no_id(self):
    """Validation must reject the no-ID FASTA fixture.

    The raised ``ValidationError`` message has to contain "1" followed
    later by "missing an ID".
    """
    fasta = DNAFASTAFormat(
        self.get_data_path('dna-sequences-no-id.fasta'), mode='r')

    # Callable form of assertRaisesRegex: validate() is invoked for us and
    # its exception message is searched with the pattern.
    self.assertRaisesRegex(
        ValidationError, '1.*missing an ID', fasta.validate)

def test_dna_fasta_format_id_starts_with_space(self):
    """Validation must reject the ID-starts-with-space FASTA fixture.

    The raised ``ValidationError`` message has to contain the text
    "1 starts with a space".
    """
    fixture_name = 'dna-sequences-id-starts-with-space.fasta'
    fasta = DNAFASTAFormat(self.get_data_path(fixture_name), mode='r')

    # Callable form of assertRaisesRegex: validate() is invoked for us and
    # its exception message is searched with the pattern.
    self.assertRaisesRegex(
        ValidationError, '1 starts with a space', fasta.validate)

def test_paired_dna_sequences_directory_format(self):
filepath = self.get_data_path('dna-sequences.fasta')
temp_dir = self.temp_dir.name
Expand Down

0 comments on commit f400ed8

Please sign in to comment.