Skip to content

Commit

Permalink
API: allow for optionally ignoring mismatches in sequence record descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
wasade committed Nov 11, 2020
1 parent a968396 commit 0a4f60f
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 53 deletions.
105 changes: 56 additions & 49 deletions q2_demux/_demux.py
Expand Up @@ -100,9 +100,11 @@ def _maintain_open_fh_count(per_sample_fastqs, paired=False):


class BarcodeSequenceFastqIterator(collections.abc.Iterable):
def __init__(self, barcode_generator, sequence_generator):
def __init__(self, barcode_generator, sequence_generator,
ignore_description_mismatch=False):
self.barcode_generator = barcode_generator
self.sequence_generator = sequence_generator
self.ignore_description_mismatch = ignore_description_mismatch

def __iter__(self):
# Adapted from q2-types
Expand All @@ -126,34 +128,36 @@ def __iter__(self):
(_trim_id(barcode_header.id),
_trim_id(sequence_header.id)))

# if a description field is present, confirm that they're equal
if barcode_header.description is None and \
sequence_header.description is None:
pass
elif barcode_header.description is None:
raise ValueError(
'Barcode header lines do not contain description fields '
'but sequence header lines do.')
elif sequence_header.description is None:
raise ValueError(
'Sequence header lines do not contain description fields '
'but barcode header lines do.')
elif _trim_description(barcode_header.description) != \
_trim_description(sequence_header.description):
raise ValueError(
'Mismatched sequence descriptions: %s and %s' %
(_trim_description(barcode_header.description),
_trim_description(sequence_header.description)))
if not self.ignore_description_mismatch:
# if a description field is present, confirm that they're equal
if barcode_header.description is None and \
sequence_header.description is None:
pass
elif barcode_header.description is None:
raise ValueError(
'Barcode header lines do not contain description '
'fields but sequence header lines do.')
elif sequence_header.description is None:
raise ValueError(
'Sequence header lines do not contain description '
'fields but barcode header lines do.')
elif _trim_description(barcode_header.description) != \
_trim_description(sequence_header.description):
raise ValueError(
'Mismatched sequence descriptions: %s and %s' %
(_trim_description(barcode_header.description),
_trim_description(sequence_header.description)))

yield barcode_record, sequence_record


class BarcodePairedSequenceFastqIterator(collections.abc.Iterable):
def __init__(self, barcode_generator, forward_generator,
reverse_generator):
reverse_generator, ignore_description_mismatch=False):
self.barcode_generator = barcode_generator
self.forward_generator = forward_generator
self.reverse_generator = reverse_generator
self.ignore_description_mismatch = ignore_description_mismatch

def __iter__(self):
# Adapted from q2-types
Expand Down Expand Up @@ -186,31 +190,32 @@ def __iter__(self):
_trim_id(forward_header.id),
_trim_id(reverse_header.id)))

# if a description field is present, confirm that they're equal
if barcode_header.description is None and \
forward_header.description is None and \
reverse_header.description is None:
pass
elif barcode_header.description is None:
raise ValueError(
'Barcode header lines do not contain description fields '
'but sequence header lines do.')
elif forward_header.description is None:
raise ValueError(
'Forward-read header lines do not contain description '
'fields but barcode header lines do.')
elif reverse_header.description is None:
raise ValueError(
'Reverse-read header lines do not contain description '
'fields but barcode header lines do.')
elif not (_trim_description(barcode_header.description) ==
_trim_description(forward_header.description) ==
_trim_description(reverse_header.description)):
raise ValueError(
'Mismatched sequence descriptions: %s, %s, and %s' %
(_trim_description(barcode_header.description),
_trim_description(forward_header.description),
_trim_description(reverse_header.description)))
if not self.ignore_description_mismatch:
# if a description field is present, confirm that they're equal
if barcode_header.description is None and \
forward_header.description is None and \
reverse_header.description is None:
pass
elif barcode_header.description is None:
raise ValueError(
'Barcode header lines do not contain description '
'fields but sequence header lines do.')
elif forward_header.description is None:
raise ValueError(
'Forward-read header lines do not contain description '
'fields but barcode header lines do.')
elif reverse_header.description is None:
raise ValueError(
'Reverse-read header lines do not contain description '
'fields but barcode header lines do.')
elif not (_trim_description(barcode_header.description) ==
_trim_description(forward_header.description) ==
_trim_description(reverse_header.description)):
raise ValueError(
'Mismatched sequence descriptions: %s, %s, and %s' %
(_trim_description(barcode_header.description),
_trim_description(forward_header.description),
_trim_description(reverse_header.description)))

yield barcode_record, forward_record, reverse_record

Expand Down Expand Up @@ -246,10 +251,11 @@ def emp_single(seqs: BarcodeSequenceFastqIterator,
barcodes: qiime2.CategoricalMetadataColumn,
golay_error_correction: bool = True,
rev_comp_barcodes: bool = False,
rev_comp_mapping_barcodes: bool = False
rev_comp_mapping_barcodes: bool = False,
ignore_description_mismatch: bool = False
) -> (SingleLanePerSampleSingleEndFastqDirFmt,
pd.DataFrame):

seqs.ignore_description_mismatch = ignore_description_mismatch
result = SingleLanePerSampleSingleEndFastqDirFmt()
barcode_map, barcode_len = _make_barcode_map(
barcodes, rev_comp_mapping_barcodes)
Expand Down Expand Up @@ -359,10 +365,11 @@ def emp_paired(seqs: BarcodePairedSequenceFastqIterator,
barcodes: qiime2.CategoricalMetadataColumn,
golay_error_correction: bool = True,
rev_comp_barcodes: bool = False,
rev_comp_mapping_barcodes: bool = False
rev_comp_mapping_barcodes: bool = False,
ignore_description_mismatch: bool = False
) -> (SingleLanePerSamplePairedEndFastqDirFmt,
pd.DataFrame):

seqs.ignore_description_mismatch = ignore_description_mismatch
result = SingleLanePerSamplePairedEndFastqDirFmt()
barcode_map, barcode_len = _make_barcode_map(
barcodes, rev_comp_mapping_barcodes)
Expand Down
14 changes: 10 additions & 4 deletions q2_demux/plugin_setup.py
Expand Up @@ -76,7 +76,8 @@
parameters={'barcodes': MetadataColumn[Categorical],
'golay_error_correction': Bool,
'rev_comp_barcodes': Bool,
'rev_comp_mapping_barcodes': Bool},
'rev_comp_mapping_barcodes': Bool,
'ignore_description_mismatch': Bool},
outputs=[('per_sample_sequences', SampleData[SequencesWithQuality]),
('error_correction_details', ErrorCorrectionDetails)],
input_descriptions={
Expand All @@ -91,7 +92,9 @@
'reverse complemented prior to demultiplexing.',
'rev_comp_mapping_barcodes': 'If provided, the barcode sequences in '
'the sample metadata will be reverse '
'complemented prior to demultiplexing.'
'complemented prior to demultiplexing.',
'ignore_description_mismatch': 'If True, ignore mismatches in '
'sequence record description fields.'
},
output_descriptions={
'per_sample_sequences': 'The resulting demultiplexed sequences.',
Expand All @@ -115,7 +118,8 @@
parameters={'barcodes': MetadataColumn[Categorical],
'golay_error_correction': Bool,
'rev_comp_barcodes': Bool,
'rev_comp_mapping_barcodes': Bool},
'rev_comp_mapping_barcodes': Bool,
'ignore_description_mismatch': Bool},
outputs=[
('per_sample_sequences', SampleData[PairedEndSequencesWithQuality]),
('error_correction_details', ErrorCorrectionDetails),
Expand All @@ -132,7 +136,9 @@
'reverse complemented prior to demultiplexing.',
'rev_comp_mapping_barcodes': 'If provided, the barcode sequences in '
'the sample metadata will be reverse '
'complemented prior to demultiplexing.'
'complemented prior to demultiplexing.',
'ignore_description_mismatch': 'If True, ignore mismatches in '
'sequence record description fields.'
},
output_descriptions={
'per_sample_sequences': 'The resulting demultiplexed sequences.',
Expand Down
55 changes: 55 additions & 0 deletions q2_demux/tests/test_demux.py
Expand Up @@ -105,6 +105,21 @@ def test_mismatched_description(self):
with self.assertRaises(ValueError):
list(bsi)

def test_mismatch_description_override(self):
    # The last record pair carries different description tokens ('abc' on
    # the barcode read, 'abd' on the sequence read); the remaining pairs
    # differ only in the trailing read number.
    barcode_reads = [
        ('@s1/2 abc/2', 'AAAA', '+', 'YYYY'),
        ('@s2/2 abc/2', 'AAAA', '+', 'PPPP'),
        ('@s3/2 abc/2', 'AACC', '+', 'PPPP'),
        ('@s4/2 abc/2', 'AACC', '+', 'PPPP'),
    ]
    sequence_reads = [
        ('@s1/1 abc/1', 'GGG', '+', 'YYY'),
        ('@s2/1 abc/1', 'CCC', '+', 'PPP'),
        ('@s3/1 abc/1', 'AAA', '+', 'PPP'),
        ('@s4/1 abd/1', 'TTT', '+', 'PPP'),
    ]

    # With the override enabled, iterating must not raise and must produce
    # one item for each of the four input records.
    iterator = BarcodeSequenceFastqIterator(
        barcode_reads, sequence_reads, ignore_description_mismatch=True)
    yielded = list(iterator)
    self.assertEqual(len(yielded), 4)

def test_mismatched_handles_slashes_in_id(self):
# mismatch is detected as being before the last slash, even if there
# is more than one slash
Expand Down Expand Up @@ -791,6 +806,46 @@ def test_rev_comp_mapping_barcodes(self):
self.check_valid(self.bpsi, barcodes, rev_comp_mapping_barcodes=True,
golay_error_correction=False)

def test_mismatched_description(self):
    # The fourth forward/reverse records use the description token 'abd'
    # while the fourth barcode record uses 'abc'.
    barcode_reads = [
        ('@s1/2 abc/2', 'AAAA', '+', 'YYYY'),
        ('@s2/2 abc/2', 'AAAA', '+', 'PPPP'),
        ('@s3/2 abc/2', 'AACC', '+', 'PPPP'),
        ('@s4/2 abc/2', 'AACC', '+', 'PPPP'),
    ]
    forward_reads = [
        ('@s1/1 abc/1', 'GGG', '+', 'YYY'),
        ('@s2/1 abc/1', 'CCC', '+', 'PPP'),
        ('@s3/1 abc/1', 'AAA', '+', 'PPP'),
        ('@s4/1 abd/1', 'TTT', '+', 'PPP'),
    ]
    reverse_reads = [
        ('@s1/1 abc/1', 'GGG', '+', 'YYY'),
        ('@s2/1 abc/1', 'CCC', '+', 'PPP'),
        ('@s3/1 abc/1', 'AAA', '+', 'PPP'),
        ('@s4/1 abd/1', 'TTT', '+', 'PPP'),
    ]

    # Default construction (no override): fully consuming the iterator is
    # expected to raise ValueError.
    iterator = BarcodePairedSequenceFastqIterator(
        barcode_reads, forward_reads, reverse_reads)
    with self.assertRaises(ValueError):
        list(iterator)

def test_mismatch_description_override(self):
    # Same fixture shape as the mismatch test: the fourth forward/reverse
    # records use 'abd' where the fourth barcode record uses 'abc'.
    barcode_reads = [
        ('@s1/2 abc/2', 'AAAA', '+', 'YYYY'),
        ('@s2/2 abc/2', 'AAAA', '+', 'PPPP'),
        ('@s3/2 abc/2', 'AACC', '+', 'PPPP'),
        ('@s4/2 abc/2', 'AACC', '+', 'PPPP'),
    ]
    forward_reads = [
        ('@s1/1 abc/1', 'GGG', '+', 'YYY'),
        ('@s2/1 abc/1', 'CCC', '+', 'PPP'),
        ('@s3/1 abc/1', 'AAA', '+', 'PPP'),
        ('@s4/1 abd/1', 'TTT', '+', 'PPP'),
    ]
    reverse_reads = [
        ('@s1/1 abc/1', 'GGG', '+', 'YYY'),
        ('@s2/1 abc/1', 'CCC', '+', 'PPP'),
        ('@s3/1 abc/1', 'AAA', '+', 'PPP'),
        ('@s4/1 abd/1', 'TTT', '+', 'PPP'),
    ]

    # With the override enabled, iterating must not raise and must produce
    # one item for each of the four input records.
    iterator = BarcodePairedSequenceFastqIterator(
        barcode_reads,
        forward_reads,
        reverse_reads,
        ignore_description_mismatch=True,
    )
    yielded = list(iterator)
    self.assertEqual(len(yielded), 4)

def test_rev_comp_barcodes(self):
barcodes = [('@s1/2 abc/2', 'TTTT', '+', 'YYYY'),
('@s2/2 abc/2', 'TTAA', '+', 'PPPP'),
Expand Down

0 comments on commit 0a4f60f

Please sign in to comment.