From 6871b4283b67390a4a1749691b38f66406aeb430 Mon Sep 17 00:00:00 2001 From: Max von Hippel Date: Thu, 29 Jun 2017 11:06:19 -0700 Subject: [PATCH] BUG: Support sample IDs with underscores in summarize (#50) Fixes #49 --- q2_demux/_summarize/_visualizer.py | 20 ++++++++++++-------- q2_demux/tests/test_demux.py | 6 +++++- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/q2_demux/_summarize/_visualizer.py b/q2_demux/_summarize/_visualizer.py index 6ad6848..16f2023 100644 --- a/q2_demux/_summarize/_visualizer.py +++ b/q2_demux/_summarize/_visualizer.py @@ -40,15 +40,14 @@ def __init__(self, directory_format, paired): self.paired = paired -def _link_sample_n_to_file(files, counts, subsample_ns): +def _link_sample_n_to_file(file_records, counts, subsample_ns): results = collections.defaultdict(list) for num in subsample_ns: total = 0 - for file in files: - sample_name = os.path.basename(file).split('_', 1)[0] - total += counts[sample_name] + for file, sample_id in file_records: + total += counts[sample_id] if num < total: - idx = counts[sample_name] - (total - num) + idx = counts[sample_id] - (total - num) results[file].append(idx) break return results @@ -109,12 +108,15 @@ def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None: per_sample_fastq_counts = {} reads = rev if not fwd and rev else fwd + file_records = [] for file in reads: count = 0 for seq in _read_fastq_seqs(file): count += 1 - sample_name = os.path.basename(file).split('_', 1)[0] - per_sample_fastq_counts[sample_name] = count + sample_id = manifest.loc[manifest.filename == file, + 'sample-id'].iloc[0] + per_sample_fastq_counts[sample_id] = count + file_records.append((file, sample_id)) result = pd.Series(per_sample_fastq_counts) result.name = 'Sequence count' @@ -131,7 +133,9 @@ def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None: 'was generated using all available sequences.') subsample_ns = sorted(random.sample(range(sequence_count), n)) - link = _link_sample_n_to_file(reads, per_sample_fastq_counts, subsample_ns) + link = _link_sample_n_to_file(file_records, + per_sample_fastq_counts, + subsample_ns) if paired: sample_map = [(file, rev[fwd.index(file)], link[file]) for file in link] diff --git a/q2_demux/tests/test_demux.py b/q2_demux/tests/test_demux.py index e3b9bd5..2c1d6eb 100644 --- a/q2_demux/tests/test_demux.py +++ b/q2_demux/tests/test_demux.py @@ -651,7 +651,8 @@ def setUp(self): def test_basic(self): bsi = BarcodeSequenceFastqIterator(self.barcodes, self.sequences) - barcode_map = pd.Series(['AAAA', 'AACC'], index=['sample1', 'sample2']) + barcode_map = pd.Series(['AAAA', 'AACC'], + index=['sample_1', 'sample2']) barcode_map = qiime2.MetadataCategory(barcode_map) demux_data = emp_single(bsi, barcode_map) @@ -677,6 +678,9 @@ def test_basic(self): html = fh.read() self.assertIn('Minimum:1', html) self.assertIn('Maximum:3', html) + with open(csv_fp, 'r') as ch: + csv = ch.read() + self.assertIn('sample_1', csv) def test_single_sample(self): bsi = BarcodeSequenceFastqIterator(self.barcodes[:1],