From 6871b4283b67390a4a1749691b38f66406aeb430 Mon Sep 17 00:00:00 2001
From: Max von Hippel <maxvonhippel1996@gmail.com>
Date: Thu, 29 Jun 2017 11:06:19 -0700
Subject: [PATCH] BUG: Support sample IDs with underscores in summarize (#50)

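Look up each file's sample ID in the manifest instead of parsing it from
the filename prefix before the first underscore, which truncated sample
IDs that themselves contain underscores.

Fixes #49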
---
 q2_demux/_summarize/_visualizer.py | 20 ++++++++++++--------
 q2_demux/tests/test_demux.py       |  6 +++++-
 2 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/q2_demux/_summarize/_visualizer.py b/q2_demux/_summarize/_visualizer.py
index 6ad6848..16f2023 100644
--- a/q2_demux/_summarize/_visualizer.py
+++ b/q2_demux/_summarize/_visualizer.py
@@ -40,15 +40,14 @@ def __init__(self, directory_format, paired):
         self.paired = paired
 
 
-def _link_sample_n_to_file(files, counts, subsample_ns):
+def _link_sample_n_to_file(file_records, counts, subsample_ns):
     results = collections.defaultdict(list)
     for num in subsample_ns:
         total = 0
-        for file in files:
-            sample_name = os.path.basename(file).split('_', 1)[0]
-            total += counts[sample_name]
+        for file, sample_id in file_records:
+            total += counts[sample_id]
             if num < total:
-                idx = counts[sample_name] - (total - num)
+                idx = counts[sample_id] - (total - num)
                 results[file].append(idx)
                 break
     return results
@@ -109,12 +108,15 @@ def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None:
 
     per_sample_fastq_counts = {}
     reads = rev if not fwd and rev else fwd
+    file_records = []
     for file in reads:
         count = 0
         for seq in _read_fastq_seqs(file):
             count += 1
-        sample_name = os.path.basename(file).split('_', 1)[0]
-        per_sample_fastq_counts[sample_name] = count
+        sample_id = manifest.loc[manifest.filename == file,
+                                 'sample-id'].iloc[0]
+        per_sample_fastq_counts[sample_id] = count
+        file_records.append((file, sample_id))
 
     result = pd.Series(per_sample_fastq_counts)
     result.name = 'Sequence count'
@@ -131,7 +133,9 @@ def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None:
                         'was generated using all available sequences.')
 
     subsample_ns = sorted(random.sample(range(sequence_count), n))
-    link = _link_sample_n_to_file(reads, per_sample_fastq_counts, subsample_ns)
+    link = _link_sample_n_to_file(file_records,
+                                  per_sample_fastq_counts,
+                                  subsample_ns)
     if paired:
         sample_map = [(file, rev[fwd.index(file)], link[file])
                       for file in link]
diff --git a/q2_demux/tests/test_demux.py b/q2_demux/tests/test_demux.py
index e3b9bd5..2c1d6eb 100644
--- a/q2_demux/tests/test_demux.py
+++ b/q2_demux/tests/test_demux.py
@@ -651,7 +651,8 @@ def setUp(self):
     def test_basic(self):
         bsi = BarcodeSequenceFastqIterator(self.barcodes, self.sequences)
 
-        barcode_map = pd.Series(['AAAA', 'AACC'], index=['sample1', 'sample2'])
+        barcode_map = pd.Series(['AAAA', 'AACC'],
+                                index=['sample_1', 'sample2'])
         barcode_map = qiime2.MetadataCategory(barcode_map)
 
         demux_data = emp_single(bsi, barcode_map)
@@ -677,6 +678,9 @@ def test_basic(self):
                 html = fh.read()
                 self.assertIn('<td>Minimum:</td><td>1</td>', html)
                 self.assertIn('<td>Maximum:</td><td>3</td>', html)
+            with open(csv_fp, 'r') as ch:
+                csv = ch.read()
+                self.assertIn('sample_1', csv)
 
     def test_single_sample(self):
         bsi = BarcodeSequenceFastqIterator(self.barcodes[:1],