qiime2 · maxvonhippel · Jul 14, 2017 · Jul 14, 2017 · Jul 14, 2017 · Jul 14, 2017
diff --git a/q2_types/per_sample_sequences/_format.py b/q2_types/per_sample_sequences/_format.py
@@ -9,19 +9,34 @@
 import skbio.io
 import yaml
 import qiime2.plugin.model as model
+import pandas as pd
 
 from ..plugin_setup import plugin
 
 
 class FastqManifestFormat(model.TextFileFormat):
     """
     Mapping of sample identifiers to filepaths and read direction.
+    sniff() reads every row: no missing fields, no repeated id/direction.
 
     """
     def sniff(self):
         with self.open() as fh:
             header = fh.readline()
-            return header.strip() == 'sample-id,filename,direction'
+            if header.strip() != 'sample-id,filename,direction':
+                return False
+            try:
+                manifest = pd.read_csv(fh, comment='#', header=None,
+                                       skip_blank_lines=True, dtype=object)
+                manifest.columns = ['sample-id', 'filename', 'direction']
+                if manifest.isnull().values.any():
+                    return False
+                id_direction = manifest.drop('filename', axis=1)
+                if id_direction.duplicated().any():
+                    return False
+            except Exception:  # unparsable or wrong column count: no match
+                return False
+        return True
 
 
 class YamlFormat(model.TextFileFormat):

diff --git a/q2_types/per_sample_sequences/_transformer.py b/q2_types/per_sample_sequences/_transformer.py
@@ -43,10 +43,11 @@ def _1(dirfmt: SingleLanePerSampleSingleEndFastqDirFmt) \
     fh = iter(dirfmt.manifest.view(FastqManifestFormat).open())
     next(fh)
     for line in fh:
-        sample_id, filename, _ = line.split(',')
-        filepath = str(dirfmt.path / filename)
-        result[sample_id] = skbio.io.read(filepath, format='fastq',
-                                          constructor=skbio.DNA)
+        if not line.startswith('#'):
+            sample_id, filename, _ = line.split(',')
+            filepath = str(dirfmt.path / filename)
+            result[sample_id] = skbio.io.read(filepath, format='fastq',
+                                              constructor=skbio.DNA)
     return result
 
 

diff --git a/q2_types/per_sample_sequences/tests/data/duplicate-MANIFEST b/q2_types/per_sample_sequences/tests/data/duplicate-MANIFEST
@@ -0,0 +1,4 @@
+sample-id,filename,direction
+# important comment
+Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz,forward
+Human-Kneecap,Human-Kneecap_S4_L001_R1_001.fastq.gz,forward
diff --git a/q2_types/per_sample_sequences/tests/data/extra-MANIFEST b/q2_types/per_sample_sequences/tests/data/extra-MANIFEST
@@ -0,0 +1,3 @@
+sample-id,filename,direction
+Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz,forward,banana
+Human-Kneecap,Human-Kneecap_S1_L001_R2_001.fastq.gz,reverse
diff --git a/q2_types/per_sample_sequences/tests/data/extra-opposite-MANIFEST b/q2_types/per_sample_sequences/tests/data/extra-opposite-MANIFEST
@@ -0,0 +1,3 @@
+sample-id,filename,direction
+Human-Kneecap,Human-Kneecap_S1_L001_R2_001.fastq.gz,reverse
+Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz,forward,banana
diff --git a/q2_types/per_sample_sequences/tests/data/lesser-MANIFEST b/q2_types/per_sample_sequences/tests/data/lesser-MANIFEST
@@ -0,0 +1,3 @@
+sample-id,filename,direction
+Human-Kneecap,Human-Kneecap_S1_L001_R2_001.fastq.gz,reverse
+Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz
diff --git a/q2_types/per_sample_sequences/tests/data/lesser-opposite-MANIFEST b/q2_types/per_sample_sequences/tests/data/lesser-opposite-MANIFEST
@@ -0,0 +1,3 @@
+sample-id,filename,direction
+Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz
+Human-Kneecap,Human-Kneecap_S1_L001_R2_001.fastq.gz,reverse
diff --git a/q2_types/per_sample_sequences/tests/data/single_end_data/MANIFEST b/q2_types/per_sample_sequences/tests/data/single_end_data/MANIFEST
@@ -1,2 +1,3 @@
 sample-id,filename,direction
-Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz,forward
+# important comment
+Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz,forward
diff --git a/q2_types/per_sample_sequences/tests/test_format.py b/q2_types/per_sample_sequences/tests/test_format.py
@@ -73,6 +73,41 @@ def test_fastq_manifest_format_validate_negative(self):
         with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
             format.validate()
 
+    def test_fastq_manifest_format_validate_negative_extra_col(self):
+        # The first data row of this fixture carries a fourth field.
+        manifest = FastqManifestFormat(
+            self.get_data_path('extra-MANIFEST'), mode='r')
+        with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
+            manifest.validate()
+
+    def test_fastq_manifest_format_validate_negative_extra_col_order(self):
+        # Same as the extra-column case, but the bad row comes second.
+        manifest = FastqManifestFormat(
+            self.get_data_path('extra-opposite-MANIFEST'), mode='r')
+        with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
+            manifest.validate()
+
+    def test_fastq_manifest_format_validate_negative_missing_col(self):
+        # The second data row of this fixture has no direction field.
+        manifest = FastqManifestFormat(
+            self.get_data_path('lesser-MANIFEST'), mode='r')
+        with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
+            manifest.validate()
+
+    def test_fastq_manifest_format_validate_negative_missing_col_order(self):
+        # Same as the missing-column case, but the short row comes first.
+        manifest = FastqManifestFormat(
+            self.get_data_path('lesser-opposite-MANIFEST'), mode='r')
+        with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
+            manifest.validate()
+
+    def test_fastq_manifest_format_validate_negative_duplicate_id(self):
+        # Two rows share the same sample id and the same read direction.
+        manifest = FastqManifestFormat(
+            self.get_data_path('duplicate-MANIFEST'), mode='r')
+        with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
+            manifest.validate()
+
     def test_casava_one_eight_slanepsample_dir_fmt_validate_positive(self):
         filepath = self.get_data_path('Human-Kneecap_S1_L001_R1_001.fastq.gz')
         shutil.copy(filepath, self.temp_dir.name)