Skip to content

Commit

Permalink
ENH: Sniff for malformed MANIFEST
Browse files Browse the repository at this point in the history
  • Loading branch information
maxvonhippel committed Jul 14, 2017
1 parent 28ff86c commit 1bbe7d8
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 5 deletions.
18 changes: 17 additions & 1 deletion q2_types/per_sample_sequences/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import skbio.io
import yaml
import qiime2.plugin.model as model
import pandas as pd

from ..plugin_setup import plugin

Expand All @@ -21,7 +22,22 @@ class FastqManifestFormat(model.TextFileFormat):
def sniff(self):
    """Report whether the file looks like a well-formed FASTQ manifest.

    The first line must be exactly ``sample-id,filename,direction`` once
    surrounding whitespace is stripped. The remaining lines are parsed as
    CSV (blank lines and ``#`` comments are ignored) and must form exactly
    three columns with no repeated (sample-id, direction) pair.

    Returns
    -------
    bool
        ``True`` when every check passes. ``False`` otherwise, including
        when pandas cannot parse the body at all (ragged rows, or no data
        rows after the header).
    """
    with self.open() as fh:
        header = fh.readline()
        if header.strip() != 'sample-id,filename,direction':
            return False
        try:
            manifest = pd.read_csv(fh, comment='#', header=None,
                                   skip_blank_lines=True, dtype=object)
        except ValueError:
            # pandas' ParserError and EmptyDataError both derive from
            # ValueError. An unparseable body means "not this format",
            # so answer False rather than leaking a pandas exception.
            return False

    # Check the width *before* naming the columns: assigning three labels
    # to a frame of any other width raises instead of letting us answer.
    if manifest.shape[1] != 3:
        return False
    manifest.columns = ['sample-id', 'filename', 'direction']
    manifest = manifest.dropna(how='all')

    # A sample may appear once per read direction; the filename is
    # irrelevant when looking for repeats.
    if manifest.duplicated(subset=['sample-id', 'direction']).any():
        return False
    return True


class YamlFormat(model.TextFileFormat):
Expand Down
9 changes: 5 additions & 4 deletions q2_types/per_sample_sequences/_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@ def _1(dirfmt: SingleLanePerSampleSingleEndFastqDirFmt) \
fh = iter(dirfmt.manifest.view(FastqManifestFormat).open())
next(fh)
for line in fh:
sample_id, filename, _ = line.split(',')
filepath = str(dirfmt.path / filename)
result[sample_id] = skbio.io.read(filepath, format='fastq',
constructor=skbio.DNA)
if not line.startswith('#'):
sample_id, filename, _ = line.split(',')
filepath = str(dirfmt.path / filename)
result[sample_id] = skbio.io.read(filepath, format='fastq',
constructor=skbio.DNA)
return result


Expand Down
4 changes: 4 additions & 0 deletions q2_types/per_sample_sequences/tests/data/duplicate-MANIFEST
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
sample-id,filename,direction
# important comment
Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz,forward
Human-Kneecap,Human-Kneecap_S4_L001_R1_001.fastq.gz,forward
3 changes: 3 additions & 0 deletions q2_types/per_sample_sequences/tests/data/extra-Manifest
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id,filename,direction
Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz,forward,banana
Human-Kneecap,Human-Kneecap_S1_L001_R2_001.fastq.gz,reverse
3 changes: 3 additions & 0 deletions q2_types/per_sample_sequences/tests/data/lesser-MANIFEST
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id,filename,direction
Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz
Human-Kneecap,Human-Kneecap_S1_L001_R2_001.fastq.gz,reverse
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
sample-id,filename,direction
# important comment
Human-Kneecap,Human-Kneecap_S1_L001_R1_001.fastq.gz,forward
21 changes: 21 additions & 0 deletions q2_types/per_sample_sequences/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,27 @@ def test_fastq_manifest_format_validate_negative(self):
with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
format.validate()

def test_fastq_manifest_format_validate_negative_extra_col(self):
    # A manifest whose row carries a fourth field must be rejected.
    # NOTE: the fixture committed alongside this test is spelled
    # 'extra-Manifest' (mixed case), unlike its '-MANIFEST' siblings; the
    # name below must match it exactly on case-sensitive filesystems.
    filepath = self.get_data_path('extra-Manifest')
    manifest_fmt = FastqManifestFormat(filepath, mode='r')

    with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
        manifest_fmt.validate()

def test_fastq_manifest_format_validate_negative_missing_col(self):
    # The 'lesser-MANIFEST' fixture is expected to fail validation with a
    # ValueError that names FastqManifestFormat.
    manifest_fmt = FastqManifestFormat(
        self.get_data_path('lesser-MANIFEST'), mode='r')

    with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
        manifest_fmt.validate()

def test_fastq_manifest_format_validate_negative_duplicate_id(self):
    # The 'duplicate-MANIFEST' fixture is expected to fail validation with
    # a ValueError that names FastqManifestFormat.
    manifest_fmt = FastqManifestFormat(
        self.get_data_path('duplicate-MANIFEST'), mode='r')

    with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
        manifest_fmt.validate()

def test_casava_one_eight_slanepsample_dir_fmt_validate_positive(self):
filepath = self.get_data_path('Human-Kneecap_S1_L001_R1_001.fastq.gz')
shutil.copy(filepath, self.temp_dir.name)
Expand Down

0 comments on commit 1bbe7d8

Please sign in to comment.