Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Metadata-compatible manifest formats #210

Merged
merged 5 commits into from Apr 16, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion q2_types/per_sample_sequences/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
SingleEndFastqManifestPhred64,
PairedEndFastqManifestPhred33,
PairedEndFastqManifestPhred64,
SingleEndFastqManifestPhred33V2,
thermokarst marked this conversation as resolved.
Show resolved Hide resolved
SingleEndFastqManifestPhred64V2,
PairedEndFastqManifestPhred33V2,
PairedEndFastqManifestPhred64V2,
QIIME1DemuxFormat, QIIME1DemuxDirFmt)
from ._type import (Sequences, SequencesWithQuality,
PairedEndSequencesWithQuality,
Expand All @@ -32,7 +36,10 @@
'SequencesWithQuality', 'PairedEndSequencesWithQuality',
'JoinedSequencesWithQuality', 'SingleEndFastqManifestPhred33',
'SingleEndFastqManifestPhred64', 'PairedEndFastqManifestPhred33',
'PairedEndFastqManifestPhred64', 'QIIME1DemuxFormat',
'PairedEndFastqManifestPhred64', 'SingleEndFastqManifestPhred33V2',
'SingleEndFastqManifestPhred64V2',
'PairedEndFastqManifestPhred33V2',
'PairedEndFastqManifestPhred64V2', 'QIIME1DemuxFormat',
'QIIME1DemuxDirFmt']

importlib.import_module('q2_types.per_sample_sequences._transformer')
79 changes: 78 additions & 1 deletion q2_types/per_sample_sequences/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,90 @@
import itertools
import collections

import pandas as pd
import skbio
import skbio.io
import yaml
import qiime2
import qiime2.plugin.model as model
from qiime2.plugin import ValidationError

from ..plugin_setup import plugin


class FastqAbsolutePathManifestFormatV2(model.TextFileFormat):
    """
    Base class for mapping of sample identifiers to filepaths. This format
    relies heavily on the qiime2.Metadata on-disk format, as well as the
    validation rules and behavior.

    Subclasses must set ``METADATA_COLUMNS`` to a dict whose keys are the
    names of the filepath columns that must be present in the manifest.
    """
    # Overridden by subclasses; keys are the required filepath column names.
    METADATA_COLUMNS = None

    def _validate_(self, level):
        """
        Validate the manifest.

        The file must load as ``qiime2.Metadata``, contain every column named
        in ``METADATA_COLUMNS`` as a categorical column, and every cell in
        those columns must be a non-empty, existing, not-yet-seen filepath.
        ``level`` is accepted for the validation interface but is not
        consulted here.

        Raises
        ------
        ValidationError
            If the metadata cannot be loaded, a required column is absent, a
            filepath is missing or does not exist, or a filepath is listed
            more than once.
        """
        try:
            md = qiime2.Metadata.load(str(self))
        except qiime2.metadata.MetadataFileError as md_exc:
            raise ValidationError(md_exc) from md_exc

        # Only categorical columns are kept, so a required column holding
        # numeric values is reported by the lookup below as not being a
        # column at all.
        md = md.filter_columns(column_type='categorical')

        md_cols = dict()
        for column in self.METADATA_COLUMNS:
            try:
                md_cols[column] = md.get_column(column)
            except ValueError as md_exc:
                raise ValidationError(md_exc) from md_exc

        # Maps each filepath already seen to (sample id, column name, row)
        # so that duplicates can be reported with their first occurrence.
        filepaths = dict()
        for column_name, column in md_cols.items():
            column = column.to_series()
            # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is
            # the equivalent spelling available in old and new releases.
            # ``i`` is the 1-based position of the row within the column.
            for i, (id_, fp) in enumerate(column.items(), start=1):
                # QIIME 2 represents empty cells as np.nan once normalized
                if pd.isna(fp):
                    raise ValidationError(
                        'Missing filepath on line %d and column "%s".'
                        % (i, column_name))
                # Environment variables in the path are expanded before the
                # existence check.
                if not os.path.exists(os.path.expandvars(fp)):
                    raise ValidationError(
                        'Filepath on line %d and column "%s" could not '
                        'be found (%s) for sample "%s".'
                        % (i, column_name, fp, id_))
                if fp in filepaths:
                    old_id, old_col_name, old_row = filepaths[fp]
                    raise ValidationError(
                        'Filepath on line %d and column "%s" (sample "%s") '
                        'has already been registered on line %d and column '
                        '"%s" (sample "%s").'
                        % (i, column_name, id_, old_row, old_col_name, old_id))
                else:
                    filepaths[fp] = (id_, column_name, i)


class _SingleEndFastqManifestV2(FastqAbsolutePathManifestFormatV2):
    """Single-end V2 manifest: one filepath column, tagged as forward."""
    # Maps the required filepath column name to its read direction.
    METADATA_COLUMNS = {
        'absolute-filepath': 'forward',
    }


class SingleEndFastqManifestPhred33V2(_SingleEndFastqManifestV2):
    """Single-end V2 manifest registered under the Phred33 name."""


class SingleEndFastqManifestPhred64V2(_SingleEndFastqManifestV2):
    """Single-end V2 manifest registered under the Phred64 name."""


class _PairedEndFastqManifestV2(FastqAbsolutePathManifestFormatV2):
    """Paired-end V2 manifest: one forward and one reverse filepath column."""
    # Maps each required filepath column name to its read direction.
    METADATA_COLUMNS = {
        'forward-absolute-filepath': 'forward',
        'reverse-absolute-filepath': 'reverse',
    }


class PairedEndFastqManifestPhred33V2(_PairedEndFastqManifestV2):
    """Paired-end V2 manifest registered under the Phred33 name."""


class PairedEndFastqManifestPhred64V2(_PairedEndFastqManifestV2):
    """Paired-end V2 manifest registered under the Phred64 name."""


class _FastqManifestBase(model.TextFileFormat):
"""
Base class for mapping of sample identifiers to filepaths and read
Expand Down Expand Up @@ -390,5 +465,7 @@ def _validate_seq(self, seq):
_SingleLanePerSampleFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt, SingleEndFastqManifestPhred33,
SingleEndFastqManifestPhred64, PairedEndFastqManifestPhred33,
PairedEndFastqManifestPhred64, QIIME1DemuxFormat, QIIME1DemuxDirFmt
PairedEndFastqManifestPhred64, SingleEndFastqManifestPhred33V2,
SingleEndFastqManifestPhred64V2, PairedEndFastqManifestPhred33V2,
PairedEndFastqManifestPhred64V2, QIIME1DemuxFormat, QIIME1DemuxDirFmt
)
58 changes: 58 additions & 0 deletions q2_types/per_sample_sequences/_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
CasavaOneEightLanelessPerSampleDirFmt,
SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64,
PairedEndFastqManifestPhred33, PairedEndFastqManifestPhred64,
SingleEndFastqManifestPhred33V2,
SingleEndFastqManifestPhred64V2,
PairedEndFastqManifestPhred33V2,
PairedEndFastqManifestPhred64V2,
QIIME1DemuxDirFmt)


Expand Down Expand Up @@ -387,3 +391,57 @@ def _21(ff: FastqManifestFormat) -> pd.DataFrame:
values='filename')
df.columns.name = None
return df


def _manifest_v2_to_v1(fmt):
    """
    Convert a V2 (metadata-style) manifest into a ``FastqManifestFormat``.

    The V2 manifest has one row per sample and one filepath column per read
    direction. Each filepath column named in ``fmt.METADATA_COLUMNS`` is
    turned into rows of ``sample-id``, ``absolute-filepath`` and
    ``direction``, and the rows for all columns are written out together as
    CSV.

    Parameters
    ----------
    fmt
        A V2 manifest format instance exposing ``METADATA_COLUMNS``, a dict
        of filepath column name to read direction.

    Returns
    -------
    FastqManifestFormat
        A newly created manifest holding the denormalized rows.
    """
    df = qiime2.Metadata.load(str(fmt)).to_dataframe()
    # Drop unnecessary metadata columns
    df = df[list(fmt.METADATA_COLUMNS.keys())]
    denormalized_dfs = []
    for column, direction in fmt.METADATA_COLUMNS.items():
        single_column = df[[column]]
        original_index_name = single_column.index.name
        # Build a new frame instead of mutating the column slice in place;
        # in-place edits on a slice are what trigger pandas'
        # SettingWithCopyWarning.
        denormalized_df = (
            single_column
            .reset_index(drop=False)
            .rename(columns={
                original_index_name: 'sample-id',
                column: 'absolute-filepath',
            })
            .assign(direction=direction)
        )
        denormalized_dfs.append(denormalized_df)
    old_fmt = FastqManifestFormat()
    pd.concat(denormalized_dfs, axis=0).to_csv(str(old_fmt), index=False)
    return old_fmt


@plugin.register_transformer
def _23(fmt: SingleEndFastqManifestPhred33V2) \
        -> SingleLanePerSampleSingleEndFastqDirFmt:
    """
    Transform a single-end Phred33 V2 manifest into a per-sample directory.

    The manifest is first converted to the V1 layout, then handed to
    ``_fastq_manifest_helper`` with ``_copy_with_compression``.
    """
    return _fastq_manifest_helper(
        _manifest_v2_to_v1(fmt),
        _copy_with_compression,
        single_end=True,
    )


@plugin.register_transformer
def _24(fmt: SingleEndFastqManifestPhred64V2) \
        -> SingleLanePerSampleSingleEndFastqDirFmt:
    """
    Transform a single-end Phred64 V2 manifest into a per-sample directory.

    Emits ``_phred64_warning``, converts the manifest to the V1 layout, then
    hands it to ``_fastq_manifest_helper`` with
    ``_write_phred64_to_phred33``.
    """
    # The warning is issued before any conversion work begins.
    warnings.warn(_phred64_warning)
    return _fastq_manifest_helper(
        _manifest_v2_to_v1(fmt),
        _write_phred64_to_phred33,
        single_end=True,
    )


@plugin.register_transformer
def _25(fmt: PairedEndFastqManifestPhred33V2) \
        -> SingleLanePerSamplePairedEndFastqDirFmt:
    """
    Transform a paired-end Phred33 V2 manifest into a per-sample directory.

    The manifest is first converted to the V1 layout, then handed to
    ``_fastq_manifest_helper`` with ``_copy_with_compression``.
    """
    return _fastq_manifest_helper(
        _manifest_v2_to_v1(fmt),
        _copy_with_compression,
        single_end=False,
    )


@plugin.register_transformer
def _26(fmt: PairedEndFastqManifestPhred64V2) \
        -> SingleLanePerSamplePairedEndFastqDirFmt:
    """
    Transform a paired-end Phred64 V2 manifest into a per-sample directory.

    Emits ``_phred64_warning``, converts the manifest to the V1 layout, then
    hands it to ``_fastq_manifest_helper`` with
    ``_write_phred64_to_phred33``.
    """
    # The warning is issued before any conversion work begins.
    warnings.warn(_phred64_warning)
    return _fastq_manifest_helper(
        _manifest_v2_to_v1(fmt),
        _write_phred64_to_phred33,
        single_end=False,
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id forward-absolute-filepath reverse-absolute-filepath
Human-Kneecap Human-Kneecap_S1_L001_R1_001.fastq.gz
Peanut-Eyeball Human-Kneecap_S1_L001_R1_001.fastq.gz Human-Kneecap_S1_L001_R1_001.fastq.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id absolute-filepath reverse-absolute-filepath foo
Human-Kneecap $s1f $s1r 1
Peanut-Eyeball $s2f $s2r 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id absolute-filepath
Human-Kneecap 1
Peanut-Eyeball 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id forward-absolute-filepath reverse-absolute-filepath
Human-Kneecap $s1f $s1r
Peanut-Eyeball $s2f $s2r
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id absolute-filepath
Human-Kneecap $s1
Peanut-Eyeball $s2
116 changes: 116 additions & 0 deletions q2_types/per_sample_sequences/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
FastqAbsolutePathManifestFormat,
SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64,
PairedEndFastqManifestPhred33, PairedEndFastqManifestPhred64,
SingleEndFastqManifestPhred33V2, SingleEndFastqManifestPhred64V2,
PairedEndFastqManifestPhred33V2, PairedEndFastqManifestPhred64V2,
SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt,
QIIME1DemuxFormat, QIIME1DemuxDirFmt
Expand All @@ -26,6 +28,120 @@
from qiime2.plugin import ValidationError


class TestAbsoluteFastqManifestV2Formats(TestPluginBase):
    """Validation tests for the V2 (metadata-compatible) manifest formats."""
    package = 'q2_types.per_sample_sequences.tests'

    def setUp(self):
        super().setUp()
        self.se_formats = [SingleEndFastqManifestPhred33V2,
                           SingleEndFastqManifestPhred64V2]
        self.pe_formats = [PairedEndFastqManifestPhred33V2,
                           PairedEndFastqManifestPhred64V2]

    def template_manifest(self, filepath, ctx):
        """
        Render the ``string.Template`` manifest at ``filepath`` with ``ctx``
        and return the path of the rendered copy in the temp directory.
        """
        with open(filepath) as src:
            template = string.Template(src.read())
        rendered_path = os.path.join(self.temp_dir.name,
                                     os.path.basename(filepath))
        with open(rendered_path, 'w') as dst:
            dst.write(template.substitute(**ctx))
        return rendered_path

    def _paired_manifest(self, relpath):
        """Render a template that uses the four paired-end placeholders."""
        path = self.get_data_path
        context = {
            's1f': path('Human-Kneecap_S1_L001_R1_001.fastq.gz'),
            's1r': path('Human-Armpit.fastq.gz'),
            's2f': path('Human-Armpit_S2_L001_R1_001.fastq.gz'),
            's2r': path('Human-Kneecap_S1_R1_001.fastq.gz'),
        }
        return self.template_manifest(path(relpath), context)

    def _assert_valid(self, formats, manifest):
        """Every format in ``formats`` must validate ``manifest``."""
        for fmt in formats:
            fmt(manifest, mode='r').validate()

    def _assert_invalid(self, formats, manifest, regex):
        """Every format in ``formats`` must reject ``manifest``."""
        for fmt in formats:
            with self.assertRaisesRegex(ValidationError, regex):
                fmt(manifest, mode='r').validate()

    def test_validate_se_positive(self):
        context = {
            's1': self.get_data_path('Human-Kneecap_S1_L001_R1_001.fastq.gz'),
            's2': self.get_data_path('Human-Armpit.fastq.gz'),
        }
        template = self.get_data_path('absolute_manifests_v2/single-MANIFEST')
        manifest = self.template_manifest(template, context)

        self._assert_valid(self.se_formats, manifest)

    def test_validate_pe_positive(self):
        manifest = self._paired_manifest(
            'absolute_manifests_v2/paired-MANIFEST')

        self._assert_valid(self.pe_formats, manifest)

    def test_extra_columns(self):
        manifest = self._paired_manifest(
            'absolute_manifests_v2/multicol-MANIFEST')

        self._assert_valid(self.se_formats, manifest)

    def test_invalid_metadata(self):
        manifest = self.get_data_path('absolute_manifests/single-MANIFEST')

        self._assert_invalid(self.se_formats, manifest, 'unrecognized ID')

    def test_missing_column_se(self):
        manifest = self.get_data_path('absolute_manifests_v2/paired-MANIFEST')

        self._assert_invalid(self.se_formats, manifest, 'is not a column')

    def test_missing_columns_pe(self):
        manifest = self.get_data_path('absolute_manifests_v2/single-MANIFEST')

        self._assert_invalid(self.pe_formats, manifest, 'is not a column')

    def test_invalid_column_type(self):
        manifest = self.get_data_path('absolute_manifests_v2/numeric-MANIFEST')

        self._assert_invalid(self.se_formats, manifest, 'is not a column')

    def test_missing_files(self):
        manifest = self.get_data_path('absolute_manifests_v2/missing-MANIFEST')

        self._assert_invalid(self.pe_formats, manifest,
                             'Missing.*line 1.*absolute-filepath')

    def test_path_not_found(self):
        # we make sure the file is missing by skipping the templating step
        manifest = self.get_data_path('absolute_manifests_v2/single-MANIFEST')

        self._assert_invalid(self.se_formats, manifest,
                             'line 1.*absolute-filepath.*Human-Kneecap')

    def test_duplicate_filepaths(self):
        # Both samples point at the same file to provoke the duplicate check.
        shared = self.get_data_path('Human-Kneecap_S1_L001_R1_001.fastq.gz')
        template = self.get_data_path('absolute_manifests_v2/single-MANIFEST')
        manifest = self.template_manifest(template,
                                          {'s1': shared, 's2': shared})

        self._assert_invalid(
            self.se_formats, manifest,
            'line 2.*absolute-filepath.*Peanut-Eyeball.*'
            'line 1.*absolute-filepath.*Human-Kneecap')


class TestAbsoluteFastqManifestFormats(TestPluginBase):
package = 'q2_types.per_sample_sequences.tests'

Expand Down
Loading