Skip to content

Commit

Permalink
Merge 9eac1de into 4fa47e6
Browse files Browse the repository at this point in the history
  • Loading branch information
thermokarst committed Apr 15, 2019
2 parents 4fa47e6 + 9eac1de commit ee003df
Show file tree
Hide file tree
Showing 10 changed files with 329 additions and 3 deletions.
9 changes: 8 additions & 1 deletion q2_types/per_sample_sequences/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
SingleEndFastqManifestPhred64,
PairedEndFastqManifestPhred33,
PairedEndFastqManifestPhred64,
SingleEndFastqManifestPhred33V2,
SingleEndFastqManifestPhred64V2,
PairedEndFastqManifestPhred33V2,
PairedEndFastqManifestPhred64V2,
QIIME1DemuxFormat, QIIME1DemuxDirFmt)
from ._type import (Sequences, SequencesWithQuality,
PairedEndSequencesWithQuality,
Expand All @@ -32,7 +36,10 @@
'SequencesWithQuality', 'PairedEndSequencesWithQuality',
'JoinedSequencesWithQuality', 'SingleEndFastqManifestPhred33',
'SingleEndFastqManifestPhred64', 'PairedEndFastqManifestPhred33',
'PairedEndFastqManifestPhred64', 'QIIME1DemuxFormat',
'PairedEndFastqManifestPhred64', 'SingleEndFastqManifestPhred33V2',
'SingleEndFastqManifestPhred64V2',
'PairedEndFastqManifestPhred33V2',
'PairedEndFastqManifestPhred64V2', 'QIIME1DemuxFormat',
'QIIME1DemuxDirFmt']

# Imported only for its side effects (the return value is discarded):
# loading the module runs its @plugin.register_transformer decorators.
importlib.import_module('q2_types.per_sample_sequences._transformer')
69 changes: 68 additions & 1 deletion q2_types/per_sample_sequences/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,80 @@
import itertools
import collections

import pandas as pd
import skbio
import skbio.io
import yaml
import qiime2
import qiime2.plugin.model as model
from qiime2.plugin import ValidationError

from ..plugin_setup import plugin


class FastqAbsolutePathManifestFormatV2(model.TextFileFormat):
    """
    Base class for mapping of sample identifiers to filepaths. This format
    relies heavily on the qiime2.Metadata on-disk format, as well as the
    validation rules and behavior.

    Subclasses must set ``METADATA_COLUMNS`` to a dict whose keys are the
    names of the required filepath columns and whose values are the read
    direction associated with each column (e.g. ``'forward'``).
    """
    # Required column name -> read direction. Left as None here; the
    # concrete single-end / paired-end subclasses provide the mapping.
    METADATA_COLUMNS = None

    def _validate_(self, level):
        """Validate the manifest file backing this format.

        The file must load as ``qiime2.Metadata``, must contain every column
        named in ``METADATA_COLUMNS`` as a categorical column, and every cell
        in those columns must hold a path that exists on disk (after
        environment-variable expansion).

        Parameters
        ----------
        level
            Accepted for the framework's validation interface; this
            implementation performs the same checks regardless of its value.

        Raises
        ------
        ValidationError
            If the file is not valid metadata, a required column is absent
            (or not categorical), a filepath cell is empty, or a filepath
            does not exist.
        """
        try:
            md = qiime2.Metadata.load(str(self))
        except qiime2.metadata.MetadataFileError as md_exc:
            raise ValidationError(md_exc) from md_exc

        # Filepaths are text, so only categorical columns are candidates. A
        # required column that was inferred as numeric is dropped here and
        # is then reported as missing by get_column() below.
        md = md.filter_columns(column_type='categorical')

        md_cols = dict()
        for column in self.METADATA_COLUMNS.keys():
            try:
                md_cols[column] = md.get_column(column)
            except ValueError as md_exc:
                raise ValidationError(md_exc) from md_exc

        for column_name, column in md_cols.items():
            column = column.to_series()
            # Series.items() is used instead of Series.iteritems(), which
            # was deprecated in pandas 1.5 and removed in pandas 2.0.
            # `i` is the 1-based position of the data row, not a physical
            # line number in the file.
            for i, (id_, fp) in enumerate(column.items(), start=1):
                # QIIME 2 represents empty cells as np.nan once normalized
                if pd.isna(fp):
                    raise ValidationError(
                        'Missing filepath on line %d and column "%s".'
                        % (i, column_name))
                if not os.path.exists(os.path.expandvars(fp)):
                    raise ValidationError(
                        'Filepath on line %d and column "%s" could not '
                        'be found (%s) for sample "%s".'
                        % (i, column_name, fp, id_))


class _SingleEndFastqManifestV2(FastqAbsolutePathManifestFormatV2):
    # Single-end manifests require exactly one filepath column, which is
    # associated with the 'forward' read direction.
    METADATA_COLUMNS = {'absolute-filepath': 'forward'}


class SingleEndFastqManifestPhred33V2(_SingleEndFastqManifestV2):
    # Adds nothing to its base class; it exists as a distinct type so the
    # single-end Phred33 variant can be referred to by name.
    pass


class SingleEndFastqManifestPhred64V2(_SingleEndFastqManifestV2):
    # Adds nothing to its base class; it exists as a distinct type so the
    # single-end Phred64 variant can be referred to by name.
    pass


class _PairedEndFastqManifestV2(FastqAbsolutePathManifestFormatV2):
    # Paired-end manifests require two filepath columns, one per read
    # direction.
    METADATA_COLUMNS = {'forward-absolute-filepath': 'forward',
                        'reverse-absolute-filepath': 'reverse'}


class PairedEndFastqManifestPhred33V2(_PairedEndFastqManifestV2):
    # Adds nothing to its base class; it exists as a distinct type so the
    # paired-end Phred33 variant can be referred to by name.
    pass


class PairedEndFastqManifestPhred64V2(_PairedEndFastqManifestV2):
    # Adds nothing to its base class; it exists as a distinct type so the
    # paired-end Phred64 variant can be referred to by name.
    pass


class _FastqManifestBase(model.TextFileFormat):
"""
Base class for mapping of sample identifiers to filepaths and read
Expand Down Expand Up @@ -390,5 +455,7 @@ def _validate_seq(self, seq):
_SingleLanePerSampleFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt, SingleEndFastqManifestPhred33,
SingleEndFastqManifestPhred64, PairedEndFastqManifestPhred33,
PairedEndFastqManifestPhred64, QIIME1DemuxFormat, QIIME1DemuxDirFmt
PairedEndFastqManifestPhred64, SingleEndFastqManifestPhred33V2,
SingleEndFastqManifestPhred64V2, PairedEndFastqManifestPhred33V2,
PairedEndFastqManifestPhred64V2, QIIME1DemuxFormat, QIIME1DemuxDirFmt
)
58 changes: 58 additions & 0 deletions q2_types/per_sample_sequences/_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
CasavaOneEightLanelessPerSampleDirFmt,
SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64,
PairedEndFastqManifestPhred33, PairedEndFastqManifestPhred64,
SingleEndFastqManifestPhred33V2,
SingleEndFastqManifestPhred64V2,
PairedEndFastqManifestPhred33V2,
PairedEndFastqManifestPhred64V2,
QIIME1DemuxDirFmt)


Expand Down Expand Up @@ -387,3 +391,57 @@ def _21(ff: FastqManifestFormat) -> pd.DataFrame:
values='filename')
df.columns.name = None
return df


def _manifest_v2_to_v1(fmt):
    """Convert a V2 (metadata-style) manifest into a FastqManifestFormat.

    The V2 manifest holds one row per sample with one filepath column per
    read direction. The result holds one row per (sample, direction) pair
    with the columns ``sample-id``, ``absolute-filepath`` and ``direction``.

    Parameters
    ----------
    fmt
        A V2 manifest format instance. ``str(fmt)`` is loaded as
        ``qiime2.Metadata`` and ``fmt.METADATA_COLUMNS`` supplies the
        mapping of filepath column name -> direction.

    Returns
    -------
    FastqManifestFormat
        A newly created format whose backing file contains the CSV.
    """
    df = qiime2.Metadata.load(str(fmt)).to_dataframe()
    # Drop unnecessary metadata columns
    df = df[list(fmt.METADATA_COLUMNS.keys())]
    sample_id_column = df.index.name

    denormalized_dfs = []
    for column, direction in fmt.METADATA_COLUMNS.items():
        # Build a fresh frame by chaining non-inplace calls. Mutating the
        # column-subset slice in place (and then assigning a new column to
        # it) is the pattern behind pandas' SettingWithCopyWarning.
        denormalized_df = (
            df[[column]]
            .reset_index(drop=False)
            .rename(columns={
                sample_id_column: 'sample-id',
                column: 'absolute-filepath',
            })
            .assign(direction=direction)
        )
        denormalized_dfs.append(denormalized_df)

    old_fmt = FastqManifestFormat()
    pd.concat(denormalized_dfs, axis=0).to_csv(str(old_fmt), index=False)
    return old_fmt


@plugin.register_transformer
def _23(
    fmt: SingleEndFastqManifestPhred33V2,
) -> SingleLanePerSampleSingleEndFastqDirFmt:
    # Convert the V2 manifest to the older manifest format, then hand it to
    # the shared manifest helper in single-end mode.
    return _fastq_manifest_helper(
        _manifest_v2_to_v1(fmt), _copy_with_compression, single_end=True)


@plugin.register_transformer
def _24(
    fmt: SingleEndFastqManifestPhred64V2,
) -> SingleLanePerSampleSingleEndFastqDirFmt:
    # The warning is emitted first, before any conversion work is attempted.
    warnings.warn(_phred64_warning)
    # Convert the V2 manifest to the older manifest format, then hand it to
    # the shared manifest helper in single-end mode.
    return _fastq_manifest_helper(
        _manifest_v2_to_v1(fmt), _write_phred64_to_phred33, single_end=True)


@plugin.register_transformer
def _25(
    fmt: PairedEndFastqManifestPhred33V2,
) -> SingleLanePerSamplePairedEndFastqDirFmt:
    # Convert the V2 manifest to the older manifest format, then hand it to
    # the shared manifest helper in paired-end mode.
    return _fastq_manifest_helper(
        _manifest_v2_to_v1(fmt), _copy_with_compression, single_end=False)


@plugin.register_transformer
def _26(
    fmt: PairedEndFastqManifestPhred64V2,
) -> SingleLanePerSamplePairedEndFastqDirFmt:
    # The warning is emitted first, before any conversion work is attempted.
    warnings.warn(_phred64_warning)
    # Convert the V2 manifest to the older manifest format, then hand it to
    # the shared manifest helper in paired-end mode.
    return _fastq_manifest_helper(
        _manifest_v2_to_v1(fmt), _write_phred64_to_phred33, single_end=False)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id forward-absolute-filepath reverse-absolute-filepath
Human-Kneecap Human-Kneecap_S1_L001_R1_001.fastq.gz
Peanut-Eyeball Human-Kneecap_S1_L001_R1_001.fastq.gz Human-Kneecap_S1_L001_R1_001.fastq.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id absolute-filepath reverse-absolute-filepath foo
Human-Kneecap $path $path 1
Peanut-Eyeball $path $path 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id absolute-filepath
Human-Kneecap 1
Peanut-Eyeball 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id forward-absolute-filepath reverse-absolute-filepath
Human-Kneecap $path $path
Peanut-Eyeball $path $path
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample-id absolute-filepath
Human-Kneecap $path
Peanut-Eyeball $path
93 changes: 93 additions & 0 deletions q2_types/per_sample_sequences/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
FastqAbsolutePathManifestFormat,
SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64,
PairedEndFastqManifestPhred33, PairedEndFastqManifestPhred64,
SingleEndFastqManifestPhred33V2, SingleEndFastqManifestPhred64V2,
PairedEndFastqManifestPhred33V2, PairedEndFastqManifestPhred64V2,
SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt,
QIIME1DemuxFormat, QIIME1DemuxDirFmt
Expand All @@ -26,6 +28,97 @@
from qiime2.plugin import ValidationError


class TestAbsoluteFastqManifestV2Formats(TestPluginBase):
    # Validation tests for the V2 FASTQ manifest formats. Comments are used
    # instead of method docstrings so unittest's reported test descriptions
    # are unaffected.
    package = 'q2_types.per_sample_sequences.tests'

    def setUp(self):
        super().setUp()
        self.se_formats = [SingleEndFastqManifestPhred33V2,
                           SingleEndFastqManifestPhred64V2]
        self.pe_formats = [PairedEndFastqManifestPhred33V2,
                           PairedEndFastqManifestPhred64V2]

    def template_manifest(self, filepath, ctx):
        # Render the template at `filepath` with the values in `ctx`, write
        # the result under the test's temp dir (same basename), and return
        # the path of the rendered file.
        with open(filepath) as template_fh:
            template = string.Template(template_fh.read())
        rendered_fp = os.path.join(self.temp_dir.name,
                                   os.path.basename(filepath))
        with open(rendered_fp, 'w') as rendered_fh:
            rendered_fh.write(template.substitute(**ctx))
        return rendered_fp

    def _validate_each(self, format_classes, manifest):
        # Every format class must validate `manifest` without raising.
        for format_cls in format_classes:
            format_cls(manifest, mode='r').validate()

    def _assert_each_invalid(self, format_classes, manifest, pattern):
        # Every format class must reject `manifest` with a ValidationError
        # whose message matches `pattern`.
        for format_cls in format_classes:
            with self.assertRaisesRegex(ValidationError, pattern):
                format_cls(manifest, mode='r').validate()

    def test_validate_se_positive(self):
        fastq_fp = self.get_data_path('Human-Kneecap_S1_L001_R1_001.fastq.gz')
        template_fp = self.get_data_path(
            'absolute_manifests_v2/single-MANIFEST')
        manifest = self.template_manifest(template_fp, {'path': fastq_fp})

        self._validate_each(self.se_formats, manifest)

    def test_validate_pe_positive(self):
        fastq_fp = self.get_data_path('Human-Kneecap_S1_L001_R1_001.fastq.gz')
        template_fp = self.get_data_path(
            'absolute_manifests_v2/paired-MANIFEST')
        manifest = self.template_manifest(template_fp, {'path': fastq_fp})

        self._validate_each(self.pe_formats, manifest)

    def test_extra_columns(self):
        fastq_fp = self.get_data_path('Human-Kneecap_S1_L001_R1_001.fastq.gz')
        template_fp = self.get_data_path(
            'absolute_manifests_v2/multicol-MANIFEST')
        manifest = self.template_manifest(template_fp, {'path': fastq_fp})

        self._validate_each(self.se_formats, manifest)

    def test_invalid_metadata(self):
        manifest = self.get_data_path('absolute_manifests/single-MANIFEST')

        self._assert_each_invalid(self.se_formats, manifest,
                                  'unrecognized ID')

    def test_missing_column_se(self):
        manifest = self.get_data_path('absolute_manifests_v2/paired-MANIFEST')

        self._assert_each_invalid(self.se_formats, manifest,
                                  'is not a column')

    def test_missing_columns_pe(self):
        manifest = self.get_data_path('absolute_manifests_v2/single-MANIFEST')

        self._assert_each_invalid(self.pe_formats, manifest,
                                  'is not a column')

    def test_invalid_column_type(self):
        manifest = self.get_data_path('absolute_manifests_v2/numeric-MANIFEST')

        self._assert_each_invalid(self.se_formats, manifest,
                                  'is not a column')

    def test_missing_files(self):
        manifest = self.get_data_path('absolute_manifests_v2/missing-MANIFEST')

        self._assert_each_invalid(self.pe_formats, manifest,
                                  'Missing.*line 1.*absolute-filepath')

    def test_path_not_found(self):
        # we make sure the file is missing by skipping the templating step
        manifest = self.get_data_path('absolute_manifests_v2/single-MANIFEST')

        self._assert_each_invalid(
            self.se_formats, manifest,
            'line 1.*absolute-filepath.*Human-Kneecap')


class TestAbsoluteFastqManifestFormats(TestPluginBase):
package = 'q2_types.per_sample_sequences.tests'

Expand Down
Loading

0 comments on commit ee003df

Please sign in to comment.