From 7afe502e80139870c2c8d81acbcd53aa35468a0d Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 26 May 2020 15:37:11 -0700 Subject: [PATCH] IMP: transformers SingleLanePerSample... to Casava Fixes #209 --- q2_types/per_sample_sequences/_format.py | 24 ++++++- q2_types/per_sample_sequences/_transformer.py | 63 +++++++++++++++---- .../per_sample_sequences/tests/test_format.py | 12 ++++ 3 files changed, 87 insertions(+), 12 deletions(-) diff --git a/q2_types/per_sample_sequences/_format.py b/q2_types/per_sample_sequences/_format.py index 63fb9b35..4fb39581 100644 --- a/q2_types/per_sample_sequences/_format.py +++ b/q2_types/per_sample_sequences/_format.py @@ -10,6 +10,7 @@ import gzip import itertools import collections +import pathlib import pandas as pd import skbio @@ -17,7 +18,7 @@ import yaml import qiime2 import qiime2.plugin.model as model -from qiime2.plugin import ValidationError +from qiime2.plugin import ValidationError, util from ..plugin_setup import plugin @@ -296,6 +297,27 @@ def sequences_path_maker(self, sample_id, barcode_id, lane_number, def _find_duplicates(self, ids): return {x for x, c in collections.Counter(ids).items() if c > 1} + @property + def manifest(self): + # Invoke via the transformation API to + # a) prevent circular import issues + # b) potentially tie into provenance + tmp_manifest = util.transform(self, to_type=FastqManifestFormat) + df = util.transform(tmp_manifest, to_type=pd.DataFrame) + + if 'reverse' not in df: + df['reverse'] = None + + def munge_fn_closure(val): + if val is not None: + return str(self.path / pathlib.Path(val).name) + return val + + for column in {'forward', 'reverse'}: + df[column] = df[column].apply(munge_fn_closure) + + return df + def _validate_(self, level): forwards = [] reverse = [] diff --git a/q2_types/per_sample_sequences/_transformer.py b/q2_types/per_sample_sequences/_transformer.py index 8f4ed243..58518531 100644 --- a/q2_types/per_sample_sequences/_transformer.py +++ b/q2_types/per_sample_sequences/_transformer.py @@ -33,23 +33,31 @@ QIIME1DemuxDirFmt) +def _util_parse_casava_filename(path, parse_lane=True): + directions = ['forward', 'reverse'] + filename = str(path).replace('.fastq.gz', '') + if parse_lane: + sample_id, barcode_id, lane_number, read_number, _ = \ + filename.rsplit('_', maxsplit=4) + else: + sample_id, barcode_id, read_number, _ = \ + filename.rsplit('_', maxsplit=3) + read_number = int(read_number[1:]) + lane_number = int(lane_number[1:]) if parse_lane else 1 + direction = directions[read_number - 1] + + return sample_id, barcode_id, lane_number, read_number, direction + + def _single_lane_per_sample_fastq_helper(dirfmt, output_cls, parse_lane=True): result = output_cls() manifest = FastqManifestFormat() manifest_fh = manifest.open() manifest_fh.write('sample-id,filename,direction\n') - directions = ['forward', 'reverse'] for path, view in dirfmt.sequences.iter_views(FastqGzFormat): - filename = str(path).replace('.fastq.gz', '') - if parse_lane: - sample_id, barcode_id, lane_number, read_number, _ = \ - filename.rsplit('_', maxsplit=4) - else: - sample_id, barcode_id, read_number, _ = \ - filename.rsplit('_', maxsplit=3) - read_number = int(read_number[1:]) - lane_number = int(lane_number[1:]) if parse_lane else 1 - direction = directions[read_number - 1] + parsed = _util_parse_casava_filename(path, parse_lane) + sample_id, barcode_id, lane_number, read_number, direction = parsed + result.sequences.write_data(view, FastqGzFormat, sample_id=sample_id, barcode_id=barcode_id, lane_number=lane_number, @@ -73,6 +81,27 @@ def _single_lane_per_sample_fastq_helper(dirfmt, output_cls, parse_lane=True): return result +def _util_dirfmt_to_casava(dirfmt_in): + dirfmt_out = CasavaOneEightSingleLanePerSampleDirFmt() + for fastq, _ in dirfmt_in.sequences.iter_views(FastqGzFormat): + from_fp = str(dirfmt_in.path / fastq.name) + to_fp = str(dirfmt_out.path / fastq.name) + qiime2.util.duplicate(from_fp, to_fp) + return dirfmt_out + + +@plugin.register_transformer +def _2_and_a_half(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \ + -> FastqManifestFormat: + manifest = FastqManifestFormat() + with manifest.open() as fh: + fh.write('sample-id,filename,direction\n') + for fp, _ in dirfmt.sequences.iter_views(FastqGzFormat): + sample_id, _, _, _, direction = _util_parse_casava_filename(fp) + fh.write('%s,%s,%s\n' % (sample_id, fp.name, direction)) + return manifest + + @plugin.register_transformer def _3(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \ -> SingleLanePerSampleSingleEndFastqDirFmt: @@ -80,6 +109,12 @@ def _3(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \ dirfmt, SingleLanePerSampleSingleEndFastqDirFmt) +@plugin.register_transformer +def _3_and_a_half(dirfmt_in: SingleLanePerSampleSingleEndFastqDirFmt) \ + -> CasavaOneEightSingleLanePerSampleDirFmt: + return _util_dirfmt_to_casava(dirfmt_in) + + @plugin.register_transformer def _4(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \ -> SingleLanePerSamplePairedEndFastqDirFmt: @@ -87,6 +122,12 @@ def _4(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \ dirfmt, SingleLanePerSamplePairedEndFastqDirFmt) +@plugin.register_transformer +def _4_and_a_half(dirfmt_in: SingleLanePerSamplePairedEndFastqDirFmt) \ + -> CasavaOneEightSingleLanePerSampleDirFmt: + return _util_dirfmt_to_casava(dirfmt_in) + + @plugin.register_transformer def _10(dirfmt: CasavaOneEightLanelessPerSampleDirFmt) \ -> SingleLanePerSampleSingleEndFastqDirFmt: diff --git a/q2_types/per_sample_sequences/tests/test_format.py b/q2_types/per_sample_sequences/tests/test_format.py index 7b3fd222..328ee2be 100644 --- a/q2_types/per_sample_sequences/tests/test_format.py +++ b/q2_types/per_sample_sequences/tests/test_format.py @@ -11,6 +11,7 @@ import unittest import string +import pandas as pd from q2_types.per_sample_sequences import ( CasavaOneEightSingleLanePerSampleDirFmt, CasavaOneEightLanelessPerSampleDirFmt, @@ -328,6 +329,17 @@ def test_casava_one_eight_slanepsample_dir_fmt_validate_positive(self): format.validate() + def test_casava_one_eight_slanepsample_dir_fmt_manifest_property(self): + filepath = self.get_data_path('Human-Kneecap_S1_L001_R1_001.fastq.gz') + shutil.copy(filepath, self.temp_dir.name) + + format = CasavaOneEightSingleLanePerSampleDirFmt( + self.temp_dir.name, mode='r') + + format.validate() + self.assertTrue(True) + self.assertIsInstance(format.manifest, pd.DataFrame) + def test_casava_one_eight_slanepsample_dir_fmt_validate_negative(self): filepath = self.get_data_path('not-fastq.fastq.gz') shutil.copy(filepath, self.temp_dir.name)