From 31e6d202f079da9a3413727044c52bcde5120e0b Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 24 Jul 2017 15:18:47 -0700 Subject: [PATCH 1/2] ENH: Support importing MiSeq demultiplexed data Fixes #128 --- q2_types/per_sample_sequences/__init__.py | 10 ++--- q2_types/per_sample_sequences/_format.py | 13 +++++- q2_types/per_sample_sequences/_transformer.py | 28 ++++++++++--- .../data/Human-Kneecap_S1_R1_001.fastq.gz | Bin 0 -> 763 bytes .../per_sample_sequences/tests/test_format.py | 21 +++++++++- .../tests/test_transformer.py | 37 ++++++++++++++++++ 6 files changed, 94 insertions(+), 15 deletions(-) create mode 100644 q2_types/per_sample_sequences/tests/data/Human-Kneecap_S1_R1_001.fastq.gz diff --git a/q2_types/per_sample_sequences/__init__.py b/q2_types/per_sample_sequences/__init__.py index b78bb192..a724ec1a 100644 --- a/q2_types/per_sample_sequences/__init__.py +++ b/q2_types/per_sample_sequences/__init__.py @@ -8,9 +8,9 @@ import importlib -from ._format import (CasavaOneEightSingleLanePerSampleDirFmt, FastqGzFormat, - YamlFormat, FastqManifestFormat, - FastqAbsolutePathManifestFormat, +from ._format import (CasavaOneEightSingleLanePerSampleDirFmt, + MiSeqDemuxDirFmt, FastqGzFormat, YamlFormat, + FastqManifestFormat, FastqAbsolutePathManifestFormat, SingleLanePerSampleSingleEndFastqDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, SingleEndFastqManifestPhred33, @@ -20,8 +20,8 @@ from ._type import SequencesWithQuality, PairedEndSequencesWithQuality from ._transformer import PerSampleDNAIterators, PerSamplePairedDNAIterators -__all__ = ['CasavaOneEightSingleLanePerSampleDirFmt', 'FastqGzFormat', - 'YamlFormat', 'FastqManifestFormat', +__all__ = ['CasavaOneEightSingleLanePerSampleDirFmt', 'MiSeqDemuxDirFmt', + 'FastqGzFormat', 'YamlFormat', 'FastqManifestFormat', 'FastqAbsolutePathManifestFormat', 'SingleLanePerSampleSingleEndFastqDirFmt', 'SingleLanePerSamplePairedEndFastqDirFmt', 'SequencesWithQuality', diff --git a/q2_types/per_sample_sequences/_format.py b/q2_types/per_sample_sequences/_format.py index 8480f2b4..7bbe93c4 100644 --- a/q2_types/per_sample_sequences/_format.py +++ b/q2_types/per_sample_sequences/_format.py @@ -151,10 +151,19 @@ class SingleLanePerSamplePairedEndFastqDirFmt(_SingleLanePerSampleFastqDirFmt): pass +class MiSeqDemuxDirFmt(model.DirectoryFormat): + sequences = model.FileCollection(r'.+_.+_R[12]_001\.fastq\.gz', + format=FastqGzFormat) + + @sequences.set_path_maker + def sequences_path_maker(self, sample_id, barcode_id, read_number): + return '%s_%s_R%d_001.fastq.gz' % (sample_id, barcode_id, read_number) + + plugin.register_formats( FastqManifestFormat, YamlFormat, FastqGzFormat, - CasavaOneEightSingleLanePerSampleDirFmt, _SingleLanePerSampleFastqDirFmt, - SingleLanePerSampleSingleEndFastqDirFmt, + CasavaOneEightSingleLanePerSampleDirFmt, MiSeqDemuxDirFmt, + _SingleLanePerSampleFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64, PairedEndFastqManifestPhred33, PairedEndFastqManifestPhred64 diff --git a/q2_types/per_sample_sequences/_transformer.py b/q2_types/per_sample_sequences/_transformer.py index fabaccc0..1e958ff4 100644 --- a/q2_types/per_sample_sequences/_transformer.py +++ b/q2_types/per_sample_sequences/_transformer.py @@ -21,7 +21,7 @@ from . import (SingleLanePerSampleSingleEndFastqDirFmt, FastqManifestFormat, FastqAbsolutePathManifestFormat, FastqGzFormat, SingleLanePerSamplePairedEndFastqDirFmt, YamlFormat, - CasavaOneEightSingleLanePerSampleDirFmt, + CasavaOneEightSingleLanePerSampleDirFmt, MiSeqDemuxDirFmt, SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64, PairedEndFastqManifestPhred33, PairedEndFastqManifestPhred64) @@ -76,18 +76,22 @@ def _2(dirfmt: SingleLanePerSamplePairedEndFastqDirFmt) \ return result -def _single_lane_per_sample_fastq_helper(dirfmt, output_cls): +def _single_lane_per_sample_fastq_helper(dirfmt, output_cls, parse_lane=True): result = output_cls() manifest = FastqManifestFormat() manifest_fh = manifest.open() manifest_fh.write('sample-id,filename,direction\n') directions = ['forward', 'reverse'] for path, view in dirfmt.sequences.iter_views(FastqGzFormat): - - sample_id, barcode_id, lane_number, read_number, _ = \ - str(path).replace('.fastq.gz', '').rsplit('_', maxsplit=4) + filename = str(path).replace('.fastq.gz', '') + if parse_lane: + sample_id, barcode_id, lane_number, read_number, _ = \ + filename.rsplit('_', maxsplit=4) + else: + sample_id, barcode_id, read_number, _ = \ + filename.rsplit('_', maxsplit=3) read_number = int(read_number[1:]) - lane_number = int(lane_number[1:]) + lane_number = int(lane_number[1:]) if parse_lane else 1 direction = directions[read_number - 1] result.sequences.write_data(view, FastqGzFormat, sample_id=sample_id, barcode_id=barcode_id, @@ -119,6 +123,18 @@ def _4(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \ dirfmt, SingleLanePerSamplePairedEndFastqDirFmt) +@plugin.register_transformer +def _10(dirfmt: MiSeqDemuxDirFmt) -> SingleLanePerSampleSingleEndFastqDirFmt: + return _single_lane_per_sample_fastq_helper( + dirfmt, SingleLanePerSampleSingleEndFastqDirFmt, parse_lane=False) + + +@plugin.register_transformer +def _11(dirfmt: MiSeqDemuxDirFmt) -> SingleLanePerSamplePairedEndFastqDirFmt: + return _single_lane_per_sample_fastq_helper( + dirfmt, SingleLanePerSamplePairedEndFastqDirFmt, parse_lane=False) + + @plugin.register_transformer def _5(dirfmt: SingleLanePerSamplePairedEndFastqDirFmt) \ -> SingleLanePerSampleSingleEndFastqDirFmt: diff --git a/q2_types/per_sample_sequences/tests/data/Human-Kneecap_S1_R1_001.fastq.gz b/q2_types/per_sample_sequences/tests/data/Human-Kneecap_S1_R1_001.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..c2a52db4149ccc5d9c59973ea01e6acd3956467f GIT binary patch literal 763 zcmVjdc#% zwuk02UeWyQ8fZn^$irIGKxcaQt%ctp(?g<+l*%B~U)CvN#+*eYnW;%MVx^ogS3e0N z`H8zuQC}%#%2|@3JhBFelvBx5lZr+2X+f!~=|m_75kx7yVChFfYB!~ZCM#O3eac$| z>5xh>nNe-B6ipgke6x0>Eom!M=N!O;bJMcMNX#iP2Ayz>Aw-TLh9WT0Mm)>27b|R=Xol!Epf-=8}!f-tw3k8?U|fk zq=co>Sl#;$#z3>IKst8Mon#$c_hz?o4o~CCll01bbZk1`y>!<328IDQKe+F`pV_oe zKTa+zp&yRJ0E0JvTL2n+I;?+ZvNMAjpf{tvGD$m=cw-W;Os0HeGVM%ap`~ndL8Ab- zgQl1je~Ha?b7fzPP*Ja>Qu%AFGOpCtEXrwtPs_c5pYS)j_~J*T-gvU7?% zrOm`<>glRXYQ0pt%7sa*XDNjhacb>U$5}QrJun_}y6{#bm~T|Z z$3m&psibNYzc)&jaFZwf!jZyVb0vDFc~hcU2k2>-PSXjHx2j@v=tWxA=4mEqJn=%q zNsgpn7T25smVonPOt#|){i!kk#^k>D`HPq=_g!VOMtlL2_#c?WkC?P_O~!!1H5Vq& zXR6m1f5P1(Jn|=!(sc(D^E Date: Mon, 24 Jul 2017 16:22:03 -0700 Subject: [PATCH 2/2] SQUASH: renaming --- q2_types/per_sample_sequences/__init__.py | 6 ++++-- q2_types/per_sample_sequences/_format.py | 5 +++-- q2_types/per_sample_sequences/_transformer.py | 9 ++++++--- .../per_sample_sequences/tests/test_format.py | 15 ++++++++++----- .../tests/test_transformer.py | 8 +++++--- 5 files changed, 28 insertions(+), 15 deletions(-) diff --git a/q2_types/per_sample_sequences/__init__.py b/q2_types/per_sample_sequences/__init__.py index a724ec1a..538f1197 100644 --- a/q2_types/per_sample_sequences/__init__.py +++ b/q2_types/per_sample_sequences/__init__.py @@ -9,7 +9,8 @@ import importlib from ._format import (CasavaOneEightSingleLanePerSampleDirFmt, - MiSeqDemuxDirFmt, FastqGzFormat, YamlFormat, + CasavaOneEightLanelessPerSampleDirFmt, + FastqGzFormat, YamlFormat, FastqManifestFormat, FastqAbsolutePathManifestFormat, SingleLanePerSampleSingleEndFastqDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, @@ -20,7 +21,8 @@ from ._type import SequencesWithQuality, PairedEndSequencesWithQuality from ._transformer import PerSampleDNAIterators, PerSamplePairedDNAIterators -__all__ = ['CasavaOneEightSingleLanePerSampleDirFmt', 'MiSeqDemuxDirFmt', +__all__ = ['CasavaOneEightSingleLanePerSampleDirFmt', + 'CasavaOneEightLanelessPerSampleDirFmt', 'FastqGzFormat', 'YamlFormat', 'FastqManifestFormat', 'FastqAbsolutePathManifestFormat', 'SingleLanePerSampleSingleEndFastqDirFmt', diff --git a/q2_types/per_sample_sequences/_format.py b/q2_types/per_sample_sequences/_format.py index 7bbe93c4..07425240 100644 --- a/q2_types/per_sample_sequences/_format.py +++ b/q2_types/per_sample_sequences/_format.py @@ -151,7 +151,7 @@ class SingleLanePerSamplePairedEndFastqDirFmt(_SingleLanePerSampleFastqDirFmt): pass -class MiSeqDemuxDirFmt(model.DirectoryFormat): +class CasavaOneEightLanelessPerSampleDirFmt(model.DirectoryFormat): sequences = model.FileCollection(r'.+_.+_R[12]_001\.fastq\.gz', format=FastqGzFormat) @@ -162,7 +162,8 @@ def sequences_path_maker(self, sample_id, barcode_id, read_number): plugin.register_formats( FastqManifestFormat, YamlFormat, FastqGzFormat, - CasavaOneEightSingleLanePerSampleDirFmt, MiSeqDemuxDirFmt, + CasavaOneEightSingleLanePerSampleDirFmt, + CasavaOneEightLanelessPerSampleDirFmt, _SingleLanePerSampleFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64, PairedEndFastqManifestPhred33, diff --git a/q2_types/per_sample_sequences/_transformer.py b/q2_types/per_sample_sequences/_transformer.py index 1e958ff4..d92d5b3b 100644 --- a/q2_types/per_sample_sequences/_transformer.py +++ b/q2_types/per_sample_sequences/_transformer.py @@ -21,7 +21,8 @@ from . import (SingleLanePerSampleSingleEndFastqDirFmt, FastqManifestFormat, FastqAbsolutePathManifestFormat, FastqGzFormat, SingleLanePerSamplePairedEndFastqDirFmt, YamlFormat, - CasavaOneEightSingleLanePerSampleDirFmt, MiSeqDemuxDirFmt, + CasavaOneEightSingleLanePerSampleDirFmt, + CasavaOneEightLanelessPerSampleDirFmt, SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64, PairedEndFastqManifestPhred33, PairedEndFastqManifestPhred64) @@ -124,13 +125,15 @@ def _4(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \ @plugin.register_transformer -def _10(dirfmt: MiSeqDemuxDirFmt) -> SingleLanePerSampleSingleEndFastqDirFmt: +def _10(dirfmt: CasavaOneEightLanelessPerSampleDirFmt) \ + -> SingleLanePerSampleSingleEndFastqDirFmt: return _single_lane_per_sample_fastq_helper( dirfmt, SingleLanePerSampleSingleEndFastqDirFmt, parse_lane=False) @plugin.register_transformer -def _11(dirfmt: MiSeqDemuxDirFmt) -> SingleLanePerSamplePairedEndFastqDirFmt: +def _11(dirfmt: CasavaOneEightLanelessPerSampleDirFmt) \ + -> SingleLanePerSamplePairedEndFastqDirFmt: return _single_lane_per_sample_fastq_helper( dirfmt, SingleLanePerSamplePairedEndFastqDirFmt, parse_lane=False) diff --git a/q2_types/per_sample_sequences/tests/test_format.py b/q2_types/per_sample_sequences/tests/test_format.py index 0ae39b3e..bd0ea46a 100644 --- a/q2_types/per_sample_sequences/tests/test_format.py +++ b/q2_types/per_sample_sequences/tests/test_format.py @@ -10,8 +10,10 @@ import unittest from q2_types.per_sample_sequences import ( - CasavaOneEightSingleLanePerSampleDirFmt, MiSeqDemuxDirFmt, FastqGzFormat, - YamlFormat, FastqManifestFormat, FastqAbsolutePathManifestFormat, + CasavaOneEightSingleLanePerSampleDirFmt, + CasavaOneEightLanelessPerSampleDirFmt, + FastqGzFormat, YamlFormat, FastqManifestFormat, + FastqAbsolutePathManifestFormat, SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64, PairedEndFastqManifestPhred33, PairedEndFastqManifestPhred64, SingleLanePerSampleSingleEndFastqDirFmt, @@ -130,7 +132,8 @@ def test_miseq_demux_dir_fmt_validate_positive(self): filepath = self.get_data_path('Human-Kneecap_S1_R1_001.fastq.gz') shutil.copy(filepath, self.temp_dir.name) - format = MiSeqDemuxDirFmt(self.temp_dir.name, mode='r') + format = CasavaOneEightLanelessPerSampleDirFmt(self.temp_dir.name, + mode='r') format.validate() @@ -138,9 +141,11 @@ def test_miseq_demux_dir_fmt_validate_negative(self): filepath = self.get_data_path('not-fastq.fastq.gz') shutil.copy(filepath, self.temp_dir.name) - format = MiSeqDemuxDirFmt(self.temp_dir.name, mode='r') + format = CasavaOneEightLanelessPerSampleDirFmt(self.temp_dir.name, + mode='r') - with self.assertRaisesRegex(ValueError, 'MiSeqDemuxDirFmt'): + with self.assertRaisesRegex(ValueError, + 'CasavaOneEightLanelessPerSampleDirFmt'): format.validate() def test_slanepsample_single_end_fastq_dir_fmt_validate_positive(self): diff --git a/q2_types/per_sample_sequences/tests/test_transformer.py b/q2_types/per_sample_sequences/tests/test_transformer.py index cfe541b0..c441fe08 100644 --- a/q2_types/per_sample_sequences/tests/test_transformer.py +++ b/q2_types/per_sample_sequences/tests/test_transformer.py @@ -20,7 +20,7 @@ SingleLanePerSampleSingleEndFastqDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, CasavaOneEightSingleLanePerSampleDirFmt, - MiSeqDemuxDirFmt, + CasavaOneEightLanelessPerSampleDirFmt, SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64, PairedEndFastqManifestPhred33, @@ -138,7 +138,8 @@ def test_casava_one_eight_single_lane_per_sample_dirfmt_to_slpspefdf(self): def test_miseq_demux_dirfmt_to_slpssefdf(self): input, obs = self.transform_format( - MiSeqDemuxDirFmt, SingleLanePerSampleSingleEndFastqDirFmt, + CasavaOneEightLanelessPerSampleDirFmt, + SingleLanePerSampleSingleEndFastqDirFmt, filenames=('Human-Kneecap_S1_R1_001.fastq.gz',), ) @@ -156,7 +157,8 @@ def test_miseq_demux_dirfmt_to_slpssefdf(self): def test_miseq_demux_dirfmt_to_slpspefdf(self): input, obs = self.transform_format( - MiSeqDemuxDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, + CasavaOneEightLanelessPerSampleDirFmt, + SingleLanePerSamplePairedEndFastqDirFmt, filenames=('Human-Kneecap_S1_R1_001.fastq.gz',), )