Skip to content

Commit

Permalink
IMP: transformers SingleLanePerSample... to Casava
Browse files Browse the repository at this point in the history
Fixes #209
  • Loading branch information
thermokarst committed May 26, 2020
1 parent 058ee0e commit 7afe502
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 12 deletions.
24 changes: 23 additions & 1 deletion q2_types/per_sample_sequences/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@
import gzip
import itertools
import collections
import pathlib

import pandas as pd
import skbio
import skbio.io
import yaml
import qiime2
import qiime2.plugin.model as model
from qiime2.plugin import ValidationError
from qiime2.plugin import ValidationError, util

from ..plugin_setup import plugin

Expand Down Expand Up @@ -296,6 +297,27 @@ def sequences_path_maker(self, sample_id, barcode_id, lane_number,
def _find_duplicates(self, ids):
return {x for x, c in collections.Counter(ids).items() if c > 1}

@property
def manifest(self):
    """Return a DataFrame manifest whose file entries live under self.path.

    The directory format is transformed to a ``FastqManifestFormat`` and
    then to a ``pd.DataFrame``. A ``reverse`` column filled with ``None`` is
    added when the frame has none, and every non-``None`` entry in the
    ``forward`` and ``reverse`` columns is replaced by ``self.path`` joined
    with that entry's final path component.
    """
    # The transformation API is used here (instead of calling transformer
    # functions directly) to
    #   a) prevent circular import issues
    #   b) potentially tie into provenance
    as_manifest = util.transform(self, to_type=FastqManifestFormat)
    table = util.transform(as_manifest, to_type=pd.DataFrame)

    if 'reverse' not in table:
        table['reverse'] = None

    def _root_at_format_dir(entry):
        # ``None`` entries are passed through untouched.
        if entry is None:
            return entry
        return str(self.path / pathlib.Path(entry).name)

    for direction in ('forward', 'reverse'):
        table[direction] = table[direction].apply(_root_at_format_dir)

    return table

def _validate_(self, level):
forwards = []
reverse = []
Expand Down
63 changes: 52 additions & 11 deletions q2_types/per_sample_sequences/_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,31 @@
QIIME1DemuxDirFmt)


def _util_parse_casava_filename(path, parse_lane=True):
directions = ['forward', 'reverse']
filename = str(path).replace('.fastq.gz', '')
if parse_lane:
sample_id, barcode_id, lane_number, read_number, _ = \
filename.rsplit('_', maxsplit=4)
else:
sample_id, barcode_id, read_number, _ = \
filename.rsplit('_', maxsplit=3)
read_number = int(read_number[1:])
lane_number = int(lane_number[1:]) if parse_lane else 1
direction = directions[read_number - 1]

return sample_id, barcode_id, lane_number, read_number, direction


def _single_lane_per_sample_fastq_helper(dirfmt, output_cls, parse_lane=True):
result = output_cls()
manifest = FastqManifestFormat()
manifest_fh = manifest.open()
manifest_fh.write('sample-id,filename,direction\n')
directions = ['forward', 'reverse']
for path, view in dirfmt.sequences.iter_views(FastqGzFormat):
filename = str(path).replace('.fastq.gz', '')
if parse_lane:
sample_id, barcode_id, lane_number, read_number, _ = \
filename.rsplit('_', maxsplit=4)
else:
sample_id, barcode_id, read_number, _ = \
filename.rsplit('_', maxsplit=3)
read_number = int(read_number[1:])
lane_number = int(lane_number[1:]) if parse_lane else 1
direction = directions[read_number - 1]
parsed = _util_parse_casava_filename(path, parse_lane)
sample_id, barcode_id, lane_number, read_number, direction = parsed

result.sequences.write_data(view, FastqGzFormat, sample_id=sample_id,
barcode_id=barcode_id,
lane_number=lane_number,
Expand All @@ -73,20 +81,53 @@ def _single_lane_per_sample_fastq_helper(dirfmt, output_cls, parse_lane=True):
return result


def _util_dirfmt_to_casava(dirfmt_in):
    """Build a Casava single-lane-per-sample directory from ``dirfmt_in``.

    Every ``FastqGzFormat`` view found in ``dirfmt_in.sequences`` is passed
    to ``qiime2.util.duplicate`` under the same file name inside a newly
    created ``CasavaOneEightSingleLanePerSampleDirFmt``, which is returned.
    """
    dirfmt_out = CasavaOneEightSingleLanePerSampleDirFmt()
    for relpath, _view in dirfmt_in.sequences.iter_views(FastqGzFormat):
        # Only the final path component is used on both sides.
        source = dirfmt_in.path / relpath.name
        target = dirfmt_out.path / relpath.name
        qiime2.util.duplicate(str(source), str(target))
    return dirfmt_out


@plugin.register_transformer
def _2_and_a_half(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \
        -> FastqManifestFormat:
    # Writes one "sample-id,filename,direction" row per fastq.gz view, with
    # the sample id and direction parsed from the Casava file name.
    manifest = FastqManifestFormat()
    with manifest.open() as handle:
        handle.write('sample-id,filename,direction\n')
        for relpath, _view in dirfmt.sequences.iter_views(FastqGzFormat):
            fields = _util_parse_casava_filename(relpath)
            sample_id, direction = fields[0], fields[-1]
            row = '{},{},{}\n'.format(sample_id, relpath.name, direction)
            handle.write(row)
    return manifest


@plugin.register_transformer
def _3(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \
        -> SingleLanePerSampleSingleEndFastqDirFmt:
    # Casava (with lane field) -> single-end directory format, delegated to
    # the shared helper.
    target_cls = SingleLanePerSampleSingleEndFastqDirFmt
    return _single_lane_per_sample_fastq_helper(dirfmt, target_cls)


@plugin.register_transformer
def _3_and_a_half(dirfmt_in: SingleLanePerSampleSingleEndFastqDirFmt) \
        -> CasavaOneEightSingleLanePerSampleDirFmt:
    # Single-end directory format -> Casava, delegated to the shared helper.
    casava_dirfmt = _util_dirfmt_to_casava(dirfmt_in)
    return casava_dirfmt


@plugin.register_transformer
def _4(dirfmt: CasavaOneEightSingleLanePerSampleDirFmt) \
        -> SingleLanePerSamplePairedEndFastqDirFmt:
    # Casava (with lane field) -> paired-end directory format, delegated to
    # the shared helper.
    target_cls = SingleLanePerSamplePairedEndFastqDirFmt
    return _single_lane_per_sample_fastq_helper(dirfmt, target_cls)


@plugin.register_transformer
def _4_and_a_half(dirfmt_in: SingleLanePerSamplePairedEndFastqDirFmt) \
        -> CasavaOneEightSingleLanePerSampleDirFmt:
    # Paired-end directory format -> Casava, delegated to the shared helper.
    casava_dirfmt = _util_dirfmt_to_casava(dirfmt_in)
    return casava_dirfmt


@plugin.register_transformer
def _10(dirfmt: CasavaOneEightLanelessPerSampleDirFmt) \
-> SingleLanePerSampleSingleEndFastqDirFmt:
Expand Down
12 changes: 12 additions & 0 deletions q2_types/per_sample_sequences/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import unittest
import string

import pandas as pd
from q2_types.per_sample_sequences import (
CasavaOneEightSingleLanePerSampleDirFmt,
CasavaOneEightLanelessPerSampleDirFmt,
Expand Down Expand Up @@ -328,6 +329,17 @@ def test_casava_one_eight_slanepsample_dir_fmt_validate_positive(self):

format.validate()

def test_casava_one_eight_slanepsample_dir_fmt_manifest_property(self):
    # A valid single-file Casava directory must expose its ``manifest``
    # property as a pandas DataFrame.
    filepath = self.get_data_path('Human-Kneecap_S1_L001_R1_001.fastq.gz')
    shutil.copy(filepath, self.temp_dir.name)

    # Named ``dirfmt`` rather than ``format`` to avoid shadowing the builtin.
    dirfmt = CasavaOneEightSingleLanePerSampleDirFmt(
        self.temp_dir.name, mode='r')

    dirfmt.validate()
    self.assertIsInstance(dirfmt.manifest, pd.DataFrame)

def test_casava_one_eight_slanepsample_dir_fmt_validate_negative(self):
filepath = self.get_data_path('not-fastq.fastq.gz')
shutil.copy(filepath, self.temp_dir.name)
Expand Down

0 comments on commit 7afe502

Please sign in to comment.