Skip to content

Commit

Permalink
ENH: new demultiplexed illumina formats (#111)
Browse files Browse the repository at this point in the history
This adds the formats:
 - SingleEndFastqManifestPhred33
 - SingleEndFastqManifestPhred64
 - PairedEndFastqManifestPhred33
 - PairedEndFastqManifestPhred64
  • Loading branch information
gregcaporaso authored and ebolyen committed Mar 31, 2017
1 parent ad67afa commit e37a040
Show file tree
Hide file tree
Showing 7 changed files with 793 additions and 5 deletions.
10 changes: 8 additions & 2 deletions q2_types/per_sample_sequences/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
from ._format import (CasavaOneEightSingleLanePerSampleDirFmt, FastqGzFormat,
YamlFormat, FastqManifestFormat,
SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt)
SingleLanePerSamplePairedEndFastqDirFmt,
SingleEndFastqManifestPhred33,
SingleEndFastqManifestPhred64,
PairedEndFastqManifestPhred33,
PairedEndFastqManifestPhred64)
from ._type import SequencesWithQuality, PairedEndSequencesWithQuality
from ._transformer import PerSampleDNAIterators, PerSamplePairedDNAIterators

Expand All @@ -20,6 +24,8 @@
'SingleLanePerSampleSingleEndFastqDirFmt',
'SingleLanePerSamplePairedEndFastqDirFmt', 'SequencesWithQuality',
'PairedEndSequencesWithQuality', 'PerSampleDNAIterators',
'PerSamplePairedDNAIterators']
'PerSamplePairedDNAIterators', 'SingleEndFastqManifestPhred33',
'SingleEndFastqManifestPhred64', 'PairedEndFastqManifestPhred33',
'PairedEndFastqManifestPhred64']

importlib.import_module('q2_types.per_sample_sequences._transformer')
38 changes: 37 additions & 1 deletion q2_types/per_sample_sequences/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,40 @@ def sniff(self):
return False


class FastqAbsolutePathManifestFormat(model.TextFileFormat):
"""
Mapping of sample identifiers to filepaths and read direction.
"""
def sniff(self):
expected_header = 'sample-id,absolute-filepath,direction'
with self.open() as fh:
for line in fh:
line = line.strip()
if line and not line.startswith('#'):
# the first non-blank, non-comment line should be
# the header
return line == expected_header
# never found the header
return False


class SingleEndFastqManifestPhred33(FastqAbsolutePathManifestFormat):
pass


class SingleEndFastqManifestPhred64(FastqAbsolutePathManifestFormat):
pass


class PairedEndFastqManifestPhred33(FastqAbsolutePathManifestFormat):
pass


class PairedEndFastqManifestPhred64(FastqAbsolutePathManifestFormat):
pass


class CasavaOneEightSingleLanePerSampleDirFmt(model.DirectoryFormat):
sequences = model.FileCollection(
r'.+_.+_L[0-9][0-9][0-9]_R[12]_001\.fastq\.gz',
Expand Down Expand Up @@ -92,5 +126,7 @@ class SingleLanePerSamplePairedEndFastqDirFmt(_SingleLanePerSampleFastqDirFmt):
FastqManifestFormat, YamlFormat, FastqGzFormat,
CasavaOneEightSingleLanePerSampleDirFmt, _SingleLanePerSampleFastqDirFmt,
SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt
SingleLanePerSamplePairedEndFastqDirFmt, SingleEndFastqManifestPhred33,
SingleEndFastqManifestPhred64, PairedEndFastqManifestPhred33,
PairedEndFastqManifestPhred64
)
228 changes: 227 additions & 1 deletion q2_types/per_sample_sequences/_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,19 @@
# ----------------------------------------------------------------------------

import os
import warnings
import collections

import skbio
import yaml
import pandas as pd

from ..plugin_setup import plugin
from . import (SingleLanePerSampleSingleEndFastqDirFmt, FastqManifestFormat,
SingleLanePerSamplePairedEndFastqDirFmt, FastqGzFormat,
CasavaOneEightSingleLanePerSampleDirFmt, YamlFormat)
CasavaOneEightSingleLanePerSampleDirFmt, YamlFormat,
SingleEndFastqManifestPhred33, SingleEndFastqManifestPhred64,
PairedEndFastqManifestPhred33, PairedEndFastqManifestPhred64)


class PerSampleDNAIterators(dict):
Expand Down Expand Up @@ -135,3 +140,224 @@ def _5(dirfmt: SingleLanePerSamplePairedEndFastqDirFmt) \
result.metadata.write_data(metadata, YamlFormat)

return result


def _parse_and_validate_manifest(manifest_fh, single_end):
try:
manifest = pd.read_csv(manifest_fh, comment='#', header=0,
skip_blank_lines=True, dtype=object)
except pd.io.common.CParserError as e:
raise ValueError('All records in manifest must contain '
'exactly three comma-separated fields, but it '
'that appears at least one record contains more. '
'Original error message:\n %s' % str(e))

_validate_header(manifest)

for idx in manifest.index:
record = manifest.loc[idx]
if record.isnull().any():
raise ValueError('All records in manifest must contain '
'exactly three comma-separated fields, but at '
'least one contains fewer. The data in that '
'record is: %s' % ','.join(map(str, record)))
record['absolute-filepath'] = \
os.path.expandvars(record['absolute-filepath'])
_validate_path(record['absolute-filepath'])
_validate_direction(record['direction'])

if single_end:
_validate_single_end_fastq_manifest_directions(manifest)
else:
_validate_paired_end_fastq_manifest_directions(manifest)

return manifest


def _write_phred64_to_phred33(phred64_path, phred33_path):
with open(phred64_path, 'rb') as phred64_fh, \
open(phred33_path, 'wb') as phred33_fh:
for seq in skbio.io.read(phred64_fh, format='fastq',
variant='illumina1.3'):
skbio.io.write(seq, into=phred33_fh,
format='fastq',
variant='illumina1.8',
compression='gzip')


def _validate_header(manifest):
header = manifest.columns
if len(header) != 3:
raise ValueError('manifest header must contain exactly three columns.')
expected_header = ['sample-id', 'absolute-filepath', 'direction']
for i, is_correct in enumerate(header == expected_header):
if not is_correct:
raise ValueError('Expected manifest header column "%s" but '
'observed "%s".'
% (expected_header[i], header[i]))


def _validate_path(path):
if not os.path.isabs(path):
raise ValueError('All paths provided in manifest must be absolute '
'but observed: %s' % path)
if not os.path.exists(path):
raise FileNotFoundError(
'A path specified in the manifest does not exist: '
'%s' % path)


def _validate_direction(direction):
if direction not in {'forward', 'reverse'}:
raise ValueError('Direction can only be "forward" or '
'"reverse", but observed: %s' % direction)


def _duplicated_ids(sample_ids):
counts = collections.Counter(sample_ids).most_common()
if len(counts) == 0 or counts[0][1] == 1:
# if there were no sample ids provided, or the most frequent sample id
# was only observed once, there are no duplicates
return []
else:
return [e[0] for e in counts if e[1] > 1]


def _validate_single_end_fastq_manifest_directions(manifest):
directions = set(manifest['direction'])
if len(directions) > 1:
raise ValueError('Manifest for single-end reads can '
'contain only forward or reverse reads '
'but the following directions were observed: %s'
% ', '.join(directions))

duplicated_ids = _duplicated_ids(manifest['sample-id'])
if len(duplicated_ids) > 0:
raise ValueError('Each sample id can only appear one time in a '
'manifest for single-end reads, but the following '
'sample ids were observed more than once: '
'%s' % ', '.join(duplicated_ids))


def _validate_paired_end_fastq_manifest_directions(manifest):
forward_direction_sample_ids = []
reverse_direction_sample_ids = []

for _, sample_id, _, direction in manifest.itertuples():
if direction == 'forward':
forward_direction_sample_ids.append(sample_id)
elif direction == 'reverse':
reverse_direction_sample_ids.append(sample_id)
else:
raise ValueError('Directions can only be "forward" and '
'"reverse", but observed: %s' % direction)

duplicated_ids_forward = _duplicated_ids(forward_direction_sample_ids)
if len(duplicated_ids_forward) > 0:
raise ValueError('Each sample id can have only one forward read '
'record in a paired-end read manifest, but the '
'following sample ids were associated with more '
'than one forward read record: '
'%s' % ', '.join(duplicated_ids_forward))

duplicated_ids_reverse = _duplicated_ids(reverse_direction_sample_ids)
if len(duplicated_ids_reverse) > 0:
raise ValueError('Each sample id can have only one reverse read '
'record in a paired-end read manifest, but the '
'following sample ids were associated with more '
'than one reverse read record: '
'%s' % ', '.join(duplicated_ids_reverse))

if sorted(forward_direction_sample_ids) != \
sorted(reverse_direction_sample_ids):

forward_but_no_reverse = set(forward_direction_sample_ids) - \
set(reverse_direction_sample_ids)

reverse_but_no_forward = set(reverse_direction_sample_ids) - \
set(forward_direction_sample_ids)

if len(forward_but_no_reverse) > 0:
raise ValueError('Forward and reverse reads must be provided '
'exactly one time each for each sample. The '
'following samples had forward but not '
'reverse read fastq files: %s'
% ', '.join(forward_but_no_reverse))
else:
raise ValueError('Forward and reverse reads must be provided '
'exactly one time each for each sample. The '
'following samples had reverse but not '
'forward read fastq files: %s'
% ', '.join(reverse_but_no_forward))


def _fastq_manifest_helper(fmt, fastq_copy_fn, single_end):
direction_to_read_number = {'forward': 1, 'reverse': 2}
input_manifest = _parse_and_validate_manifest(fmt.open(),
single_end=single_end)
if single_end:
result = SingleLanePerSampleSingleEndFastqDirFmt()
else:
result = SingleLanePerSamplePairedEndFastqDirFmt()

output_manifest_data = []
for idx, sample_id, input_fastq_fp, direction in \
input_manifest.itertuples():
read_number = direction_to_read_number[direction]
output_fastq_fp = \
result.sequences.path_maker(sample_id=sample_id,
# the remaining values aren't used
# internally by QIIME, so their values
# aren't very important
barcode_id=idx,
lane_number=1,
read_number=read_number)
output_manifest_data.append(
[sample_id, output_fastq_fp.name, direction])
fastq_copy_fn(input_fastq_fp, str(output_fastq_fp))

output_manifest = FastqManifestFormat()
output_manifest_df = \
pd.DataFrame(output_manifest_data,
columns=['sample-id', 'filename', 'direction'])
output_manifest_df.to_csv(str(output_manifest), index=False)
result.manifest.write_data(output_manifest, FastqManifestFormat)

metadata = YamlFormat()
metadata.path.write_text(yaml.dump({'phred-offset': 33}))
result.metadata.write_data(metadata, YamlFormat)

return result


_phred64_warning = ('Importing of PHRED 64 data is slow as it is converted '
'internally to PHRED 33. Working with the imported data '
'will not be slower than working with PHRED 33 data.')


@plugin.register_transformer
def _6(fmt: SingleEndFastqManifestPhred33) \
-> SingleLanePerSampleSingleEndFastqDirFmt:
return _fastq_manifest_helper(fmt, os.link, single_end=True)


@plugin.register_transformer
def _7(fmt: SingleEndFastqManifestPhred64) \
-> SingleLanePerSampleSingleEndFastqDirFmt:
warnings.warn(_phred64_warning)
return _fastq_manifest_helper(fmt, _write_phred64_to_phred33,
single_end=True)


@plugin.register_transformer
def _8(fmt: PairedEndFastqManifestPhred33) \
-> SingleLanePerSamplePairedEndFastqDirFmt:
return _fastq_manifest_helper(fmt, os.link, single_end=False)


@plugin.register_transformer
def _9(fmt: PairedEndFastqManifestPhred64) \
-> SingleLanePerSamplePairedEndFastqDirFmt:
warnings.warn(_phred64_warning)
return _fastq_manifest_helper(fmt, _write_phred64_to_phred33,
single_end=False)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit e37a040

Please sign in to comment.