Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 19 additions & 33 deletions qiita_files/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,11 @@
import numpy as np
from future.utils import viewitems, viewvalues
from future.builtins import zip
from skbio.parse.sequences import load
from skbio.format.sequences import format_fastq_record

from .util import open_file
from qiita_files.parse import load
from qiita_files.format.fasta import format_fasta_record
from qiita_files.format.fastq import format_fastq_record
from qiita_files.util import open_file


# track some basic stats about the samples
Expand All @@ -72,6 +73,8 @@
'barcode_error': 'barcode/error',
'qual': 'qual'}

dset_paths_bytes = {k: v.encode('ascii') for k, v in dset_paths.items()}


class _buffer(object):
"""Buffer baseclass that sits on top of an HDF5 dataset
Expand Down Expand Up @@ -386,26 +389,6 @@ def to_hdf5(fp, h5file, max_barcode_length=12):
buffers[pjoin(dset_paths['qual'])].write(qual)


def format_fasta_record(seqid, seq, qual):
"""Format a fasta record

Parameters
----------
seqid : str
The sequence ID
seq : str
The sequence
qual : ignored
This is ignored

Returns
-------
str
A formatted sequence record
"""
return b'\n'.join([b'>' + seqid, seq, b''])


def to_ascii(demux, samples=None):
"""Consume a demuxed HDF5 file and yield sequence records

Expand All @@ -429,15 +412,15 @@ def to_ascii(demux, samples=None):
else:
formatter = format_fasta_record

id_fmt = ("%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
"bc_diffs=%(bc_diff)d")
id_fmt = (b"%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
b"bc_diffs=%(bc_diff)d")

if samples is None:
samples = demux.keys()

for samp, idx, seq, qual, bc_ori, bc_cor, bc_err in fetch(demux, samples):
seq_id = id_fmt % {'sample': samp, 'idx': idx, 'bc_ori': bc_ori,
'bc_cor': bc_cor, 'bc_diff': bc_err}
seq_id = id_fmt % {b'sample': samp, b'idx': idx, b'bc_ori': bc_ori,
b'bc_cor': bc_cor, b'bc_diff': bc_err}
if qual != []:
qual = qual.astype(np.uint8)

Expand Down Expand Up @@ -468,6 +451,7 @@ def to_per_sample_ascii(demux, samples=None):
samples = demux.keys()

for samp in samples:
samp = samp.encode()
yield samp, to_ascii(demux, samples=[samp])


Expand Down Expand Up @@ -513,20 +497,22 @@ def fetch(demux, samples=None, k=None):
indices = np.logical_not(indices)
indices[to_keep[:k]] = True

seqs = demux[pjoin(dset_paths['sequence'])][indices]
seqs = demux[pjoin(dset_paths_bytes['sequence'])][indices]

# only yield qual if we have it
quals = repeat([])
if demux.attrs['has-qual']:
if len(indices) == 1:
if indices[0]:
quals = demux[pjoin(dset_paths['qual'])][:]
quals = demux[pjoin(dset_paths_bytes['qual'])][:]
else:
quals = demux[pjoin(dset_paths['qual'])][indices, :]
quals = demux[pjoin(dset_paths_bytes['qual'])][indices, :]

bc_original = demux[pjoin(dset_paths['barcode_original'])][indices]
bc_corrected = demux[pjoin(dset_paths['barcode_corrected'])][indices]
bc_error = demux[pjoin(dset_paths['barcode_error'])][indices]
bc_original = demux[
pjoin(dset_paths_bytes['barcode_original'])][indices]
bc_corrected = demux[
pjoin(dset_paths_bytes['barcode_corrected'])][indices]
bc_error = demux[pjoin(dset_paths_bytes['barcode_error'])][indices]

iter_ = zip(repeat(sample), np.arange(indices.size)[indices], seqs,
quals, bc_original, bc_corrected, bc_error)
Expand Down
Empty file added qiita_files/format/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions qiita_files/format/fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------


def format_fasta_record(seqid, seq, qual):
"""Format a fasta record

Parameters
----------
seqid : str
The sequence ID
seq : str
The sequence
qual : ignored
This is ignored

Returns
-------
str
A formatted sequence record
"""
return b'\n'.join([b'>' + seqid, seq, b''])
50 changes: 50 additions & 0 deletions qiita_files/format/fastq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------


def _phred_to_ascii(a, offset):
"""Convert Phred quality score to ASCII character with specified offset"""
return (a + offset).tobytes()


def _phred_to_ascii33(a):
"""Convert Phred quality score to ASCII character with offset of 33"""
return _phred_to_ascii(a, 33)


def _phred_to_ascii64(a):
"""Convert Phred quality score to ASCII character with offset of 64"""
return _phred_to_ascii(a, 64)


def format_fastq_record(seqid, seq, qual, phred_offset=33):
"""Format a FASTQ record

Parameters
----------
seqid : bytes
The sequence ID
seq : bytes
The sequence
qual : np.array of int8
The quality scores
phred_offset : int, either 33 or 64
Set a phred offset

Returns
-------
bytes : a string representation of a single FASTQ record
"""
if phred_offset == 33:
phred_f = _phred_to_ascii33
elif phred_offset == 64:
phred_f = _phred_to_ascii64
else:
raise ValueError("Unknown phred offset: %d" % phred_offset)

return b'\n'.join([b"@" + seqid, seq, b'+', phred_f(qual), b''])
Empty file.
20 changes: 20 additions & 0 deletions qiita_files/format/tests/test_fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from unittest import TestCase, main
from qiita_files.format.fasta import format_fasta_record


class FastaTests(TestCase):
def test_format_fasta_record(self):
exp = b">a\nxyz\n"
obs = format_fasta_record(b"a", b"xyz", b'ignored')
self.assertEqual(obs, exp)

if __name__ == '__main__':
main()
44 changes: 44 additions & 0 deletions qiita_files/format/tests/test_fastq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from unittest import TestCase, main
import numpy as np

from qiita_files.format.fastq import (format_fastq_record, _phred_to_ascii33,
_phred_to_ascii64)


class FastqTests(TestCase):
def setUp(self):
self.qual_scores = np.array([38, 39, 40], dtype=np.int8)
self.args = (b'abc', b'def', self.qual_scores)

def test_format_fastq_record_phred_offset_33(self):
exp = b"@abc\ndef\n+\nGHI\n"
obs = format_fastq_record(*self.args, phred_offset=33)
self.assertEqual(obs, exp)

def test_format_fastq_record_phred_offset_64(self):
exp = b"@abc\ndef\n+\nfgh\n"
obs = format_fastq_record(*self.args, phred_offset=64)
self.assertEqual(obs, exp)

def test_format_fastq_record_invalid_phred_offset(self):
with self.assertRaises(ValueError):
format_fastq_record(*self.args, phred_offset=42)

def test_phred_to_ascii33(self):
obs = _phred_to_ascii33(self.qual_scores)
self.assertEqual(obs, b'GHI')

def test_phred_to_ascii64(self):
obs = _phred_to_ascii64(self.qual_scores)
self.assertEqual(obs, b'fgh')

if __name__ == '__main__':
main()
139 changes: 139 additions & 0 deletions qiita_files/parse/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from itertools import chain
from gzip import open as gzip_open
from os.path import exists

from .iterator import FastaIterator, FastqIterator

FILEEXT_MAP = {'fna': (FastaIterator, open),
'fna.gz': (FastaIterator, gzip_open),
'fasta': (FastaIterator, open),
'fasta.gz': (FastaIterator, gzip_open),
'qual': (FastaIterator, open),
'qual.gz': (FastaIterator, gzip_open),
'fastq': (FastqIterator, open),
'fastq.gz': (FastqIterator, gzip_open),
'fq': (FastqIterator, open),
'fq.gz': (FastqIterator, gzip_open)}


def _determine_types_and_openers(files):
"""Attempt to determine the appropriate iterators and openers"""
if files is None:
return [], []

iters = []
openers = []
for fpath in files:
if fpath.endswith('.gz'):
ext = '.'.join(fpath.rsplit('.', 2)[-2:])
else:
ext = fpath.rsplit('.', 1)[-1]

i, o = FILEEXT_MAP.get(ext, (None, None))
if i is None:
raise IOError("Unknown filetype for %s" % fpath)

iters.append(i)
openers.append(o)

return iters, openers


def _is_single_iterator_type(iters):
"""Determine if there is a single or multiple type of iterator
If iters is [], this method returns True it considers the null case to be
a single iterator type.
"""
if iters:
return len(set(iters)) == 1
else:
return True


def _open_or_none(opener, f):
"""Open a file or returns None"""
if not opener:
return None
else:
name = opener.__name__

if not exists(f):
raise IOError("%s does not appear to exist!" % f)
try:
opened = opener(f)
except IOError:
raise IOError("Could not open %s with %s!" % (f, name))

return opened


def load(seqs, qual=None, constructor=None, **kwargs):
"""Construct the appropriate iterator for all your processing needs
This method will attempt to open all files correctly and to feed the
appropriate objects into the correct iterators.
Seqs can list multiple types of files (e.g., FASTA and FASTQ), but if
multiple file types are specified, qual must be None
Parameters
----------
seqs : str or list of sequence file paths
qual : str or list of qual file paths or None
constructor : force a constructor on seqs
kwargs : dict
passed into the subsequent generators.
Returns
-------
SequenceIterator
the return is ``Iterable``
See Also
--------
SequenceIterator
FastaIterator
FastqIterator
"""
if not seqs:
raise ValueError("Must supply sequences.")

if isinstance(seqs, str):
seqs = [seqs]

if isinstance(qual, str):
qual = [qual]

# i -> iters, o -> openers
if constructor is not None:
i_seqs = [constructor] * len(seqs)
o_seqs = [open] * len(seqs)
else:
i_seqs, o_seqs = _determine_types_and_openers(seqs)

i_qual, o_qual = _determine_types_and_openers(qual)

seqs = [_open_or_none(o, f) for f, o in zip(seqs, o_seqs)]
qual = [_open_or_none(o, f) for f, o in zip(qual or [], o_qual or [])]

if not qual:
qual = None

if not _is_single_iterator_type(i_seqs) and qual is not None:
# chaining Fasta/Fastq for sequence is easy, but it gets nasty quick
# if seqs is a mix of fasta/fastq, with qual coming in as there aren't
# 1-1 mappings. This could be addressed if necessary, but seems like
# an unnecessary block of code right now
raise ValueError("Cannot handle multiple sequence file types and qual "
"file(s) at the same time.")

if _is_single_iterator_type(i_seqs):
seqs_constructor = i_seqs[0]
gen = seqs_constructor(seq=seqs, qual=qual, **kwargs)
else:
gen = chain(*[c(seq=[fp], **kwargs) for c, fp in zip(i_seqs, seqs)])

return gen
Loading