qiita-spots · antgonza · Oct 10, 2016 · Oct 9, 2016 · Oct 9, 2016 · Oct 9, 2016
diff --git a/qiita_files/demux.py b/qiita_files/demux.py
@@ -56,10 +56,11 @@
 import numpy as np
 from future.utils import viewitems, viewvalues
 from future.builtins import zip
-from skbio.parse.sequences import load
-from skbio.format.sequences import format_fastq_record
 
-from .util import open_file
+from qiita_files.parse import load
+from qiita_files.format.fasta import format_fasta_record
+from qiita_files.format.fastq import format_fastq_record
+from qiita_files.util import open_file
 
 
 # track some basic stats about the samples
@@ -72,6 +73,8 @@
               'barcode_error': 'barcode/error',
               'qual': 'qual'}
 
+dset_paths_bytes = {k: v.encode('ascii') for k, v in dset_paths.items()}
+
 
 class _buffer(object):
     """Buffer baseclass that sits on top of an HDF5 dataset
@@ -386,26 +389,6 @@ def to_hdf5(fp, h5file, max_barcode_length=12):
             buffers[pjoin(dset_paths['qual'])].write(qual)
 
 
-def format_fasta_record(seqid, seq, qual):
-    """Format a fasta record
-
-    Parameters
-    ----------
-    seqid : str
-        The sequence ID
-    seq : str
-        The sequence
-    qual : ignored
-        This is ignored
-
-    Returns
-    -------
-    str
-        A formatted sequence record
-    """
-    return b'\n'.join([b'>' + seqid, seq, b''])
-
-
 def to_ascii(demux, samples=None):
     """Consume a demuxed HDF5 file and yield sequence records
 
@@ -429,15 +412,15 @@ def to_ascii(demux, samples=None):
     else:
         formatter = format_fasta_record
 
-    id_fmt = ("%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
-              "bc_diffs=%(bc_diff)d")
+    id_fmt = (b"%(sample)s_%(idx)d orig_bc=%(bc_ori)s new_bc=%(bc_cor)s "
+              b"bc_diffs=%(bc_diff)d")
 
     if samples is None:
         samples = demux.keys()
 
     for samp, idx, seq, qual, bc_ori, bc_cor, bc_err in fetch(demux, samples):
-        seq_id = id_fmt % {'sample': samp, 'idx': idx, 'bc_ori': bc_ori,
-                           'bc_cor': bc_cor, 'bc_diff': bc_err}
+        seq_id = id_fmt % {b'sample': samp, b'idx': idx, b'bc_ori': bc_ori,
+                           b'bc_cor': bc_cor, b'bc_diff': bc_err}
         if qual != []:
             qual = qual.astype(np.uint8)
 
@@ -468,6 +451,7 @@ def to_per_sample_ascii(demux, samples=None):
         samples = demux.keys()
 
     for samp in samples:
+        samp = samp.encode()
         yield samp, to_ascii(demux, samples=[samp])
 
 
@@ -513,20 +497,22 @@ def fetch(demux, samples=None, k=None):
             indices = np.logical_not(indices)
             indices[to_keep[:k]] = True
 
-        seqs = demux[pjoin(dset_paths['sequence'])][indices]
+        seqs = demux[pjoin(dset_paths_bytes['sequence'])][indices]
 
         # only yield qual if we have it
         quals = repeat([])
         if demux.attrs['has-qual']:
             if len(indices) == 1:
                 if indices[0]:
-                    quals = demux[pjoin(dset_paths['qual'])][:]
+                    quals = demux[pjoin(dset_paths_bytes['qual'])][:]
             else:
-                quals = demux[pjoin(dset_paths['qual'])][indices, :]
+                quals = demux[pjoin(dset_paths_bytes['qual'])][indices, :]
 
-        bc_original = demux[pjoin(dset_paths['barcode_original'])][indices]
-        bc_corrected = demux[pjoin(dset_paths['barcode_corrected'])][indices]
-        bc_error = demux[pjoin(dset_paths['barcode_error'])][indices]
+        bc_original = demux[
+            pjoin(dset_paths_bytes['barcode_original'])][indices]
+        bc_corrected = demux[
+            pjoin(dset_paths_bytes['barcode_corrected'])][indices]
+        bc_error = demux[pjoin(dset_paths_bytes['barcode_error'])][indices]
 
         iter_ = zip(repeat(sample), np.arange(indices.size)[indices], seqs,
                     quals, bc_original, bc_corrected, bc_error)

diff --git a/qiita_files/format/__init__.py b/qiita_files/format/__init__.py
diff --git a/qiita_files/format/fasta.py b/qiita_files/format/fasta.py
@@ -0,0 +1,27 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2014--, The Qiita Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# -----------------------------------------------------------------------------
+
+
+def format_fasta_record(seqid, seq, qual):
+    """Format a fasta record
+
+    Parameters
+    ----------
+    seqid : str
+        The sequence ID
+    seq : str
+        The sequence
+    qual : ignored
+        This is ignored
+
+    Returns
+    -------
+    str
+        A formatted sequence record
+    """
+    return b'\n'.join([b'>' + seqid, seq, b''])
diff --git a/qiita_files/format/fastq.py b/qiita_files/format/fastq.py
@@ -0,0 +1,50 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2014--, The Qiita Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# -----------------------------------------------------------------------------
+
+
+def _phred_to_ascii(a, offset):
+    """Convert Phred quality score to ASCII character with specified offset"""
+    return (a + offset).tobytes()
+
+
+def _phred_to_ascii33(a):
+    """Convert Phred quality score to ASCII character with offset of 33"""
+    return _phred_to_ascii(a, 33)
+
+
+def _phred_to_ascii64(a):
+    """Convert Phred quality score to ASCII character with offset of 64"""
+    return _phred_to_ascii(a, 64)
+
+
+def format_fastq_record(seqid, seq, qual, phred_offset=33):
+    """Format a FASTQ record
+
+    Parameters
+    ----------
+    seqid : bytes
+        The sequence ID
+    seq : bytes
+        The sequence
+    qual : np.array of int8
+        The quality scores
+    phred_offset : int, either 33 or 64
+        Set a phred offset
+
+    Returns
+    -------
+    bytes : a string representation of a single FASTQ record
+    """
+    if phred_offset == 33:
+        phred_f = _phred_to_ascii33
+    elif phred_offset == 64:
+        phred_f = _phred_to_ascii64
+    else:
+        raise ValueError("Unknown phred offset: %d" % phred_offset)
+
+    return b'\n'.join([b"@" + seqid, seq, b'+', phred_f(qual), b''])
diff --git a/qiita_files/format/tests/__init__.py b/qiita_files/format/tests/__init__.py
diff --git a/qiita_files/format/tests/test_fasta.py b/qiita_files/format/tests/test_fasta.py
@@ -0,0 +1,20 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2014--, The Qiita Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# -----------------------------------------------------------------------------
+
+from unittest import TestCase, main
+from qiita_files.format.fasta import format_fasta_record
+
+
+class FastaTests(TestCase):
+    def test_format_fasta_record(self):
+        exp = b">a\nxyz\n"
+        obs = format_fasta_record(b"a", b"xyz", b'ignored')
+        self.assertEqual(obs, exp)
+
+if __name__ == '__main__':
+    main()
diff --git a/qiita_files/format/tests/test_fastq.py b/qiita_files/format/tests/test_fastq.py
@@ -0,0 +1,44 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2014--, The Qiita Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# -----------------------------------------------------------------------------
+
+from unittest import TestCase, main
+import numpy as np
+
+from qiita_files.format.fastq import (format_fastq_record, _phred_to_ascii33,
+                                      _phred_to_ascii64)
+
+
+class FastqTests(TestCase):
+    def setUp(self):
+        self.qual_scores = np.array([38, 39, 40], dtype=np.int8)
+        self.args = (b'abc', b'def', self.qual_scores)
+
+    def test_format_fastq_record_phred_offset_33(self):
+        exp = b"@abc\ndef\n+\nGHI\n"
+        obs = format_fastq_record(*self.args, phred_offset=33)
+        self.assertEqual(obs, exp)
+
+    def test_format_fastq_record_phred_offset_64(self):
+        exp = b"@abc\ndef\n+\nfgh\n"
+        obs = format_fastq_record(*self.args, phred_offset=64)
+        self.assertEqual(obs, exp)
+
+    def test_format_fastq_record_invalid_phred_offset(self):
+        with self.assertRaises(ValueError):
+            format_fastq_record(*self.args, phred_offset=42)
+
+    def test_phred_to_ascii33(self):
+        obs = _phred_to_ascii33(self.qual_scores)
+        self.assertEqual(obs, b'GHI')
+
+    def test_phred_to_ascii64(self):
+        obs = _phred_to_ascii64(self.qual_scores)
+        self.assertEqual(obs, b'fgh')
+
+if __name__ == '__main__':
+    main()
diff --git a/qiita_files/parse/__init__.py b/qiita_files/parse/__init__.py
@@ -0,0 +1,139 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2013--, scikit-bio development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from itertools import chain
+from gzip import open as gzip_open
+from os.path import exists
+
+from .iterator import FastaIterator, FastqIterator
+
+FILEEXT_MAP = {'fna': (FastaIterator, open),
+               'fna.gz': (FastaIterator, gzip_open),
+               'fasta': (FastaIterator, open),
+               'fasta.gz': (FastaIterator, gzip_open),
+               'qual': (FastaIterator, open),
+               'qual.gz': (FastaIterator, gzip_open),
+               'fastq': (FastqIterator, open),
+               'fastq.gz': (FastqIterator, gzip_open),
+               'fq': (FastqIterator, open),
+               'fq.gz': (FastqIterator, gzip_open)}
+
+
+def _determine_types_and_openers(files):
+    """Attempt to determine the appropriate iterators and openers"""
+    if files is None:
+        return [], []
+
+    iters = []
+    openers = []
+    for fpath in files:
+        if fpath.endswith('.gz'):
+            ext = '.'.join(fpath.rsplit('.', 2)[-2:])
+        else:
+            ext = fpath.rsplit('.', 1)[-1]
+
+        i, o = FILEEXT_MAP.get(ext, (None, None))
+        if i is None:
+            raise IOError("Unknown filetype for %s" % fpath)
+
+        iters.append(i)
+        openers.append(o)
+
+    return iters, openers
+
+
+def _is_single_iterator_type(iters):
+    """Determine if there is a single or multiple type of iterator
+    If iters is [], this method returns True it considers the null case to be
+    a single iterator type.
+    """
+    if iters:
+        return len(set(iters)) == 1
+    else:
+        return True
+
+
+def _open_or_none(opener, f):
+    """Open a file or returns None"""
+    if not opener:
+        return None
+    else:
+        name = opener.__name__
+
+        if not exists(f):
+            raise IOError("%s does not appear to exist!" % f)
+        try:
+            opened = opener(f)
+        except IOError:
+            raise IOError("Could not open %s with %s!" % (f, name))
+
+        return opened
+
+
+def load(seqs, qual=None, constructor=None, **kwargs):
+    """Construct the appropriate iterator for all your processing needs
+    This method will attempt to open all files correctly and to feed the
+    appropriate objects into the correct iterators.
+    Seqs can list multiple types of files (e.g., FASTA and FASTQ), but if
+    multiple file types are specified, qual must be None
+    Parameters
+    ----------
+    seqs : str or list of sequence file paths
+    qual : str or list of qual file paths or None
+    constructor : force a constructor on seqs
+    kwargs : dict
+        passed into the subsequent generators.
+    Returns
+    -------
+    SequenceIterator
+        the return is ``Iterable``
+    See Also
+    --------
+    SequenceIterator
+    FastaIterator
+    FastqIterator
+    """
+    if not seqs:
+        raise ValueError("Must supply sequences.")
+
+    if isinstance(seqs, str):
+        seqs = [seqs]
+
+    if isinstance(qual, str):
+        qual = [qual]
+
+    # i -> iters, o -> openers
+    if constructor is not None:
+        i_seqs = [constructor] * len(seqs)
+        o_seqs = [open] * len(seqs)
+    else:
+        i_seqs, o_seqs = _determine_types_and_openers(seqs)
+
+    i_qual, o_qual = _determine_types_and_openers(qual)
+
+    seqs = [_open_or_none(o, f) for f, o in zip(seqs, o_seqs)]
+    qual = [_open_or_none(o, f) for f, o in zip(qual or [], o_qual or [])]
+
+    if not qual:
+        qual = None
+
+    if not _is_single_iterator_type(i_seqs) and qual is not None:
+        # chaining Fasta/Fastq for sequence is easy, but it gets nasty quick
+        # if seqs is a mix of fasta/fastq, with qual coming in as there aren't
+        # 1-1 mappings. This could be addressed if necessary, but seems like
+        # an unnecessary block of code right now
+        raise ValueError("Cannot handle multiple sequence file types and qual "
+                         "file(s) at the same time.")
+
+    if _is_single_iterator_type(i_seqs):
+        seqs_constructor = i_seqs[0]
+        gen = seqs_constructor(seq=seqs, qual=qual, **kwargs)
+    else:
+        gen = chain(*[c(seq=[fp], **kwargs) for c, fp in zip(i_seqs, seqs)])
+
+    return gen