FIX: a variety of small enhancements and fixes (#62)

Fixes #59 Fixes #58 Fixes #57 Fixes #53 Addresses most of #54
qiime2 · Apr 11, 2017 · e482731 · e482731
1 parent efdb98d
commit e482731
Show file tree

Hide file tree

Showing 9 changed files with 198 additions and 50 deletions.
diff --git a/q2_feature_classifier/_blast.py b/q2_feature_classifier/_blast.py
@@ -15,13 +15,14 @@
                                     _get_default_unassignable_label)
 
 
-def blast(query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,
-          reference_taxonomy: pd.Series, maxaccepts: int=10,
-          perc_identity: float=0.8, strand: str='both', evalue: float=0.001,
-          min_consensus: float=0.51,
-          unassignable_label: str=_get_default_unassignable_label(),
-          num_threads: str=1) -> pd.DataFrame:
-
+def classify_consensus_blast(query: DNAFASTAFormat,
+                             reference_reads: DNAFASTAFormat,
+                             reference_taxonomy: pd.Series, maxaccepts: int=10,
+                             perc_identity: float=0.8, strand: str='both',
+                             evalue: float=0.001, min_consensus: float=0.51,
+                             unassignable_label: str=
+                             _get_default_unassignable_label(),
+                             num_threads: str=1) -> pd.DataFrame:
     perc_identity = perc_identity * 100
     seqs_fp = str(query)
     ref_fp = str(reference_reads)
@@ -38,7 +39,7 @@ def blast(query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,
 
 
 plugin.methods.register_function(
-    function=blast,
+    function=classify_consensus_blast,
     inputs={'query': FeatureData[Sequence],
             'reference_reads': FeatureData[Sequence],
             'reference_taxonomy': FeatureData[Taxonomy]},

diff --git a/q2_feature_classifier/_cutter.py b/q2_feature_classifier/_cutter.py
@@ -115,8 +115,7 @@ def _gen_reads(sequences: DNAIterator,  f_primer: str, r_primer: str,
             continue
         if length > 0:
             amp = amp[:length]
-        if len(amp) > 0:
-            yield amp
+        yield amp
 
 
 def extract_reads(sequences: DNAIterator,  f_primer: str, r_primer: str,
@@ -158,6 +157,6 @@ def extract_reads(sequences: DNAIterator,  f_primer: str, r_primer: str,
                 'r_primer': Str,
                 'identity': Float},
     outputs=[('reads', FeatureData[Sequence])],
-    name='Extract reads from reference.',
+    name='Extract reads from reference',
     description='Extract sequencing-like reads from a reference database.'
 )
diff --git a/q2_feature_classifier/_skl.py b/q2_feature_classifier/_skl.py
@@ -38,8 +38,6 @@ def _extract_reads(reads):
 
 def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
             pre_dispatch='2*n_jobs', confidence=-1.):
-    if confidence >= 0.:
-        chunk_size = chunk_size // 101 + 1
     return (m for c in Parallel(n_jobs=n_jobs, batch_size=1,
                                 pre_dispatch=pre_dispatch)
             (delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
@@ -91,6 +89,7 @@ def _predict_chunk_with_conf(pipeline, separator, confidence, chunk):
 
 
 def _chunks(reads, chunk_size):
+    reads = iter(reads)
     while True:
         chunk = list(islice(reads, chunk_size))
         if len(chunk) == 0:

diff --git a/q2_feature_classifier/_vsearch.py b/q2_feature_classifier/_vsearch.py
@@ -15,13 +15,15 @@
                                     _get_default_unassignable_label)
 
 
-def vsearch(query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,
-            reference_taxonomy: pd.Series, maxaccepts: int=10,
-            perc_identity: int=0.8, strand: str='both',
-            min_consensus: float=0.51,
-            unassignable_label: str=_get_default_unassignable_label(),
-            threads: str=1) -> pd.DataFrame:
-
+def classify_consensus_vsearch(query: DNAFASTAFormat,
+                               reference_reads: DNAFASTAFormat,
+                               reference_taxonomy: pd.Series,
+                               maxaccepts: int=10,
+                               perc_identity: int=0.8, strand: str='both',
+                               min_consensus: float=0.51,
+                               unassignable_label: str=
+                               _get_default_unassignable_label(),
+                               threads: str=1) -> pd.DataFrame:
     seqs_fp = str(query)
     ref_fp = str(reference_reads)
     cmd = ['vsearch', '--usearch_global', seqs_fp, '--id', str(perc_identity),
@@ -35,7 +37,7 @@ def vsearch(query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,
 
 
 plugin.methods.register_function(
-    function=vsearch,
+    function=classify_consensus_vsearch,
     inputs={'query': FeatureData[Sequence],
             'reference_reads': FeatureData[Sequence],
             'reference_taxonomy': FeatureData[Taxonomy]},

diff --git a/q2_feature_classifier/classifier.py b/q2_feature_classifier/classifier.py
@@ -10,12 +10,14 @@
 import importlib
 import inspect
 import warnings
+from itertools import chain, islice
 
 import pandas as pd
-from qiime2.plugin import Int, Str, Float, Bool
+from qiime2.plugin import Int, Str, Float, Bool, Choices
 from q2_types.feature_data import FeatureData, Taxonomy, Sequence, DNAIterator
 from sklearn.pipeline import Pipeline
 import sklearn
+from numpy import median, array
 
 from ._skl import fit_pipeline, predict, _specific_fitters
 from ._taxonomic_classifier import TaxonomicClassifier
@@ -94,8 +96,9 @@ def warn_about_sklearn():
     warnings.warn(warning, UserWarning)
 
 
-def fit_classifier(reference_reads: DNAIterator, reference_taxonomy: pd.Series,
-                   classifier_specification: str) -> Pipeline:
+def fit_classifier_sklearn(reference_reads: DNAIterator,
+                           reference_taxonomy: pd.Series,
+                           classifier_specification: str) -> Pipeline:
     warn_about_sklearn()
     spec = json.loads(classifier_specification)
     pipeline = pipeline_from_spec(spec)
@@ -104,20 +107,41 @@ def fit_classifier(reference_reads: DNAIterator, reference_taxonomy: pd.Series,
 
 
 plugin.methods.register_function(
-    function=fit_classifier,
+    function=fit_classifier_sklearn,
     inputs={'reference_reads': FeatureData[Sequence],
             'reference_taxonomy': FeatureData[Taxonomy]},
     parameters={'classifier_specification': Str},
     outputs=[('classifier', TaxonomicClassifier)],
-    name='Train a scikit-learn classifier.',
+    name='Train an almost arbitrary scikit-learn classifier',
     description='Train a scikit-learn classifier to classify reads.'
 )
 
 
-def classify(reads: DNAIterator, classifier: Pipeline,
-             chunk_size: int=262144, n_jobs: int=1,
-             pre_dispatch: str='2*n_jobs', confidence: float=-1.
-             ) -> pd.DataFrame:
+def _autodetect_orientation(reads, classifier, n=100,
+                            read_orientation=None):
+    if read_orientation == 'same':
+        return reads
+    if read_orientation == 'reverse-complement':
+        return (r.reverse_complement() for r in reads)
+    reads = iter(reads)
+    first_n_reads = list(islice(reads, n))
+    result = list(zip(*predict(first_n_reads, classifier, confidence=0.)))
+    _, _, same_confidence = result
+    reversed_n_reads = [r.reverse_complement() for r in first_n_reads]
+    result = list(zip(*predict(reversed_n_reads, classifier, confidence=0.)))
+    _, _, reverse_confidence = result
+    if median(array(same_confidence) - array(reverse_confidence)) > 0.:
+        return chain(first_n_reads, reads)
+    return chain(reversed_n_reads, (r.reverse_complement() for r in reads))
+
+
+def classify_sklearn(reads: DNAIterator, classifier: Pipeline,
+                     chunk_size: int=262144, n_jobs: int=1,
+                     pre_dispatch: str='2*n_jobs', confidence: float=-1.,
+                     read_orientation: str=None
+                     ) -> pd.DataFrame:
+    reads = _autodetect_orientation(
+        reads, classifier, read_orientation=read_orientation)
     predictions = predict(reads, classifier, chunk_size=chunk_size,
                           n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                           confidence=confidence)
@@ -133,20 +157,29 @@ def classify(reads: DNAIterator, classifier: Pipeline,
 
 
 _classify_parameters = {'chunk_size': Int, 'n_jobs': Int, 'pre_dispatch': Str,
-                        'confidence': Float}
+                        'confidence': Float, 'read_orientation':
+                        Str % Choices(['same', 'reverse-complement'])}
 
 
 plugin.methods.register_function(
-    function=classify,
+    function=classify_sklearn,
     inputs={'reads': FeatureData[Sequence],
             'classifier': TaxonomicClassifier},
     parameters=_classify_parameters,
     outputs=[('classification', FeatureData[Taxonomy])],
-    name='Classify reads by taxon.',
+    name='Pre-fitted sklearn-based taxonomy classifier',
     description='Classify reads by taxon using a fitted classifier.',
     parameter_descriptions={'confidence': 'Confidence threshold for limiting '
-                            'taxonomic depth. Negative value disables. '
-                            'Currently experimental. USE WITH CAUTION.'}
+                            'taxonomic depth. Negative value disables '
+                            'Currently experimental. USE WITH CAUTION',
+                            'read_orientation': 'Direction of reads with '
+                            'respect to reference sequences. same will cause '
+                            'reads to be classified unchanged; '
+                            'reverse-complement will cause reads to be '
+                            'reversed and complemented prior to '
+                            'classification. Default is to autodetect based on'
+                            ' the confidence estimates for the first 100 reads'
+                            }
 )
 
 
@@ -205,8 +238,8 @@ def generic_fitter(reference_reads: DNAIterator,
                 'reference_taxonomy': FeatureData[Taxonomy]},
         parameters=parameters,
         outputs=[('classifier', TaxonomicClassifier)],
-        name='Train the ' + name + ' classifier.',
-        description='Create a ' + name + ' classifier for reads'
+        name='Train the ' + name + ' classifier',
+        description='Create a scikit-learn ' + name + ' classifier for reads'
     )
 
 

diff --git a/q2_feature_classifier/custom.py b/q2_feature_classifier/custom.py
@@ -15,7 +15,7 @@
 
 class LowMemoryMultinomialNB(MultinomialNB):
     def __init__(self, alpha=1.0, fit_prior=True, class_prior=None,
-                 chunk_size=-1):
+                 chunk_size=20000):
         self.chunk_size = chunk_size
         super().__init__(alpha=alpha, fit_prior=fit_prior,
                          class_prior=class_prior)
@@ -25,13 +25,14 @@ def fit(self, X, y, sample_weight=None):
             return super().fit(X, y, sample_weight=sample_weight)
 
         classes = numpy.unique(y)
-        for i in range(0, X.shape[1], self.chunk_size):
-            cX = X[i:i+self.chunk_size]
-            cy = y[i:i+self.chunk_size]
+        for i in range(0, X.shape[0], self.chunk_size):
+            upper = min(i+self.chunk_size, X.shape[0])
+            cX = X[i:upper]
+            cy = y[i:upper]
             if sample_weight is None:
                 csample_weight = None
             else:
-                csample_weight = sample_weight[i:i+self.chunk_size]
+                csample_weight = sample_weight[i:upper]
             self.partial_fit(cX, cy, sample_weight=csample_weight,
                              classes=classes)
 

diff --git a/q2_feature_classifier/tests/test_classifier.py b/q2_feature_classifier/tests/test_classifier.py
@@ -7,12 +7,16 @@
 # ----------------------------------------------------------------------------
 
 import json
+import os
 
 from qiime2.sdk import Artifact
 from qiime2.plugins import feature_classifier
 import pandas as pd
+import skbio
 
 from q2_feature_classifier._skl import _specific_fitters
+from q2_feature_classifier.classifier import spec_from_pipeline, \
+    pipeline_from_spec
 
 from . import FeatureClassifierTestPluginBase
 
@@ -42,10 +46,10 @@ def test_fit_classifier(self):
               {'__type__': 'naive_bayes.MultinomialNB',
                'alpha': 0.01}]]
         classifier_specification = json.dumps(classifier_specification)
-        fit_classifier = feature_classifier.methods.fit_classifier
+        fit_classifier = feature_classifier.methods.fit_classifier_sklearn
         result = fit_classifier(reads, self.taxonomy, classifier_specification)
 
-        classify = feature_classifier.methods.classify
+        classify = feature_classifier.methods.classify_sklearn
         result = classify(reads, result.classifier)
 
         ref = self.taxonomy.view(pd.Series).to_dict()
@@ -58,8 +62,8 @@ def test_fit_classifier(self):
 
     def test_fit_specific_classifiers(self):
         # specific and general classifiers should produce the same results
-        gen_fitter = feature_classifier.methods.fit_classifier
-        classify = feature_classifier.methods.classify
+        gen_fitter = feature_classifier.methods.fit_classifier_sklearn
+        classify = feature_classifier.methods.classify_sklearn
         reads = Artifact.import_data(
             'FeatureData[Sequence]',
             self.get_data_path('se-dna-sequences.fasta'))
@@ -76,3 +80,53 @@ def test_fit_specific_classifiers(self):
             sc = result.classification.view(pd.Series).to_dict()
             for taxon in gc:
                 self.assertEqual(gc[taxon], sc[taxon])
+
+    def test_pipeline_serialisation(self):
+        # pipeline inflation and deflation should be inverse operations
+        for name, spec in _specific_fitters:
+            pipeline = pipeline_from_spec(spec)
+            spec_one = spec_from_pipeline(pipeline)
+            pipeline = pipeline_from_spec(spec_one)
+            spec_two = spec_from_pipeline(pipeline)
+            self.assertEqual(spec_one, spec_two)
+
+    def test_classify(self):
+        # test read direction detection and parallel classification
+        classify = feature_classifier.methods.classify_sklearn
+        seq_path = self.get_data_path('se-dna-sequences.fasta')
+        reads = Artifact.import_data('FeatureData[Sequence]', seq_path)
+        raw_reads = skbio.io.read(
+            seq_path, format='fasta', constructor=skbio.DNA)
+        rev_path = os.path.join(self.temp_dir.name, 'rev-dna-sequences.fasta')
+        skbio.io.write((s.reverse_complement() for s in raw_reads),
+                       'fasta', rev_path)
+        rev_reads = Artifact.import_data('FeatureData[Sequence]', rev_path)
+
+        fitter_name = _specific_fitters[0][0]
+        fitter = getattr(feature_classifier.methods,
+                         'fit_classifier_' + fitter_name)
+        classifier = fitter(reads, self.taxonomy).classifier
+
+        result = classify(reads, classifier)
+        fc = result.classification.view(pd.Series).to_dict()
+        result = classify(rev_reads, classifier)
+        rc = result.classification.view(pd.Series).to_dict()
+
+        for taxon in fc:
+            self.assertEqual(fc[taxon], rc[taxon])
+
+        result = classify(reads, classifier,
+                          read_orientation='same')
+        fc = result.classification.view(pd.Series).to_dict()
+        result = classify(rev_reads, classifier,
+                          read_orientation='reverse-complement')
+        rc = result.classification.view(pd.Series).to_dict()
+
+        for taxon in fc:
+            self.assertEqual(fc[taxon], rc[taxon])
+
+        result = classify(reads, classifier, chunk_size=100, n_jobs=2)
+        cc = result.classification.view(pd.Series).to_dict()
+
+        for taxon in fc:
+            self.assertEqual(fc[taxon], cc[taxon])
diff --git a/q2_feature_classifier/tests/test_consensus_assignment.py b/q2_feature_classifier/tests/test_consensus_assignment.py
@@ -7,8 +7,8 @@
 # ----------------------------------------------------------------------------
 
 import pandas as pd
-from q2_feature_classifier._blast import blast
-from q2_feature_classifier._vsearch import vsearch
+from q2_feature_classifier._blast import classify_consensus_blast
+from q2_feature_classifier._vsearch import classify_consensus_vsearch
 from q2_feature_classifier._consensus_assignment import (
     _compute_consensus_annotation,
     _compute_consensus_annotations,
@@ -31,7 +31,8 @@ def setUp(self):
     # Make sure blast and vsearch produce expected outputs
     # but there is no "right" taxonomy assignment.
     def test_blast(self):
-        result = blast(self.reads, self.reads, self.taxonomy)
+        result = classify_consensus_blast(self.reads, self.reads,
+                                          self.taxonomy)
         res = result.Taxon.to_dict()
         tax = self.taxonomy.to_dict()
         right = 0.
@@ -40,7 +41,8 @@ def test_blast(self):
         self.assertGreater(right/len(res), 0.5)
 
     def test_vsearch(self):
-        result = vsearch(self.reads, self.reads, self.taxonomy)
+        result = classify_consensus_vsearch(self.reads, self.reads,
+                                            self.taxonomy)
         res = result.Taxon.to_dict()
         tax = self.taxonomy.to_dict()
         right = 0.