Skip to content

Commit

Permalink
FIX: a variety of small enhancements and fixes (#62)
Browse files Browse the repository at this point in the history
Fixes #59
Fixes #58
Fixes #57
Fixes #53

Addresses most of #54
  • Loading branch information
BenKaehler authored and jairideout committed Apr 11, 2017
1 parent efdb98d commit e482731
Show file tree
Hide file tree
Showing 9 changed files with 198 additions and 50 deletions.
17 changes: 9 additions & 8 deletions q2_feature_classifier/_blast.py
Expand Up @@ -15,13 +15,14 @@
_get_default_unassignable_label)


def blast(query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,
reference_taxonomy: pd.Series, maxaccepts: int=10,
perc_identity: float=0.8, strand: str='both', evalue: float=0.001,
min_consensus: float=0.51,
unassignable_label: str=_get_default_unassignable_label(),
num_threads: str=1) -> pd.DataFrame:

def classify_consensus_blast(query: DNAFASTAFormat,
reference_reads: DNAFASTAFormat,
reference_taxonomy: pd.Series, maxaccepts: int=10,
perc_identity: float=0.8, strand: str='both',
evalue: float=0.001, min_consensus: float=0.51,
unassignable_label: str=
_get_default_unassignable_label(),
num_threads: str=1) -> pd.DataFrame:
perc_identity = perc_identity * 100
seqs_fp = str(query)
ref_fp = str(reference_reads)
Expand All @@ -38,7 +39,7 @@ def blast(query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,


plugin.methods.register_function(
function=blast,
function=classify_consensus_blast,
inputs={'query': FeatureData[Sequence],
'reference_reads': FeatureData[Sequence],
'reference_taxonomy': FeatureData[Taxonomy]},
Expand Down
5 changes: 2 additions & 3 deletions q2_feature_classifier/_cutter.py
Expand Up @@ -115,8 +115,7 @@ def _gen_reads(sequences: DNAIterator, f_primer: str, r_primer: str,
continue
if length > 0:
amp = amp[:length]
if len(amp) > 0:
yield amp
yield amp


def extract_reads(sequences: DNAIterator, f_primer: str, r_primer: str,
Expand Down Expand Up @@ -158,6 +157,6 @@ def extract_reads(sequences: DNAIterator, f_primer: str, r_primer: str,
'r_primer': Str,
'identity': Float},
outputs=[('reads', FeatureData[Sequence])],
name='Extract reads from reference.',
name='Extract reads from reference',
description='Extract sequencing-like reads from a reference database.'
)
3 changes: 1 addition & 2 deletions q2_feature_classifier/_skl.py
Expand Up @@ -38,8 +38,6 @@ def _extract_reads(reads):

def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
pre_dispatch='2*n_jobs', confidence=-1.):
if confidence >= 0.:
chunk_size = chunk_size // 101 + 1
return (m for c in Parallel(n_jobs=n_jobs, batch_size=1,
pre_dispatch=pre_dispatch)
(delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
Expand Down Expand Up @@ -91,6 +89,7 @@ def _predict_chunk_with_conf(pipeline, separator, confidence, chunk):


def _chunks(reads, chunk_size):
reads = iter(reads)
while True:
chunk = list(islice(reads, chunk_size))
if len(chunk) == 0:
Expand Down
18 changes: 10 additions & 8 deletions q2_feature_classifier/_vsearch.py
Expand Up @@ -15,13 +15,15 @@
_get_default_unassignable_label)


def vsearch(query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,
reference_taxonomy: pd.Series, maxaccepts: int=10,
perc_identity: int=0.8, strand: str='both',
min_consensus: float=0.51,
unassignable_label: str=_get_default_unassignable_label(),
threads: str=1) -> pd.DataFrame:

def classify_consensus_vsearch(query: DNAFASTAFormat,
reference_reads: DNAFASTAFormat,
reference_taxonomy: pd.Series,
maxaccepts: int=10,
perc_identity: int=0.8, strand: str='both',
min_consensus: float=0.51,
unassignable_label: str=
_get_default_unassignable_label(),
threads: str=1) -> pd.DataFrame:
seqs_fp = str(query)
ref_fp = str(reference_reads)
cmd = ['vsearch', '--usearch_global', seqs_fp, '--id', str(perc_identity),
Expand All @@ -35,7 +37,7 @@ def vsearch(query: DNAFASTAFormat, reference_reads: DNAFASTAFormat,


plugin.methods.register_function(
function=vsearch,
function=classify_consensus_vsearch,
inputs={'query': FeatureData[Sequence],
'reference_reads': FeatureData[Sequence],
'reference_taxonomy': FeatureData[Taxonomy]},
Expand Down
65 changes: 49 additions & 16 deletions q2_feature_classifier/classifier.py
Expand Up @@ -10,12 +10,14 @@
import importlib
import inspect
import warnings
from itertools import chain, islice

import pandas as pd
from qiime2.plugin import Int, Str, Float, Bool
from qiime2.plugin import Int, Str, Float, Bool, Choices
from q2_types.feature_data import FeatureData, Taxonomy, Sequence, DNAIterator
from sklearn.pipeline import Pipeline
import sklearn
from numpy import median, array

from ._skl import fit_pipeline, predict, _specific_fitters
from ._taxonomic_classifier import TaxonomicClassifier
Expand Down Expand Up @@ -94,8 +96,9 @@ def warn_about_sklearn():
warnings.warn(warning, UserWarning)


def fit_classifier(reference_reads: DNAIterator, reference_taxonomy: pd.Series,
classifier_specification: str) -> Pipeline:
def fit_classifier_sklearn(reference_reads: DNAIterator,
reference_taxonomy: pd.Series,
classifier_specification: str) -> Pipeline:
warn_about_sklearn()
spec = json.loads(classifier_specification)
pipeline = pipeline_from_spec(spec)
Expand All @@ -104,20 +107,41 @@ def fit_classifier(reference_reads: DNAIterator, reference_taxonomy: pd.Series,


plugin.methods.register_function(
function=fit_classifier,
function=fit_classifier_sklearn,
inputs={'reference_reads': FeatureData[Sequence],
'reference_taxonomy': FeatureData[Taxonomy]},
parameters={'classifier_specification': Str},
outputs=[('classifier', TaxonomicClassifier)],
name='Train a scikit-learn classifier.',
name='Train an almost arbitrary scikit-learn classifier',
description='Train a scikit-learn classifier to classify reads.'
)


def classify(reads: DNAIterator, classifier: Pipeline,
chunk_size: int=262144, n_jobs: int=1,
pre_dispatch: str='2*n_jobs', confidence: float=-1.
) -> pd.DataFrame:
def _autodetect_orientation(reads, classifier, n=100,
read_orientation=None):
if read_orientation == 'same':
return reads
if read_orientation == 'reverse-complement':
return (r.reverse_complement() for r in reads)
reads = iter(reads)
first_n_reads = list(islice(reads, n))
result = list(zip(*predict(first_n_reads, classifier, confidence=0.)))
_, _, same_confidence = result
reversed_n_reads = [r.reverse_complement() for r in first_n_reads]
result = list(zip(*predict(reversed_n_reads, classifier, confidence=0.)))
_, _, reverse_confidence = result
if median(array(same_confidence) - array(reverse_confidence)) > 0.:
return chain(first_n_reads, reads)
return chain(reversed_n_reads, (r.reverse_complement() for r in reads))


def classify_sklearn(reads: DNAIterator, classifier: Pipeline,
chunk_size: int=262144, n_jobs: int=1,
pre_dispatch: str='2*n_jobs', confidence: float=-1.,
read_orientation: str=None
) -> pd.DataFrame:
reads = _autodetect_orientation(
reads, classifier, read_orientation=read_orientation)
predictions = predict(reads, classifier, chunk_size=chunk_size,
n_jobs=n_jobs, pre_dispatch=pre_dispatch,
confidence=confidence)
Expand All @@ -133,20 +157,29 @@ def classify(reads: DNAIterator, classifier: Pipeline,


_classify_parameters = {'chunk_size': Int, 'n_jobs': Int, 'pre_dispatch': Str,
'confidence': Float}
'confidence': Float, 'read_orientation':
Str % Choices(['same', 'reverse-complement'])}


plugin.methods.register_function(
function=classify,
function=classify_sklearn,
inputs={'reads': FeatureData[Sequence],
'classifier': TaxonomicClassifier},
parameters=_classify_parameters,
outputs=[('classification', FeatureData[Taxonomy])],
name='Classify reads by taxon.',
name='Pre-fitted sklearn-based taxonomy classifier',
description='Classify reads by taxon using a fitted classifier.',
parameter_descriptions={'confidence': 'Confidence threshold for limiting '
'taxonomic depth. Negative value disables. '
'Currently experimental. USE WITH CAUTION.'}
'taxonomic depth. Negative value disables '
'Currently experimental. USE WITH CAUTION',
'read_orientation': 'Direction of reads with '
'respect to reference sequences. same will cause '
'reads to be classified unchanged; '
'reverse-complement will cause reads to be '
'reversed and complemented prior to '
'classification. Default is to autodetect based on'
' the confidence estimates for the first 100 reads'
}
)


Expand Down Expand Up @@ -205,8 +238,8 @@ def generic_fitter(reference_reads: DNAIterator,
'reference_taxonomy': FeatureData[Taxonomy]},
parameters=parameters,
outputs=[('classifier', TaxonomicClassifier)],
name='Train the ' + name + ' classifier.',
description='Create a ' + name + ' classifier for reads'
name='Train the ' + name + ' classifier',
description='Create a scikit-learn ' + name + ' classifier for reads'
)


Expand Down
11 changes: 6 additions & 5 deletions q2_feature_classifier/custom.py
Expand Up @@ -15,7 +15,7 @@

class LowMemoryMultinomialNB(MultinomialNB):
def __init__(self, alpha=1.0, fit_prior=True, class_prior=None,
chunk_size=-1):
chunk_size=20000):
self.chunk_size = chunk_size
super().__init__(alpha=alpha, fit_prior=fit_prior,
class_prior=class_prior)
Expand All @@ -25,13 +25,14 @@ def fit(self, X, y, sample_weight=None):
return super().fit(X, y, sample_weight=sample_weight)

classes = numpy.unique(y)
for i in range(0, X.shape[1], self.chunk_size):
cX = X[i:i+self.chunk_size]
cy = y[i:i+self.chunk_size]
for i in range(0, X.shape[0], self.chunk_size):
upper = min(i+self.chunk_size, X.shape[0])
cX = X[i:upper]
cy = y[i:upper]
if sample_weight is None:
csample_weight = None
else:
csample_weight = sample_weight[i:i+self.chunk_size]
csample_weight = sample_weight[i:upper]
self.partial_fit(cX, cy, sample_weight=csample_weight,
classes=classes)

Expand Down
62 changes: 58 additions & 4 deletions q2_feature_classifier/tests/test_classifier.py
Expand Up @@ -7,12 +7,16 @@
# ----------------------------------------------------------------------------

import json
import os

from qiime2.sdk import Artifact
from qiime2.plugins import feature_classifier
import pandas as pd
import skbio

from q2_feature_classifier._skl import _specific_fitters
from q2_feature_classifier.classifier import spec_from_pipeline, \
pipeline_from_spec

from . import FeatureClassifierTestPluginBase

Expand Down Expand Up @@ -42,10 +46,10 @@ def test_fit_classifier(self):
{'__type__': 'naive_bayes.MultinomialNB',
'alpha': 0.01}]]
classifier_specification = json.dumps(classifier_specification)
fit_classifier = feature_classifier.methods.fit_classifier
fit_classifier = feature_classifier.methods.fit_classifier_sklearn
result = fit_classifier(reads, self.taxonomy, classifier_specification)

classify = feature_classifier.methods.classify
classify = feature_classifier.methods.classify_sklearn
result = classify(reads, result.classifier)

ref = self.taxonomy.view(pd.Series).to_dict()
Expand All @@ -58,8 +62,8 @@ def test_fit_classifier(self):

def test_fit_specific_classifiers(self):
# specific and general classifiers should produce the same results
gen_fitter = feature_classifier.methods.fit_classifier
classify = feature_classifier.methods.classify
gen_fitter = feature_classifier.methods.fit_classifier_sklearn
classify = feature_classifier.methods.classify_sklearn
reads = Artifact.import_data(
'FeatureData[Sequence]',
self.get_data_path('se-dna-sequences.fasta'))
Expand All @@ -76,3 +80,53 @@ def test_fit_specific_classifiers(self):
sc = result.classification.view(pd.Series).to_dict()
for taxon in gc:
self.assertEqual(gc[taxon], sc[taxon])

def test_pipeline_serialisation(self):
# pipeline inflation and deflation should be inverse operations
for name, spec in _specific_fitters:
pipeline = pipeline_from_spec(spec)
spec_one = spec_from_pipeline(pipeline)
pipeline = pipeline_from_spec(spec_one)
spec_two = spec_from_pipeline(pipeline)
self.assertEqual(spec_one, spec_two)

def test_classify(self):
# test read direction detection and parallel classification
classify = feature_classifier.methods.classify_sklearn
seq_path = self.get_data_path('se-dna-sequences.fasta')
reads = Artifact.import_data('FeatureData[Sequence]', seq_path)
raw_reads = skbio.io.read(
seq_path, format='fasta', constructor=skbio.DNA)
rev_path = os.path.join(self.temp_dir.name, 'rev-dna-sequences.fasta')
skbio.io.write((s.reverse_complement() for s in raw_reads),
'fasta', rev_path)
rev_reads = Artifact.import_data('FeatureData[Sequence]', rev_path)

fitter_name = _specific_fitters[0][0]
fitter = getattr(feature_classifier.methods,
'fit_classifier_' + fitter_name)
classifier = fitter(reads, self.taxonomy).classifier

result = classify(reads, classifier)
fc = result.classification.view(pd.Series).to_dict()
result = classify(rev_reads, classifier)
rc = result.classification.view(pd.Series).to_dict()

for taxon in fc:
self.assertEqual(fc[taxon], rc[taxon])

result = classify(reads, classifier,
read_orientation='same')
fc = result.classification.view(pd.Series).to_dict()
result = classify(rev_reads, classifier,
read_orientation='reverse-complement')
rc = result.classification.view(pd.Series).to_dict()

for taxon in fc:
self.assertEqual(fc[taxon], rc[taxon])

result = classify(reads, classifier, chunk_size=100, n_jobs=2)
cc = result.classification.view(pd.Series).to_dict()

for taxon in fc:
self.assertEqual(fc[taxon], cc[taxon])
10 changes: 6 additions & 4 deletions q2_feature_classifier/tests/test_consensus_assignment.py
Expand Up @@ -7,8 +7,8 @@
# ----------------------------------------------------------------------------

import pandas as pd
from q2_feature_classifier._blast import blast
from q2_feature_classifier._vsearch import vsearch
from q2_feature_classifier._blast import classify_consensus_blast
from q2_feature_classifier._vsearch import classify_consensus_vsearch
from q2_feature_classifier._consensus_assignment import (
_compute_consensus_annotation,
_compute_consensus_annotations,
Expand All @@ -31,7 +31,8 @@ def setUp(self):
# Make sure blast and vsearch produce expected outputs
# but there is no "right" taxonomy assignment.
def test_blast(self):
result = blast(self.reads, self.reads, self.taxonomy)
result = classify_consensus_blast(self.reads, self.reads,
self.taxonomy)
res = result.Taxon.to_dict()
tax = self.taxonomy.to_dict()
right = 0.
Expand All @@ -40,7 +41,8 @@ def test_blast(self):
self.assertGreater(right/len(res), 0.5)

def test_vsearch(self):
result = vsearch(self.reads, self.reads, self.taxonomy)
result = classify_consensus_vsearch(self.reads, self.reads,
self.taxonomy)
res = result.Taxon.to_dict()
tax = self.taxonomy.to_dict()
right = 0.
Expand Down

0 comments on commit e482731

Please sign in to comment.