In [1]:
import os

from phytebyte import PhyteByte
from phytebyte.bioactive_cmpd.sources import ChemblBioactiveCompoundSource
from phytebyte.bioactive_cmpd.target_input import GeneTargetsInput
from phytebyte.fingerprinters import Fingerprinter

In [2]:
# ChEMBL database URL taken from the environment; os.environ[...] raises
# KeyError when CHEMBL_DB_URL is not set.
chembl_db_url = os.environ['CHEMBL_DB_URL']
# Bioactive-compound source constructed from that URL.
source = ChemblBioactiveCompoundSource(chembl_db_url)
# Target input built from a one-element list, 'HMGCR' (presumably a gene
# symbol, going by the class name — confirm against GeneTargetsInput).
target_input = GeneTargetsInput(['HMGCR'])

In [3]:
# Build the PhyteByte object from the compound source and target input.
pb = PhyteByte(source, target_input)
# The negative sampler ('Tanimoto') and the positive clusterer ('abc') are each
# handed their own newly created 'daylight' Fingerprinter.
pb.set_negative_sampler('Tanimoto', Fingerprinter.create('daylight'))
pb.set_positive_clusterer('abc', Fingerprinter.create('daylight'))
# NOTE(review): set_fingerprinter receives the name 'daylight' rather than a
# Fingerprinter instance, unlike the two calls above — confirm that is the
# expected argument type.
pb.set_fingerprinter('daylight')

In [9]:
%%time

# Evaluate 'Random Forest' models and keep the returned scores in f1_scores.
# NOTE(review): neg_sample_size_factor=5 presumably requests five negatives per
# positive (the recorded run reports 231 pos and 1155 neg samples); what
# true_threshold=0.8 controls is not visible here — confirm against
# PhyteByte.evaluate_models.
f1_scores = pb.evaluate_models('Random Forest', neg_sample_size_factor=5, true_threshold=0.8)

Found '231' pos samples.
Found '1' clusters.
Found '1155' neg samples
F1: [0.935251798561151]


CPU times: user 53.7 s, sys: 323 ms, total: 54 s
Wall time: 3min 15s


## What part of the deserialization/encoding process is the bottleneck?

The per-stage timings below are meant to inform the caching strategy.

In [36]:
from phytebyte.food_cmpd.sources.foodb import FoodbFoodCmpdSource
import numpy as np

# Food-compound source built from the FOODB_URL environment variable;
# os.environ[...] raises KeyError when it is not set.
fcs = FoodbFoodCmpdSource(os.environ['FOODB_URL'])
cmpds = fcs.fetch_all_cmpds()
# Work on at most the first 1000 compounds to keep the benchmark quick.
small_cmpds = cmpds[:1000]
f = Fingerprinter.create('daylight')
# One entry per compound. Entries can be falsy (apparently None when a SMILES
# string cannot be turned into a molecule — TODO confirm), which is why the
# next line filters with `if m`.
small_mols = [f.smiles_to_molecule(c.smiles) for c in small_cmpds]
# Falsy molecules are skipped, so small_fps may be shorter than small_mols.
small_fps = [f._molecule_to_fingerprint(m) for m in small_mols if m]
def fp_to_np(fp):
    """Encode a fingerprint as a fixed-length numpy vector.

    Args:
        fp: Object exposing a ``bits`` attribute that is usable as a numpy
            index (e.g. a list of integer positions), or a falsy value.

    Returns:
        ``None`` when ``fp`` is falsy; otherwise a length-1024 array of zeros
        (numpy's default float dtype) with the positions in ``fp.bits`` set
        to 1.
    """
    if fp:
        encoded = np.zeros(1024)
        # NOTE(review): positions are used as-is for indexing. If ``fp.bits``
        # is 1-based (as pybel fingerprints appear to be — TODO confirm),
        # index 0 is never set and a bit numbered 1024 would raise IndexError.
        encoded[fp.bits] = True
        return encoded
    return None
# Numpy encoding of each fingerprint in small_fps (an entry is None wherever
# fp_to_np returns None for a falsy fingerprint).
small_encoded = [fp_to_np(fp) for fp in small_fps]

In [37]:
# Fetch compounds
%timeit fcs.fetch_all_cmpds()

# Compounds all the way to numpy encoding
%timeit [f.fingerprint_and_encode(c.smiles, 'numpy') for c in small_cmpds]

# Compounds to pybel molecules
%timeit [f.smiles_to_molecule(c.smiles) for c in small_cmpds]

# Molecules to pybel fingerprints
%timeit [f._molecule_to_fingerprint(m) for m in small_mols]

# Fingerprints to numpy arrays
%timeit [fp_to_np(fp) for fp in small_fps]

375 ms ± 4.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.77 s ± 26.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
713 ms ± 14.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


AttributeError: 'NoneType' object has no attribute 'calcfp'