In [3]:
# !pip3 install -U torch_geometric
# !pip3 install -U torch-scatter
# !pip3 install -U torch-sparse
# !pip3 install -U pandas
# !pip3 install -U numpy
# !pip3 install -U matplotlib
# !pip3 install -U scikit-learn
# !pip3 install -U PyTDC
# !pip3 install -U pre-commit
# !pip3 install -U plotly
# !pip3 install -U tdc
# !pip3 install -U ogb
# !pip3 install ogb==1.2.6

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rdkit
from scipy.sparse import csr_array

from ogb.graphproppred import PygGraphPropPredDataset
from ogb.graphproppred import GraphPropPredDataset
import rdkit.Chem.rdFingerprintGenerator as fpgens
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from tdc.single_pred import Tox
from rdkit.Chem import AllChem
from rdkit import Chem

from time import time



In [5]:
from featurizers.fingerprints import (
    MorganFingerprint,
    AtomPairFingerprint,
    TopologicalTorsionFingerprint,
    MACCSKeysFingerprint,
    ERGFingerprint,
)


from rdkit.Chem import MolFromSmiles

In [6]:
# Name of the OGB dataset; the loading cell below turns it into the directory
# name of the SMILES mapping file by replacing "-" with "_".
dataset_name = "ogbg-molhiv"
# NOTE(review): presumably uncommented once to download the dataset — confirm.
# GraphPropPredDataset(name=dataset_name)

In [7]:
# Load the molecule table shipped with the dataset and pull out the SMILES
# strings (X) and the HIV-activity labels (y).
dataset = pd.read_csv(
    f"../dataset/{dataset_name.replace('-', '_')}/mapping/mol.csv.gz"
)
X, y = dataset["smiles"], dataset["HIV_active"]

In [8]:
# Total number of SMILES strings; the benchmarks time growing fractions of it.
n_molecules = len(X)
n_molecules

41127

In [9]:
# Number of equally spaced dataset fractions (1/N_SPLITS ... 1) that get timed.
N_SPLITS = 5
# Number of timed repetitions averaged for each dataset fraction.
N_REPEATS = 5
# Values passed as `n_jobs` to the emf transformers; -1 is plotted as "all jobs".
N_CORES = [1, 2, 4, -1]
# Fingerprint variants to benchmark: bit (False) and count (True).
COUNT_TYPES = [False, True]
# Output containers to benchmark: dense (False) and sparse (True).
SPARSE_TYPES = [False, True]

In [10]:
def get_times_emf(transformer_function, **kwargs):
    """Measure the mean wall-clock time of an emf transformer on growing subsets.

    Parameters
    ----------
    transformer_function : callable
        Called once as ``transformer_function(**kwargs)``; the returned object
        must expose a ``transform`` method.
    **kwargs
        Forwarded unchanged to ``transformer_function``.

    Returns
    -------
    numpy.ndarray
        One entry per data fraction (``N_SPLITS`` entries): the mean over
        ``N_REPEATS`` runs of the time spent in ``transform`` on the first
        ``int(n_molecules * fraction)`` elements of the global ``X``.
    """
    transformer = transformer_function(**kwargs)
    fractions = np.linspace(0, 1, N_SPLITS + 1)[1:]  # drop the empty 0% split

    mean_times = []
    for fraction in fractions:
        subset = X[: int(n_molecules * fraction)]
        durations = []
        for _ in range(N_REPEATS):
            started = time()
            # Only the elapsed time matters; the transformed data is discarded.
            X_transformed = transformer.transform(subset)
            durations.append(time() - started)
        mean_times.append(sum(durations) / N_REPEATS)
    return np.array(mean_times)

In [31]:
def get_generator_times_rdkit(generator, count, sparse):
    """Measure the mean wall-clock time of an RDKit fingerprint generator.

    Parameters
    ----------
    generator
        Object exposing ``GetCountFingerprint`` and ``GetFingerprint``; each is
        called with the molecule parsed from a SMILES string.
    count : bool
        If truthy, use ``GetCountFingerprint(...).ToList()``; otherwise use
        ``GetFingerprint(...)``.
    sparse : bool
        If truthy, collect the fingerprints with ``csr_array``; otherwise with
        ``np.array``.

    Returns
    -------
    numpy.ndarray
        One entry per data fraction (``N_SPLITS`` entries): the mean over
        ``N_REPEATS`` runs of the time needed to parse, fingerprint and collect
        the first ``int(n_molecules * fraction)`` elements of the global ``X``.
    """
    # Pick the per-molecule function once so the timed loop has no branching on `count`.
    if count:
        def fingerprint(smiles):
            return generator.GetCountFingerprint(MolFromSmiles(smiles)).ToList()
    else:
        def fingerprint(smiles):
            return generator.GetFingerprint(MolFromSmiles(smiles))

    collect = csr_array if sparse else np.array

    mean_times = []
    for fraction in np.linspace(0, 1, N_SPLITS + 1)[1:]:
        subset = X[: int(n_molecules * fraction)]
        durations = []
        for _ in range(N_REPEATS):
            started = time()
            X_transformed = collect([fingerprint(smiles) for smiles in subset])
            durations.append(time() - started)
        mean_times.append(sum(durations) / N_REPEATS)
    return np.array(mean_times)


def get_times_rdkit(func, sparse = False, **kwargs):
    """Measure the mean wall-clock time of a per-molecule RDKit function.

    Parameters
    ----------
    func : callable
        Called as ``func(MolFromSmiles(smiles), **kwargs)`` for every SMILES
        string in the timed subset.
    sparse : bool, default False
        If truthy, collect the results with ``csr_array``; otherwise with
        ``np.array``.
    **kwargs
        Forwarded unchanged to ``func``.

    Returns
    -------
    numpy.ndarray
        One entry per data fraction (``N_SPLITS`` entries): the mean over
        ``N_REPEATS`` runs of the time needed to process the first
        ``int(n_molecules * fraction)`` elements of the global ``X``.
    """
    collect = csr_array if sparse else np.array

    mean_times = []
    for fraction in np.linspace(0, 1, N_SPLITS + 1)[1:]:
        subset = X[: int(n_molecules * fraction)]
        durations = []
        for _ in range(N_REPEATS):
            started = time()
            X_transformed = collect(
                [func(MolFromSmiles(smiles), **kwargs) for smiles in subset]
            )
            durations.append(time() - started)
        mean_times.append(sum(durations) / N_REPEATS)
    return np.array(mean_times)

In [32]:
def plot_results(y_emf, y_rdkit, title="", sparse=None, count=None):
    """Plot emf and RDKit timing curves and save the figure as a PNG.

    Parameters
    ----------
    y_emf : sequence
        Four timing series, in the order of the labels "1 job", "2 job",
        "4 job" and "all jobs"; each has one value per data fraction.
    y_rdkit : sequence
        RDKit timing series with one value per data fraction.
    title : str, default ""
        Base plot title. Suffixes are appended from ``sparse`` and ``count``.
        The final title, with spaces replaced by underscores and ".png"
        appended, is also the output file name.
    sparse : bool or None, default None
        If truthy, " sparse" is appended to the title.
    count : bool or None, default None
        If not None, " count" (truthy) or " bit" (falsy) is appended.

    Side effects
    ------------
    Sets ``plt.rcParams["font.size"]`` globally and writes a PNG file into the
    current working directory. The figure is left open.
    """
    if sparse:
        title += " sparse"

    if count is not None:
        title += " count" if count else " bit"

    # Named to avoid shadowing the global `X` that holds the SMILES strings.
    n_fingerprints = n_molecules * np.linspace(0, 1, N_SPLITS + 1)[1:]

    plt.rcParams["font.size"] = 20
    fig = plt.figure(figsize=(15, 10))
    ax1 = fig.add_subplot()
    ax1.set_title(title)

    emf_labels = (
        "emf time - 1 job",
        "emf time - 2 job",
        "emf time - 4 job",
        "emf time - all jobs",
    )
    # Index explicitly so a missing series still raises instead of being skipped.
    for position, label in enumerate(emf_labels):
        ax1.plot(n_fingerprints, y_emf[position], label=label)
    ax1.plot(n_fingerprints, y_rdkit, label="rdkit time")

    ax1.set_ylabel("Time of computation")
    ax1.set_xlabel("Number of fingerprints")

    ax1.set_xlim(n_molecules * 0.1, n_molecules * 1.1)
    ax1.set_ylim(bottom=0)

    ax1.legend(loc="upper left")
    fig.savefig(title.replace(" ", "_") + '.png')
    # plt.show()

## Morgan Fingerprint

In [22]:
# emf Morgan timings, indexed as [count][sparse][position in N_CORES].
morgan_emf_times = [
    [
        [
            get_times_emf(
                MorganFingerprint, sparse=use_sparse, count=use_count, n_jobs=jobs
            )
            for jobs in N_CORES
        ]
        for use_sparse in SPARSE_TYPES
    ]
    for use_count in COUNT_TYPES
]

KeyboardInterrupt: 

In [None]:
# NOTE(review): the generator is built with RDKit defaults — confirm they match
# the defaults of MorganFingerprint so the comparison is like-for-like.
generator = fpgens.GetMorganGenerator()

# RDKit Morgan timings, indexed as [count][sparse].
morgan_rdkit_times = [
    [
        get_generator_times_rdkit(generator, sparse=use_sparse, count=use_count)
        for use_sparse in SPARSE_TYPES
    ]
    for use_count in COUNT_TYPES
]

In [None]:
# One plot per (count, sparse) combination. enumerate yields (index, value), and
# the flags are passed by keyword because plot_results takes `sparse` before
# `count`; passing them positionally in the other order mislabels the plots.
for i, count in enumerate(COUNT_TYPES):
    for j, sparse in enumerate(SPARSE_TYPES):
        plot_results(
            morgan_emf_times[i][j],
            morgan_rdkit_times[i][j],
            "Morgan Fingerprint",
            sparse=sparse,
            count=count,
        )

## Atom Pair

In [None]:
# emf Atom Pair timings, indexed as [count][sparse][position in N_CORES].
atom_pair_emf_times = [
    [
        [
            get_times_emf(
                AtomPairFingerprint, sparse=use_sparse, count=use_count, n_jobs=jobs
            )
            for jobs in N_CORES
        ]
        for use_sparse in SPARSE_TYPES
    ]
    for use_count in COUNT_TYPES
]

In [None]:
# NOTE(review): the generator is built with RDKit defaults — confirm they match
# the defaults of AtomPairFingerprint so the comparison is like-for-like.
generator = fpgens.GetAtomPairGenerator()

# RDKit Atom Pair timings, indexed as [count][sparse].
atom_pair_rdkit_times = [
    [
        get_generator_times_rdkit(generator, sparse=use_sparse, count=use_count)
        for use_sparse in SPARSE_TYPES
    ]
    for use_count in COUNT_TYPES
]

In [None]:
# One plot per (count, sparse) combination. enumerate yields (index, value), and
# the flags are passed by keyword because plot_results takes `sparse` before
# `count`; passing them positionally in the other order mislabels the plots.
for i, count in enumerate(COUNT_TYPES):
    for j, sparse in enumerate(SPARSE_TYPES):
        plot_results(
            atom_pair_emf_times[i][j],
            atom_pair_rdkit_times[i][j],
            "Atom Pair Fingerprint",
            sparse=sparse,
            count=count,
        )

## Topological Torsion

In [None]:
# emf Topological Torsion timings, indexed as [count][sparse][position in N_CORES].
# The transformer must be TopologicalTorsionFingerprint so these timings match
# the RDKit topological-torsion baseline they are plotted against.
topological_torsion_emf_times = [
    [
        [
            get_times_emf(
                TopologicalTorsionFingerprint,
                sparse=sparse,
                count=count,
                n_jobs=n_cores,
            )
            for n_cores in N_CORES
        ]
        for sparse in SPARSE_TYPES
    ]
    for count in COUNT_TYPES
]

In [None]:
# NOTE(review): the generator is built with RDKit defaults — confirm they match
# the defaults of the emf transformer so the comparison is like-for-like.
generator = fpgens.GetTopologicalTorsionGenerator()

# RDKit Topological Torsion timings, indexed as [count][sparse].
topological_torsion_rdkit_times = [
    [
        get_generator_times_rdkit(generator, sparse=use_sparse, count=use_count)
        for use_sparse in SPARSE_TYPES
    ]
    for use_count in COUNT_TYPES
]

In [None]:
# One plot per (count, sparse) combination. enumerate yields (index, value), and
# the flags are passed by keyword because plot_results takes `sparse` before
# `count`; passing them positionally in the other order mislabels the plots.
for i, count in enumerate(COUNT_TYPES):
    for j, sparse in enumerate(SPARSE_TYPES):
        plot_results(
            topological_torsion_emf_times[i][j],
            topological_torsion_rdkit_times[i][j],
            "Topological Torsion Fingerprint",
            sparse=sparse,
            count=count,
        )

## MACCS Keys

In [36]:
# emf MACCS keys timings, indexed as [sparse][position in N_CORES].
MACCSKeys_emf_times = [
    [
        get_times_emf(MACCSKeysFingerprint, n_jobs=jobs, sparse=use_sparse)
        for jobs in N_CORES
    ]
    for use_sparse in SPARSE_TYPES
]



KeyboardInterrupt: 

In [None]:
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint

# RDKit MACCS keys timings: one array of mean times per entry of SPARSE_TYPES.
MACCSKeys_rdkit_times = [
    get_times_rdkit(GetMACCSKeysFingerprint, sparse=use_sparse)
    for use_sparse in SPARSE_TYPES
]

In [None]:
# enumerate yields (index, value): index the timing lists with the position and
# pass the actual flag to plot_results, so this stays correct if SPARSE_TYPES
# is reordered.
# NOTE(review): the title looks like a typo for "MACCSKeys"; it is left unchanged
# because plot_results also derives the PNG file name from it.
for i, sparse in enumerate(SPARSE_TYPES):
    plot_results(
        MACCSKeys_emf_times[i],
        MACCSKeys_rdkit_times[i],
        "MACCKeys fingerprint",
        count=None,
        sparse=sparse,
    )

## ERG Fingerprint

In [35]:
# emf ERG timings, indexed as [sparse][position in N_CORES].
ERG_emf_times = [
    [
        get_times_emf(ERGFingerprint, n_jobs=jobs, sparse=use_sparse)
        for jobs in N_CORES
    ]
    for use_sparse in SPARSE_TYPES
]

KeyboardInterrupt: 

In [None]:
from rdkit.Chem.rdReducedGraphs import GetErGFingerprint

# RDKit ERG timings: one array of mean times per entry of SPARSE_TYPES.
ERG_rdkit_times = [
    get_times_rdkit(GetErGFingerprint, sparse=use_sparse)
    for use_sparse in SPARSE_TYPES
]

In [None]:
# enumerate yields (index, value): index the timing lists with the position and
# pass the actual flag to plot_results, so this stays correct if SPARSE_TYPES
# is reordered.
for i, sparse in enumerate(SPARSE_TYPES):
    plot_results(
        ERG_emf_times[i],
        ERG_rdkit_times[i],
        "ERG fingerprint",
        count=None,
        sparse=sparse,
    )