In [1]:
import numpy as np
import pandas as pd
from time import time
from ogb.graphproppred import GraphPropPredDataset
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
from featurizers.fingerprints import MorganFingerprint

## Dataset Preparation

In [3]:
# Name of the OGB graph-property-prediction dataset used by the cells below.
dataset_name = "ogbg-molhiv"
# Instantiate the dataset; the object is not kept, only its repr is displayed.
# NOTE(review): presumably this call is here for its side effect of fetching
# the data into ./dataset, which the next cell reads from — confirm.
GraphPropPredDataset(name=dataset_name)

GraphPropPredDataset(41127)

In [4]:
# The on-disk directory uses underscores where the dataset name has dashes.
dataset_dir = dataset_name.replace("-", "_")

# Load the molecule table and pull out the SMILES strings (features) and the
# HIV activity label (target).
dataset = pd.read_csv(f"./dataset/{dataset_dir}/mapping/mol.csv.gz")
X, y = dataset["smiles"], dataset["HIV_active"]

In [5]:
# Number of molecules (rows) in the SMILES column; displayed as the cell output.
n_molecules = len(X)
n_molecules

41127

In [6]:
# Hold out 20% of the molecules for evaluation; the fixed seed keeps the
# shuffled split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## RDKit usage example

In [7]:
def _rdkit_morgan_matrix(smiles_list, radius=3):
    """Turn SMILES strings into an array of Morgan fingerprints using plain RDKit.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings, one per molecule.
    radius : int, default 3
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the list of RDKit bit vectors, one entry per
        input molecule, in input order.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more SMILES strings. ``Chem.MolFromSmiles``
        returns ``None`` in that case, which would otherwise fail inside the
        fingerprint call with an error that does not name the offending input.
    """
    mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

    unparsable = [smiles for smiles, mol in zip(smiles_list, mols) if mol is None]
    if unparsable:
        raise ValueError(
            f"RDKit could not parse {len(unparsable)} SMILES string(s), "
            f"e.g. {unparsable[0]!r}"
        )

    fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, radius) for mol in mols]
    return np.array(fingerprints)


# Time the whole SMILES -> fingerprint array conversion for both splits.
start = time()

X_train_rdkit = _rdkit_morgan_matrix(X_train)
X_test_rdkit = _rdkit_morgan_matrix(X_test)

end = time()
print("time: ", end - start, "s")



time:  22.868861198425293 s


In [8]:
# Random forest on the RDKit fingerprints; the seed is fixed for repeatability
# and n_jobs=-1 is passed through to scikit-learn.
clf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_train_rdkit, y_train)

In [9]:
# Print the label, then let the cell display the score computed from
# column 1 of predict_proba on the held-out RDKit features.
print("ROC AUC score: ", end="")
proba_rdkit = clf.predict_proba(X_test_rdkit)[:, 1]
roc_auc_score(y_test, proba_rdkit)

ROC AUC score: 

0.8312466320411838

## emf usage example

In [10]:
# Time the same radius-3 Morgan featurization done through the
# MorganFingerprint transformer, applied directly to the SMILES strings.
start = time()

emf_morgan = MorganFingerprint(radius=3, n_jobs=-1)
X_train_emf, X_test_emf = (
    emf_morgan.transform(smiles_split) for smiles_split in (X_train, X_test)
)

end = time()
print("time: ", end - start, "s")

time:  2.1118969917297363 s


In [11]:
# Same random forest configuration as before, now trained on the features
# produced by MorganFingerprint.
clf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_train_emf, y_train)

In [12]:
# Print the label, then let the cell display the score computed from
# column 1 of predict_proba on the held-out MorganFingerprint features.
print("ROC AUC score: ", end="")
proba_emf = clf.predict_proba(X_test_emf)[:, 1]
roc_auc_score(y_test, proba_emf)

ROC AUC score: 

0.8312466320411838

## scikit-learn Pipeline usage example

In [13]:
# Chain the fingerprint transformer and the classifier so the pipeline can be
# fitted on raw SMILES strings.
pipeline_steps = [
    ("FingerprintEncoder", MorganFingerprint(n_jobs=-1, radius=3)),
    ("RandomForest", RandomForestClassifier(random_state=42, n_jobs=-1)),
]
clf = Pipeline(pipeline_steps)
clf.fit(X_train, y_train)

In [14]:
# Print the label, then let the cell display the score; the pipeline takes the
# held-out SMILES strings directly.
print("ROC AUC score: ", end="")
proba_pipeline = clf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, proba_pipeline)

ROC AUC score: 

0.8312466320411838