In [1]:
import pandas as pd
from ogb.graphproppred import GraphPropPredDataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from time import time

In [2]:
from featurizers.fingerprints import (
    MHFP,
    AtomPairFingerprint,
    ERGFingerprint,
    MACCSKeysFingerprint,
    MAP4Fingerprint,
    MorganFingerprint,
    TopologicalTorsionFingerprint,
)

### Data preparation

In [3]:
dataset_name = "ogbg-molhiv"
GraphPropPredDataset(name=dataset_name, root="../dataset")
dataset = pd.read_csv(
    f"../dataset/{'_'.join(dataset_name.split('-'))}/mapping/mol.csv.gz"
)

In [4]:
X = dataset["smiles"]
y = dataset["HIV_active"]

In [5]:
n_molecules = X.shape[0]
n_molecules

41127

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [7]:
records = []

fp_names = [
    "Morgan",
    "Atom Pair",
    "Topological Torsion",
    "MACCS Keys",
    "ERG",
    "MAP4",
    "MHFP",
]
fingerprints = [
    MorganFingerprint,
    AtomPairFingerprint,
    TopologicalTorsionFingerprint,
    MACCSKeysFingerprint,
    ERGFingerprint,
    MAP4Fingerprint,
    MHFP,
]

clf_names = ["RF", "LogReg", "LGBM"]
classifiers = [RandomForestClassifier, LogisticRegression, lgb.LGBMClassifier]
classifier_kwargs = [
    {"random_state": 42, "n_jobs": -1},
    {
        "random_state": 42,
        "class_weight": "balanced",
        "penalty": None,
        "n_jobs": -1,
    },
    {"random_state": 42, "n_jobs": -1, "verbose": 0},
]


for fingerprint, fp_name in zip(fingerprints, fp_names):
    records.append({})
    records[-1]["fp_name"] = fp_name
    print(fp_name, "Fingerprint")
    start = time()
    fp_transformer = fingerprint(n_jobs=-1)
    X_fp_train = fp_transformer.transform(X_train)
    X_fp_test = fp_transformer.transform(X_test)
    end = time()
    execution_time = end - start
    print(f" - Time of fingerprints computing : {round(execution_time,2)}s")
    records[-1]["execution_time"] = execution_time
    for classifier, clf_name, clf_kwargs in zip(
        classifiers, clf_names, classifier_kwargs
    ):
        clf = classifier(**clf_kwargs)
        clf.fit(X_fp_train, y_train)
        score = roc_auc_score(y_test, clf.predict_proba(X_fp_test)[:, 1])
        print(f" - - ROC AUC score for {clf_name} : {int(100*score)}%")
        records[-1][clf_name] = score

Morgan Fingerprint
 - Time of fingerprints computing : 2.53s
 - - ROC AUC score for RF : 83%
 - - ROC AUC score for LogReg : 76%
 - - ROC AUC score for LGBM : 80%
Atom Pair Fingerprint
 - Time of fingerprints computing : 0.71s
 - - ROC AUC score for RF : 81%
 - - ROC AUC score for LogReg : 73%
 - - ROC AUC score for LGBM : 81%
Topological Torsion Fingerprint
 - Time of fingerprints computing : 0.78s
 - - ROC AUC score for RF : 83%
 - - ROC AUC score for LogReg : 76%
 - - ROC AUC score for LGBM : 83%
MACCS Keys Fingerprint
 - Time of fingerprints computing : 2.9s
 - - ROC AUC score for RF : 80%
 - - ROC AUC score for LogReg : 75%
 - - ROC AUC score for LGBM : 82%
ERG Fingerprint
 - Time of fingerprints computing : 1.0s
 - - ROC AUC score for RF : 79%
 - - ROC AUC score for LogReg : 74%
 - - ROC AUC score for LGBM : 80%
MAP4 Fingerprint
 - Time of fingerprints computing : 20.21s
 - - ROC AUC score for RF : 70%
 - - ROC AUC score for LogReg : 62%
 - - ROC AUC score for LGBM : 68%
MHFP Fin

In [8]:
df = pd.DataFrame.from_records(records)

In [9]:
df.to_csv("classification_scores.csv")

In [10]:
df

Unnamed: 0,fp_name,execution_time,RF,LogReg,LGBM
0,Morgan,2.531973,0.831247,0.769766,0.806891
1,Atom Pair,0.712641,0.816381,0.739898,0.817474
2,Topological Torsion,0.781641,0.836429,0.766459,0.834788
3,MACCS Keys,2.901503,0.80714,0.756603,0.82867
4,ERG,0.995269,0.79628,0.745126,0.802825
5,MAP4,20.20806,0.702755,0.628058,0.68176
6,MHFP,31.677383,0.695364,0.592075,0.652272
