In [1]:
import pandas as pd
from ogb.graphproppred import GraphPropPredDataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import lightgbm as lgb

In [2]:
from featurizers.fingerprints import (
    MHFP,
    AtomPairFingerprint,
    ERGFingerprint,
    MACCSKeysFingerprint,
    MAP4Fingerprint,
    MorganFingerprint,
    TopologicalTorsionFingerprint,
)

### Data preparation

In [3]:
dataset_name = "ogbg-molhiv"

# The dataset object itself is discarded; it is instantiated only for its side
# effects -- presumably fetching the raw files into ../dataset (TODO confirm).
GraphPropPredDataset(root="../dataset", name=dataset_name)

# The on-disk folder uses underscores where the dataset name uses hyphens.
dataset = pd.read_csv(
    f"../dataset/{dataset_name.replace('-', '_')}/mapping/mol.csv.gz"
)

In [4]:
# Model input is the `smiles` column; the target is the `HIV_active` column.
X, y = dataset["smiles"], dataset["HIV_active"]

In [5]:
# Number of entries in X; the bare name on the last line displays it.
n_molecules = len(X)
n_molecules

41127

In [6]:
# Shuffled 80/20 hold-out split with a fixed seed; the cells below all train
# and score on these same four objects.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## Morgan Fingerprint

In [7]:
# Three pipelines sharing the same first step (Morgan fingerprint, radius 3)
# and differing only in the final estimator.
RF_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MorganFingerprint(radius=3, n_jobs=-1)),
        ("RandomForest", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)
LogReg_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MorganFingerprint(radius=3, n_jobs=-1)),
        (
            "LogReg",
            LogisticRegression(penalty=None, class_weight="balanced", n_jobs=-1),
        ),
    ]
)
LGBM_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MorganFingerprint(radius=3, n_jobs=-1)),
        ("LGBM", lgb.LGBMClassifier(n_jobs=-1, random_state=42, verbose=0)),
    ]
)

# Fit and score in report order. The label is printed before scoring, and
# column 1 of predict_proba is what gets passed to roc_auc_score.
for report_prefix, clf_pipeline in (
    ("ROC AUC score for Random Forest       : ", RF_clf),
    ("ROC AUC score for Logistic Regression : ", LogReg_clf),
    ("ROC AUC score for LGBM                : ", LGBM_clf),
):
    clf_pipeline.fit(X_train, y_train)
    print(report_prefix, end="")
    class1_proba = clf_pipeline.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, class1_proba))

ROC AUC score for Random Forest       : 0.8312466320411838
ROC AUC score for Logistic Regression : 0.7697658045116741
ROC AUC score for LGBM                : 0.8068905305631249


## Atom Pair Fingerprint

In [8]:
# Three pipelines sharing the same first step (atom pair fingerprint) and
# differing only in the final estimator.
RF_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", AtomPairFingerprint(n_jobs=-1)),
        ("RandomForest", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)
LogReg_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", AtomPairFingerprint(n_jobs=-1)),
        (
            "LogReg",
            LogisticRegression(penalty=None, class_weight="balanced", n_jobs=-1),
        ),
    ]
)
LGBM_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", AtomPairFingerprint(n_jobs=-1)),
        ("LGBM", lgb.LGBMClassifier(n_jobs=-1, random_state=42, verbose=0)),
    ]
)

# Fit and score in report order. The label is printed before scoring, and
# column 1 of predict_proba is what gets passed to roc_auc_score.
for report_prefix, clf_pipeline in (
    ("ROC AUC score for Random Forest       : ", RF_clf),
    ("ROC AUC score for Logistic Regression : ", LogReg_clf),
    ("ROC AUC score for LGBM                : ", LGBM_clf),
):
    clf_pipeline.fit(X_train, y_train)
    print(report_prefix, end="")
    class1_proba = clf_pipeline.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, class1_proba))

ROC AUC score for Random Forest       : 0.8163811147163397
ROC AUC score for Logistic Regression : 0.7398982467717491
ROC AUC score for LGBM                : 0.8174743610485823


## Topological Torsion Fingerprint

In [9]:
# Three pipelines sharing the same first step (topological torsion
# fingerprint) and differing only in the final estimator.
RF_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", TopologicalTorsionFingerprint(n_jobs=-1)),
        ("RandomForest", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)
LogReg_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", TopologicalTorsionFingerprint(n_jobs=-1)),
        (
            "LogReg",
            LogisticRegression(penalty=None, class_weight="balanced", n_jobs=-1),
        ),
    ]
)
LGBM_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", TopologicalTorsionFingerprint(n_jobs=-1)),
        ("LGBM", lgb.LGBMClassifier(n_jobs=-1, random_state=42, verbose=0)),
    ]
)

# Fit and score in report order. The label is printed before scoring, and
# column 1 of predict_proba is what gets passed to roc_auc_score.
for report_prefix, clf_pipeline in (
    ("ROC AUC score for Random Forest       : ", RF_clf),
    ("ROC AUC score for Logistic Regression : ", LogReg_clf),
    ("ROC AUC score for LGBM                : ", LGBM_clf),
):
    clf_pipeline.fit(X_train, y_train)
    print(report_prefix, end="")
    class1_proba = clf_pipeline.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, class1_proba))

ROC AUC score for Random Forest       : 0.8364287767743437
ROC AUC score for Logistic Regression : 0.7664591006122096
ROC AUC score for LGBM                : 0.8347883764708081


## MACCS Keys Fingerprint

In [10]:
# Three pipelines sharing the same first step (MACCS keys fingerprint) and
# differing only in the final estimator.
RF_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MACCSKeysFingerprint(n_jobs=-1)),
        ("RandomForest", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)
LogReg_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MACCSKeysFingerprint(n_jobs=-1)),
        (
            "LogReg",
            LogisticRegression(penalty=None, class_weight="balanced", n_jobs=-1),
        ),
    ]
)
LGBM_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MACCSKeysFingerprint(n_jobs=-1)),
        ("LGBM", lgb.LGBMClassifier(n_jobs=-1, random_state=42, verbose=0)),
    ]
)

# Fit and score in report order. The label is printed before scoring, and
# column 1 of predict_proba is what gets passed to roc_auc_score.
for report_prefix, clf_pipeline in (
    ("ROC AUC score for Random Forest       : ", RF_clf),
    ("ROC AUC score for Logistic Regression : ", LogReg_clf),
    ("ROC AUC score for LGBM                : ", LGBM_clf),
):
    clf_pipeline.fit(X_train, y_train)
    print(report_prefix, end="")
    class1_proba = clf_pipeline.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, class1_proba))

ROC AUC score for Random Forest       : 0.8071400089939629
ROC AUC score for Logistic Regression : 0.756602685534455
ROC AUC score for LGBM                : 0.8286703160583851


## ERG Fingerprint

In [11]:
# Three pipelines sharing the same first step (ERG fingerprint) and differing
# only in the final estimator.
RF_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", ERGFingerprint(n_jobs=-1)),
        ("RandomForest", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)
LogReg_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", ERGFingerprint(n_jobs=-1)),
        (
            "LogReg",
            LogisticRegression(penalty=None, class_weight="balanced", n_jobs=-1),
        ),
    ]
)
LGBM_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", ERGFingerprint(n_jobs=-1)),
        ("LGBM", lgb.LGBMClassifier(n_jobs=-1, random_state=42, verbose=0)),
    ]
)

# Fit and score in report order. The label is printed before scoring, and
# column 1 of predict_proba is what gets passed to roc_auc_score.
for report_prefix, clf_pipeline in (
    ("ROC AUC score for Random Forest       : ", RF_clf),
    ("ROC AUC score for Logistic Regression : ", LogReg_clf),
    ("ROC AUC score for LGBM                : ", LGBM_clf),
):
    clf_pipeline.fit(X_train, y_train)
    print(report_prefix, end="")
    class1_proba = clf_pipeline.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, class1_proba))

ROC AUC score for Random Forest       : 0.79627973517493
ROC AUC score for Logistic Regression : 0.7451256161055633
ROC AUC score for LGBM                : 0.8028245629456375


## MAP4 Fingerprint

In [12]:
# Three pipelines sharing the same first step (MAP4 fingerprint) and differing
# only in the final estimator.
RF_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MAP4Fingerprint(n_jobs=-1)),
        ("RandomForest", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)
LogReg_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MAP4Fingerprint(n_jobs=-1)),
        (
            "LogReg",
            LogisticRegression(penalty=None, class_weight="balanced", n_jobs=-1),
        ),
    ]
)
LGBM_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MAP4Fingerprint(n_jobs=-1)),
        ("LGBM", lgb.LGBMClassifier(n_jobs=-1, random_state=42, verbose=0)),
    ]
)

# Fit and score in report order. The label is printed before scoring, and
# column 1 of predict_proba is what gets passed to roc_auc_score.
for report_prefix, clf_pipeline in (
    ("ROC AUC score for Random Forest       : ", RF_clf),
    ("ROC AUC score for Logistic Regression : ", LogReg_clf),
    ("ROC AUC score for LGBM                : ", LGBM_clf),
):
    clf_pipeline.fit(X_train, y_train)
    print(report_prefix, end="")
    class1_proba = clf_pipeline.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, class1_proba))

ROC AUC score for Random Forest       : 0.7027537747679001
ROC AUC score for Logistic Regression : 0.6280580216763847
ROC AUC score for LGBM                : 0.6817602178933999


## MHFP

In [13]:
# Three pipelines sharing the same first step (MHFP encoder) and differing
# only in the final estimator.
RF_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MHFP(n_jobs=-1)),
        ("RandomForest", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)
LogReg_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MHFP(n_jobs=-1)),
        (
            "LogReg",
            LogisticRegression(penalty=None, class_weight="balanced", n_jobs=-1),
        ),
    ]
)
LGBM_clf = Pipeline(
    steps=[
        ("FingerprintEncoder", MHFP(n_jobs=-1)),
        ("LGBM", lgb.LGBMClassifier(n_jobs=-1, random_state=42, verbose=0)),
    ]
)

# Fit and score in report order. The label is printed before scoring, and
# column 1 of predict_proba is what gets passed to roc_auc_score.
for report_prefix, clf_pipeline in (
    ("ROC AUC score for Random Forest       : ", RF_clf),
    ("ROC AUC score for Logistic Regression : ", LogReg_clf),
    ("ROC AUC score for LGBM                : ", LGBM_clf),
):
    clf_pipeline.fit(X_train, y_train)
    print(report_prefix, end="")
    class1_proba = clf_pipeline.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, class1_proba))

ROC AUC score for Random Forest       : 0.6953641174854444
ROC AUC score for Logistic Regression : 0.5920745267447248
ROC AUC score for LGBM                : 0.6522716550462798
