In [1]:
import deepchem as dc

# Load Tox21 with ECFP fingerprints and a scaffold split, relying on DeepChem's
# default data/cache locations.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: No Metadata found" for the cached train_dir; the following cell
# repeats the load with reload=False and project-local directories.
tasks, datasets, transformers = dc.molnet.load_tox21(featurizer="ECFP", splitter="scaffold")

# Give each split its own name for the cells that follow.
train_dataset, valid_dataset, test_dataset = datasets

print("Tox21 loaded ✅")
print("Train/Valid/Test:", *(len(split) for split in datasets))

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (/opt/miniconda3/envs/toxml/lib/python3.11/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-geometric depende

ValueError: No Metadata found in the path /var/folders/m0/qt24snl11xd8rvmvzqjkmryw0000gn/T/tox21-featurized/CircularFingerprint_size_1024/ScaffoldSplitter/BalancingTransformer/train_dir

In [2]:
import os
import deepchem as dc

# Keep the raw download and the featurized cache inside the project tree.
project_root = os.path.expanduser("~/drug-toxicity-ml")
data_dir = os.path.join(project_root, "data")
save_dir = os.path.join(data_dir, "deepchem_cache")

for directory in (data_dir, save_dir):
    os.makedirs(directory, exist_ok=True)

load_kwargs = dict(
    featurizer="ECFP",
    splitter="scaffold",
    reload=False,  # force rebuild instead of using broken cache
    data_dir=data_dir,
    save_dir=save_dir,
)
tasks, datasets, transformers = dc.molnet.load_tox21(**load_kwargs)

# Give each split its own name for the cells that follow.
train_dataset, valid_dataset, test_dataset = datasets

print("Tox21 loaded ✅")
print("Train/Valid/Test:", *(len(split) for split in datasets))


[20:57:11] Explicit valence for atom # 8 Al, 6, is greater than permitted
Failed to featurize datapoint 1322, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[20:57:12] Explicit valence for atom # 3 Al, 6, is greater than permitted
Failed to featurize datapoint 2290, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[20:57:12] Explicit valence for atom # 4 Al, 6, is greater than permitted
Failed to featurize datapoint 2297, No

Tox21 loaded ✅
Train/Valid/Test: 6258 782 783


In [3]:
# Sanity check: show the shapes of the training features, labels and weights.
for attr_name in ("X", "y", "w"):
    print(getattr(train_dataset, attr_name).shape)


(6258, 1024)
(6258, 12)
(6258, 12)


In [5]:
import numpy as np

def _split_arrays(dataset):
    """Return the ``(X, y, w)`` attributes of one dataset split as a tuple."""
    return dataset.X, dataset.y, dataset.w


# Pull features (X), labels (y) and per-task weights (w) out of each split once,
# so later cells work on plain arrays.
X_train, y_train, w_train = _split_arrays(train_dataset)
X_valid, y_valid, w_valid = _split_arrays(valid_dataset)
X_test, y_test, w_test = _split_arrays(test_dataset)


In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

def safe_auc(model, X, y, w, task_idx, min_samples=10):
    """Return the ROC-AUC of ``model`` on one task, or NaN when it is undefined.

    Only rows whose weight in column ``task_idx`` is non-zero are scored
    (presumably zero weight marks a missing label in Tox21 -- confirm).

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the positive-class score.
    X : array of shape (n_samples, n_features)
        Feature matrix.
    y : array of shape (n_samples, n_tasks)
        Label matrix.
    w : array of shape (n_samples, n_tasks)
        Weight matrix; rows with ``w[:, task_idx] == 0`` are ignored.
    task_idx : int
        Column of ``y`` / ``w`` to evaluate.
    min_samples : int, default 10
        Minimum number of usable rows required to report a score.

    Returns
    -------
    float
        ROC-AUC over the usable rows, or ``np.nan`` if there are fewer than
        ``min_samples`` usable rows or they contain fewer than two classes.
    """
    usable = w[:, task_idx] != 0
    if usable.sum() < min_samples:
        return np.nan

    y_true = y[usable, task_idx]
    # ROC-AUC is undefined when only one class is present.
    if np.unique(y_true).size < 2:
        return np.nan

    y_score = model.predict_proba(X[usable])[:, 1]
    return roc_auc_score(y_true, y_score)

# One independent binary XGBoost model per task; AUCs are collected in task order.
xgb_valid_aucs = []
xgb_test_aucs = []

# Hyperparameters shared by every per-task model.
xgb_params = dict(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    eval_metric="auc",
    tree_method="hist",
    random_state=42,
)

n_tasks = y_train.shape[1]
for t in range(n_tasks):
    # Train only on rows with a non-zero weight for this task.
    mask_tr = w_train[:, t] != 0
    trainable = mask_tr.sum() >= 50 and len(np.unique(y_train[mask_tr, t])) >= 2

    if trainable:
        model = XGBClassifier(**xgb_params)
        model.fit(X_train[mask_tr], y_train[mask_tr, t])
        valid_auc = safe_auc(model, X_valid, y_valid, w_valid, t)
        test_auc = safe_auc(model, X_test, y_test, w_test, t)
    else:
        # Too few rows or a single class: record NaN so list positions stay aligned with tasks.
        valid_auc = np.nan
        test_auc = np.nan

    xgb_valid_aucs.append(valid_auc)
    xgb_test_aucs.append(test_auc)

print("XGBoost results")
print("Mean VALID AUC:", round(float(np.nanmean(xgb_valid_aucs)), 4))
print("Mean TEST  AUC:", round(float(np.nanmean(xgb_test_aucs)), 4))


XGBoost results
Mean VALID AUC: 0.7466
Mean TEST  AUC: 0.7215


In [7]:
import pandas as pd
import os

# Per-task AUC table, ordered from best to worst test AUC.
auc_columns = {
    "task": tasks,
    "xgb_valid_auc": xgb_valid_aucs,
    "xgb_test_auc": xgb_test_aucs,
}
results = pd.DataFrame(auc_columns).sort_values("xgb_test_auc", ascending=False)

# Write the table to <project>/results/, creating the folder if needed.
project_root = os.path.expanduser("~/drug-toxicity-ml")
results_dir = os.path.join(project_root, "results")
out_path = os.path.join(results_dir, "xgboost_auc_results.csv")

os.makedirs(results_dir, exist_ok=True)
results.to_csv(out_path, index=False)

print("Saved to:", out_path)
results


Saved to: /Users/raheemmalik/drug-toxicity-ml/results/xgboost_auc_results.csv


Unnamed: 0,task,xgb_valid_auc,xgb_test_auc
1,NR-AR-LBD,0.82186,0.848543
2,NR-AhR,0.785477,0.814509
10,SR-MMP,0.7792,0.780169
11,SR-p53,0.709459,0.774691
3,NR-Aromatase,0.712213,0.730355
0,NR-AR,0.80468,0.699774
9,SR-HSE,0.764666,0.697345
8,SR-ATAD5,0.711061,0.696258
5,NR-ER-LBD,0.757317,0.682715
4,NR-ER,0.635017,0.666308


In [8]:
import numpy as np

# Re-derive the NaN-ignoring mean AUCs from the saved results table.
for label, column in (
    ("Mean VALID AUC:", "xgb_valid_auc"),
    ("Mean TEST  AUC:", "xgb_test_auc"),
):
    print(label, round(float(np.nanmean(results[column])), 4))


Mean VALID AUC: 0.7466
Mean TEST  AUC: 0.7215


In [9]:
import pandas as pd
import os

# Load the per-task AUC tables for both models from <project>/results/.
project_root = os.path.expanduser("~/drug-toxicity-ml")
lr_path, xgb_path = (
    os.path.join(project_root, "results", file_name)
    for file_name in ("logreg_auc_results.csv", "xgboost_auc_results.csv")
)

lr = pd.read_csv(lr_path)
xgb = pd.read_csv(xgb_path)

# Inner join on task: only tasks present in both tables are compared.
merged = pd.merge(lr, xgb, on="task", how="inner")
merged["test_auc_gain"] = merged["xgb_test_auc"].sub(merged["test_auc"])

for label, column in (
    ("Mean TEST AUC (LogReg):", "test_auc"),
    ("Mean TEST AUC (XGBoost):", "xgb_test_auc"),
    ("Mean TEST AUC gain:", "test_auc_gain"),
):
    print(label, round(merged[column].mean(), 4))

# Largest XGBoost improvements first.
merged.sort_values("test_auc_gain", ascending=False).head(12)


Mean TEST AUC (LogReg): 0.6883
Mean TEST AUC (XGBoost): 0.7215
Mean TEST AUC gain: 0.0333


Unnamed: 0,task,valid_auc,test_auc,xgb_valid_auc,xgb_test_auc,test_auc_gain
1,NR-AR-LBD,0.798605,0.744672,0.82186,0.848543,0.103871
6,SR-MMP,0.775773,0.697769,0.7792,0.780169,0.0824
0,NR-AhR,0.744123,0.747085,0.785477,0.814509,0.067424
4,SR-p53,0.668342,0.708906,0.709459,0.774691,0.065786
5,NR-Aromatase,0.718056,0.699446,0.712213,0.730355,0.030909
9,NR-ER,0.615755,0.637411,0.635017,0.666308,0.028896
10,SR-ARE,0.71076,0.61945,0.738214,0.646531,0.027081
11,NR-PPAR-gamma,0.701861,0.595348,0.739602,0.620911,0.025563
8,SR-HSE,0.688851,0.674326,0.764666,0.697345,0.023019
7,NR-AR,0.768879,0.685239,0.80468,0.699774,0.014535
