#  Data Featurization pairwise/dense edges




In [2]:
import os
import pandas as pd
import torch
from rdkit import Chem, RDLogger
from deepchem.feat import PagtnMolGraphFeaturizer
from torch_geometric.data import Data
from tqdm import tqdm

# Silence RDKit warnings for cleaner output
RDLogger.DisableLog('rdApp.warning')

In [3]:
# Featurizer (DeepChem). 
featurizer = PagtnMolGraphFeaturizer(max_length=5)

# CYP3A4

In [3]:
# Configuration
ISOFORM = "3A4"
SPLITS = ["train", "val", "test"]
CSV_DIR = os.path.join("..", "data", "processed")
OUT_ROOT = os.path.join("..", "GraphDataset")
os.makedirs(OUT_ROOT, exist_ok=True)

In [5]:
# Helper: ensure edge_index has shape [2, num_edges] for PyG
def to_edge_index_tensor(edge_index):
    # edge_index may be (num_edges, 2) or (2, num_edges). Convert to LongTensor [2, E].
    ei = torch.tensor(edge_index, dtype=torch.long)
    if ei.dim() == 2 and ei.shape[0] == 2:
        # already [2, E]
        return ei.contiguous()
    elif ei.dim() == 2 and ei.shape[1] == 2:
        # [E, 2] -> transpose
        return ei.t().contiguous()
    else:
        raise ValueError(f"Unexpected edge_index shape: {tuple(ei.shape)}")

In [6]:
# Loop over splits
for split in SPLITS:
    csv_path = os.path.join(CSV_DIR, f"{ISOFORM}_{split}.csv")
    out_dir = os.path.join(OUT_ROOT, ISOFORM, split)
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(csv_path)
    print(f"Processing {len(df)} molecules for {ISOFORM} [{split}] -> saving to {out_dir}")

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        smiles = row["Drug"]
        try:
            drug_id = int(float(row["Drug_ID"]))
        except Exception:
            drug_id = str(row["Drug_ID"])
        label = int(row["Y"])

        # parse SMILES
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"[WARN] invalid SMILES at index {idx}: {smiles}")
            continue

        # Use public API: featurize returns a list (even if one mol)
        try:
            feats = featurizer.featurize([mol])   # returns a list-like result
            if len(feats) == 0:
                print(f"[WARN] featurizer returned empty for {drug_id}")
                continue
            f = feats[0]
        except Exception as e:
            print(f"[ERROR] featurizer failed for {drug_id}: {e}")
            continue

        # f.node_features, f.edge_index, f.edge_features are expected attributes
        # convert to torch tensors and ensure correct shapes
        try:
            x = torch.tensor(f.node_features, dtype=torch.float)          # [N_nodes, node_feat_dim]
            edge_attr = torch.tensor(f.edge_features, dtype=torch.float)  # [num_edges, edge_feat_dim]
            edge_index = to_edge_index_tensor(f.edge_index)               # [2, num_edges]
        except Exception as e:
            print(f"[ERROR] Bad featurizer output for {drug_id}: {e}")
            continue

        # Build Data object; save label as float (for BCEWithLogitsLoss) but can be changed later
        data = Data(
            x=x,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=torch.tensor([label], dtype=torch.float)
        )

        # Save file with integer ID: "<drugid>_<label>.pt"
        fname = f"{drug_id}_{label}.pt"
        torch.save(data, os.path.join(out_dir, fname))

    print(f"→ Completed {split}: saved graphs to {out_dir}\n")


Processing 9862 molecules for 3A4 [train] -> saving to ..\GraphDataset\3A4\train


100%|██████████| 9862/9862 [02:38<00:00, 62.21it/s] 


→ Completed train: saved graphs to ..\GraphDataset\3A4\train

Processing 1232 molecules for 3A4 [val] -> saving to ..\GraphDataset\3A4\val


100%|██████████| 1232/1232 [00:43<00:00, 28.04it/s]


→ Completed val: saved graphs to ..\GraphDataset\3A4\val

Processing 1234 molecules for 3A4 [test] -> saving to ..\GraphDataset\3A4\test


100%|██████████| 1234/1234 [00:19<00:00, 63.56it/s]

→ Completed test: saved graphs to ..\GraphDataset\3A4\test






In [7]:
from glob import glob
import torch
paths = glob("../GraphDataset/3A4/train/*.pt")[:5]
for p in paths:
    d = torch.load(p)
    print(p, "x:", d.x.shape, "edge_index:", d.edge_index.shape, "edge_attr:", d.edge_attr.shape, "y:", d.y)


../GraphDataset/3A4/train\1001112_1.pt x: torch.Size([31, 94]) edge_index: torch.Size([2, 961]) edge_attr: torch.Size([961, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001133_1.pt x: torch.Size([30, 94]) edge_index: torch.Size([2, 900]) edge_attr: torch.Size([900, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001459_1.pt x: torch.Size([35, 94]) edge_index: torch.Size([2, 1225]) edge_attr: torch.Size([1225, 42]) y: tensor([1.])
../GraphDataset/3A4/train\100181_0.pt x: torch.Size([22, 94]) edge_index: torch.Size([2, 484]) edge_attr: torch.Size([484, 42]) y: tensor([0.])
../GraphDataset/3A4/train\100426_0.pt x: torch.Size([28, 94]) edge_index: torch.Size([2, 784]) edge_attr: torch.Size([784, 42]) y: tensor([0.])


  d = torch.load(p)


In [8]:
import torch
d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")
print("N nodes:", d.x.shape[0])
print("node feat dim:", d.x.shape[1])
print("first node features:", d.x[0])
print("edge_index first 10 cols:", d.edge_index[:, :10])
print("edge_attr first 5:", d.edge_attr[:5])
print("label:", d.y)


N nodes: 31
node feat dim: 94
first node features: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.])
edge_index first 10 cols: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
edge_attr first 5: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
      

  d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")


# CYP1A2

In [4]:
# Configuration
ISOFORM = "1A2"
SPLITS = ["train", "val", "test"]
CSV_DIR = os.path.join("..", "data", "processed")
OUT_ROOT = os.path.join("..", "GraphDataset")
os.makedirs(OUT_ROOT, exist_ok=True)

In [5]:
# Helper: ensure edge_index has shape [2, num_edges] for PyG
def to_edge_index_tensor(edge_index):
    # edge_index may be (num_edges, 2) or (2, num_edges). Convert to LongTensor [2, E].
    ei = torch.tensor(edge_index, dtype=torch.long)
    if ei.dim() == 2 and ei.shape[0] == 2:
        # already [2, E]
        return ei.contiguous()
    elif ei.dim() == 2 and ei.shape[1] == 2:
        # [E, 2] -> transpose
        return ei.t().contiguous()
    else:
        raise ValueError(f"Unexpected edge_index shape: {tuple(ei.shape)}")

In [6]:
# Loop over splits
for split in SPLITS:
    csv_path = os.path.join(CSV_DIR, f"{ISOFORM}_{split}.csv")
    out_dir = os.path.join(OUT_ROOT, ISOFORM, split)
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(csv_path)
    print(f"Processing {len(df)} molecules for {ISOFORM} [{split}] -> saving to {out_dir}")

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        smiles = row["Drug"]
        try:
            drug_id = int(float(row["Drug_ID"]))
        except Exception:
            drug_id = str(row["Drug_ID"])
        label = int(row["Y"])

        # parse SMILES
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"[WARN] invalid SMILES at index {idx}: {smiles}")
            continue

        # Use public API: featurize returns a list (even if one mol)
        try:
            feats = featurizer.featurize([mol])   # returns a list-like result
            if len(feats) == 0:
                print(f"[WARN] featurizer returned empty for {drug_id}")
                continue
            f = feats[0]
        except Exception as e:
            print(f"[ERROR] featurizer failed for {drug_id}: {e}")
            continue

        # f.node_features, f.edge_index, f.edge_features are expected attributes
        # convert to torch tensors and ensure correct shapes
        try:
            x = torch.tensor(f.node_features, dtype=torch.float)          # [N_nodes, node_feat_dim]
            edge_attr = torch.tensor(f.edge_features, dtype=torch.float)  # [num_edges, edge_feat_dim]
            edge_index = to_edge_index_tensor(f.edge_index)               # [2, num_edges]
        except Exception as e:
            print(f"[ERROR] Bad featurizer output for {drug_id}: {e}")
            continue

        # Build Data object; save label as float (for BCEWithLogitsLoss) but can be changed later
        data = Data(
            x=x,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=torch.tensor([label], dtype=torch.float)
        )

        # Save file with integer ID: "<drugid>_<label>.pt"
        fname = f"{drug_id}_{label}.pt"
        torch.save(data, os.path.join(out_dir, fname))

    print(f"→ Completed {split}: saved graphs to {out_dir}\n")


Processing 10063 molecules for 1A2 [train] -> saving to ..\GraphDataset\1A2\train


100%|██████████| 10063/10063 [02:22<00:00, 70.65it/s]


→ Completed train: saved graphs to ..\GraphDataset\1A2\train

Processing 1257 molecules for 1A2 [val] -> saving to ..\GraphDataset\1A2\val


100%|██████████| 1257/1257 [00:37<00:00, 33.12it/s]


→ Completed val: saved graphs to ..\GraphDataset\1A2\val

Processing 1259 molecules for 1A2 [test] -> saving to ..\GraphDataset\1A2\test


100%|██████████| 1259/1259 [00:47<00:00, 26.56it/s]

→ Completed test: saved graphs to ..\GraphDataset\1A2\test






In [7]:
from glob import glob
import torch
paths = glob("../GraphDataset/3A4/train/*.pt")[:5]
for p in paths:
    d = torch.load(p)
    print(p, "x:", d.x.shape, "edge_index:", d.edge_index.shape, "edge_attr:", d.edge_attr.shape, "y:", d.y)


../GraphDataset/3A4/train\1001112_1.pt x: torch.Size([31, 94]) edge_index: torch.Size([2, 961]) edge_attr: torch.Size([961, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001133_1.pt x: torch.Size([30, 94]) edge_index: torch.Size([2, 900]) edge_attr: torch.Size([900, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001459_1.pt x: torch.Size([35, 94]) edge_index: torch.Size([2, 1225]) edge_attr: torch.Size([1225, 42]) y: tensor([1.])
../GraphDataset/3A4/train\100181_0.pt x: torch.Size([22, 94]) edge_index: torch.Size([2, 484]) edge_attr: torch.Size([484, 42]) y: tensor([0.])
../GraphDataset/3A4/train\100426_0.pt x: torch.Size([28, 94]) edge_index: torch.Size([2, 784]) edge_attr: torch.Size([784, 42]) y: tensor([0.])


  d = torch.load(p)


In [8]:
import torch
d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")
print("N nodes:", d.x.shape[0])
print("node feat dim:", d.x.shape[1])
print("first node features:", d.x[0])
print("edge_index first 10 cols:", d.edge_index[:, :10])
print("edge_attr first 5:", d.edge_attr[:5])
print("label:", d.y)


N nodes: 31
node feat dim: 94
first node features: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.])
edge_index first 10 cols: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
edge_attr first 5: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
      

  d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")


# CYP2C9

In [9]:
# Configuration
ISOFORM = "2C9"
SPLITS = ["train", "val", "test"]
CSV_DIR = os.path.join("..", "data", "processed")
OUT_ROOT = os.path.join("..", "GraphDataset")
os.makedirs(OUT_ROOT, exist_ok=True)

In [10]:
# Helper: ensure edge_index has shape [2, num_edges] for PyG
def to_edge_index_tensor(edge_index):
    # edge_index may be (num_edges, 2) or (2, num_edges). Convert to LongTensor [2, E].
    ei = torch.tensor(edge_index, dtype=torch.long)
    if ei.dim() == 2 and ei.shape[0] == 2:
        # already [2, E]
        return ei.contiguous()
    elif ei.dim() == 2 and ei.shape[1] == 2:
        # [E, 2] -> transpose
        return ei.t().contiguous()
    else:
        raise ValueError(f"Unexpected edge_index shape: {tuple(ei.shape)}")

In [11]:
# Loop over splits
for split in SPLITS:
    csv_path = os.path.join(CSV_DIR, f"{ISOFORM}_{split}.csv")
    out_dir = os.path.join(OUT_ROOT, ISOFORM, split)
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(csv_path)
    print(f"Processing {len(df)} molecules for {ISOFORM} [{split}] -> saving to {out_dir}")

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        smiles = row["Drug"]
        try:
            drug_id = int(float(row["Drug_ID"]))
        except Exception:
            drug_id = str(row["Drug_ID"])
        label = int(row["Y"])

        # parse SMILES
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"[WARN] invalid SMILES at index {idx}: {smiles}")
            continue

        # Use public API: featurize returns a list (even if one mol)
        try:
            feats = featurizer.featurize([mol])   # returns a list-like result
            if len(feats) == 0:
                print(f"[WARN] featurizer returned empty for {drug_id}")
                continue
            f = feats[0]
        except Exception as e:
            print(f"[ERROR] featurizer failed for {drug_id}: {e}")
            continue

        # f.node_features, f.edge_index, f.edge_features are expected attributes
        # convert to torch tensors and ensure correct shapes
        try:
            x = torch.tensor(f.node_features, dtype=torch.float)          # [N_nodes, node_feat_dim]
            edge_attr = torch.tensor(f.edge_features, dtype=torch.float)  # [num_edges, edge_feat_dim]
            edge_index = to_edge_index_tensor(f.edge_index)               # [2, num_edges]
        except Exception as e:
            print(f"[ERROR] Bad featurizer output for {drug_id}: {e}")
            continue

        # Build Data object; save label as float (for BCEWithLogitsLoss) but can be changed later
        data = Data(
            x=x,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=torch.tensor([label], dtype=torch.float)
        )

        # Save file with integer ID: "<drugid>_<label>.pt"
        fname = f"{drug_id}_{label}.pt"
        torch.save(data, os.path.join(out_dir, fname))

    print(f"→ Completed {split}: saved graphs to {out_dir}\n")


Processing 9673 molecules for 2C9 [train] -> saving to ..\GraphDataset\2C9\train


100%|██████████| 9673/9673 [02:26<00:00, 65.90it/s] 


→ Completed train: saved graphs to ..\GraphDataset\2C9\train

Processing 1209 molecules for 2C9 [val] -> saving to ..\GraphDataset\2C9\val


100%|██████████| 1209/1209 [00:15<00:00, 75.63it/s] 


→ Completed val: saved graphs to ..\GraphDataset\2C9\val

Processing 1210 molecules for 2C9 [test] -> saving to ..\GraphDataset\2C9\test


100%|██████████| 1210/1210 [00:18<00:00, 66.42it/s]

→ Completed test: saved graphs to ..\GraphDataset\2C9\test






In [12]:
from glob import glob
import torch
paths = glob("../GraphDataset/3A4/train/*.pt")[:5]
for p in paths:
    d = torch.load(p)
    print(p, "x:", d.x.shape, "edge_index:", d.edge_index.shape, "edge_attr:", d.edge_attr.shape, "y:", d.y)


../GraphDataset/3A4/train\1001112_1.pt x: torch.Size([31, 94]) edge_index: torch.Size([2, 961]) edge_attr: torch.Size([961, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001133_1.pt x: torch.Size([30, 94]) edge_index: torch.Size([2, 900]) edge_attr: torch.Size([900, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001459_1.pt x: torch.Size([35, 94]) edge_index: torch.Size([2, 1225]) edge_attr: torch.Size([1225, 42]) y: tensor([1.])
../GraphDataset/3A4/train\100181_0.pt x: torch.Size([22, 94]) edge_index: torch.Size([2, 484]) edge_attr: torch.Size([484, 42]) y: tensor([0.])
../GraphDataset/3A4/train\100426_0.pt x: torch.Size([28, 94]) edge_index: torch.Size([2, 784]) edge_attr: torch.Size([784, 42]) y: tensor([0.])


  d = torch.load(p)


In [13]:
import torch
d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")
print("N nodes:", d.x.shape[0])
print("node feat dim:", d.x.shape[1])
print("first node features:", d.x[0])
print("edge_index first 10 cols:", d.edge_index[:, :10])
print("edge_attr first 5:", d.edge_attr[:5])
print("label:", d.y)


N nodes: 31
node feat dim: 94
first node features: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.])
edge_index first 10 cols: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
edge_attr first 5: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
      

  d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")


# 2C9_train_downsampled.csv

In [4]:
# Configuration
ISOFORM = "2C9"
SPLITS = ["train", "val", "test"]
CSV_DIR = os.path.join("..", "data", "processed")
OUT_ROOT = os.path.join("..", "GraphDataset")
os.makedirs(OUT_ROOT, exist_ok=True)

In [5]:
# Helper: ensure edge_index has shape [2, num_edges] for PyG
def to_edge_index_tensor(edge_index):
    # edge_index may be (num_edges, 2) or (2, num_edges). Convert to LongTensor [2, E].
    ei = torch.tensor(edge_index, dtype=torch.long)
    if ei.dim() == 2 and ei.shape[0] == 2:
        # already [2, E]
        return ei.contiguous()
    elif ei.dim() == 2 and ei.shape[1] == 2:
        # [E, 2] -> transpose
        return ei.t().contiguous()
    else:
        raise ValueError(f"Unexpected edge_index shape: {tuple(ei.shape)}")

In [6]:
# Loop over splits
for split in SPLITS:
    csv_path = os.path.join(CSV_DIR, f"{ISOFORM}_{split}.csv")
    out_dir = os.path.join(OUT_ROOT, ISOFORM, split)
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(csv_path)
    print(f"Processing {len(df)} molecules for {ISOFORM} [{split}] -> saving to {out_dir}")

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        smiles = row["Drug"]
        try:
            drug_id = int(float(row["Drug_ID"]))
        except Exception:
            drug_id = str(row["Drug_ID"])
        label = int(row["Y"])

        # parse SMILES
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"[WARN] invalid SMILES at index {idx}: {smiles}")
            continue

        # Use public API: featurize returns a list (even if one mol)
        try:
            feats = featurizer.featurize([mol])   # returns a list-like result
            if len(feats) == 0:
                print(f"[WARN] featurizer returned empty for {drug_id}")
                continue
            f = feats[0]
        except Exception as e:
            print(f"[ERROR] featurizer failed for {drug_id}: {e}")
            continue

        # f.node_features, f.edge_index, f.edge_features are expected attributes
        # convert to torch tensors and ensure correct shapes
        try:
            x = torch.tensor(f.node_features, dtype=torch.float)          # [N_nodes, node_feat_dim]
            edge_attr = torch.tensor(f.edge_features, dtype=torch.float)  # [num_edges, edge_feat_dim]
            edge_index = to_edge_index_tensor(f.edge_index)               # [2, num_edges]
        except Exception as e:
            print(f"[ERROR] Bad featurizer output for {drug_id}: {e}")
            continue

        # Build Data object; save label as float (for BCEWithLogitsLoss) but can be changed later
        data = Data(
            x=x,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=torch.tensor([label], dtype=torch.float)
        )

        # Save file with integer ID: "<drugid>_<label>.pt"
        fname = f"{drug_id}_{label}.pt"
        torch.save(data, os.path.join(out_dir, fname))

    print(f"→ Completed {split}: saved graphs to {out_dir}\n")


Processing 6550 molecules for 2C9 [train] -> saving to ..\GraphDataset\2C9\train


100%|██████████| 6550/6550 [03:03<00:00, 35.72it/s]


→ Completed train: saved graphs to ..\GraphDataset\2C9\train

Processing 1209 molecules for 2C9 [val] -> saving to ..\GraphDataset\2C9\val


100%|██████████| 1209/1209 [00:24<00:00, 50.12it/s] 


→ Completed val: saved graphs to ..\GraphDataset\2C9\val

Processing 1210 molecules for 2C9 [test] -> saving to ..\GraphDataset\2C9\test


100%|██████████| 1210/1210 [00:52<00:00, 23.16it/s]

→ Completed test: saved graphs to ..\GraphDataset\2C9\test






In [7]:
from glob import glob
import torch
paths = glob("../GraphDataset/3A4/train/*.pt")[:5]
for p in paths:
    d = torch.load(p)
    print(p, "x:", d.x.shape, "edge_index:", d.edge_index.shape, "edge_attr:", d.edge_attr.shape, "y:", d.y)


../GraphDataset/3A4/train\1001112_1.pt x: torch.Size([31, 94]) edge_index: torch.Size([2, 961]) edge_attr: torch.Size([961, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001133_1.pt x: torch.Size([30, 94]) edge_index: torch.Size([2, 900]) edge_attr: torch.Size([900, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001459_1.pt x: torch.Size([35, 94]) edge_index: torch.Size([2, 1225]) edge_attr: torch.Size([1225, 42]) y: tensor([1.])
../GraphDataset/3A4/train\100181_0.pt x: torch.Size([22, 94]) edge_index: torch.Size([2, 484]) edge_attr: torch.Size([484, 42]) y: tensor([0.])
../GraphDataset/3A4/train\100426_0.pt x: torch.Size([28, 94]) edge_index: torch.Size([2, 784]) edge_attr: torch.Size([784, 42]) y: tensor([0.])


  d = torch.load(p)


In [8]:
import torch
d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")
print("N nodes:", d.x.shape[0])
print("node feat dim:", d.x.shape[1])
print("first node features:", d.x[0])
print("edge_index first 10 cols:", d.edge_index[:, :10])
print("edge_attr first 5:", d.edge_attr[:5])
print("label:", d.y)


N nodes: 31
node feat dim: 94
first node features: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.])
edge_index first 10 cols: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
edge_attr first 5: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
      

  d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")


# CYP2C19

In [14]:
# Configuration
ISOFORM = "2C19"
SPLITS = ["train", "val", "test"]
CSV_DIR = os.path.join("..", "data", "processed")
OUT_ROOT = os.path.join("..", "GraphDataset")
os.makedirs(OUT_ROOT, exist_ok=True)

In [15]:
# Helper: ensure edge_index has shape [2, num_edges] for PyG
def to_edge_index_tensor(edge_index):
    # edge_index may be (num_edges, 2) or (2, num_edges). Convert to LongTensor [2, E].
    ei = torch.tensor(edge_index, dtype=torch.long)
    if ei.dim() == 2 and ei.shape[0] == 2:
        # already [2, E]
        return ei.contiguous()
    elif ei.dim() == 2 and ei.shape[1] == 2:
        # [E, 2] -> transpose
        return ei.t().contiguous()
    else:
        raise ValueError(f"Unexpected edge_index shape: {tuple(ei.shape)}")

In [16]:
# Loop over splits
for split in SPLITS:
    csv_path = os.path.join(CSV_DIR, f"{ISOFORM}_{split}.csv")
    out_dir = os.path.join(OUT_ROOT, ISOFORM, split)
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(csv_path)
    print(f"Processing {len(df)} molecules for {ISOFORM} [{split}] -> saving to {out_dir}")

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        smiles = row["Drug"]
        try:
            drug_id = int(float(row["Drug_ID"]))
        except Exception:
            drug_id = str(row["Drug_ID"])

        label = int(row["Y"])

        # parse SMILES
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"[WARN] invalid SMILES at index {idx}: {smiles}")
            continue

        # Use public API: featurize returns a list (even if one mol)
        try:
            feats = featurizer.featurize([mol])   # returns a list-like result
            if len(feats) == 0:
                print(f"[WARN] featurizer returned empty for {drug_id}")
                continue
            f = feats[0]
        except Exception as e:
            print(f"[ERROR] featurizer failed for {drug_id}: {e}")
            continue

        # f.node_features, f.edge_index, f.edge_features are expected attributes
        # convert to torch tensors and ensure correct shapes
        try:
            x = torch.tensor(f.node_features, dtype=torch.float)          # [N_nodes, node_feat_dim]
            edge_attr = torch.tensor(f.edge_features, dtype=torch.float)  # [num_edges, edge_feat_dim]
            edge_index = to_edge_index_tensor(f.edge_index)               # [2, num_edges]
        except Exception as e:
            print(f"[ERROR] Bad featurizer output for {drug_id}: {e}")
            continue

        # Build Data object; save label as float (for BCEWithLogitsLoss) but can be changed later
        data = Data(
            x=x,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=torch.tensor([label], dtype=torch.float)
        )

        # Save file with integer ID: "<drugid>_<label>.pt"
        fname = f"{drug_id}_{label}.pt"
        torch.save(data, os.path.join(out_dir, fname))

    print(f"→ Completed {split}: saved graphs to {out_dir}\n")


Processing 10132 molecules for 2C19 [train] -> saving to ..\GraphDataset\2C19\train


100%|██████████| 10132/10132 [02:28<00:00, 68.16it/s]


→ Completed train: saved graphs to ..\GraphDataset\2C19\train

Processing 1266 molecules for 2C19 [val] -> saving to ..\GraphDataset\2C19\val


100%|██████████| 1266/1266 [00:19<00:00, 66.04it/s]


→ Completed val: saved graphs to ..\GraphDataset\2C19\val

Processing 1267 molecules for 2C19 [test] -> saving to ..\GraphDataset\2C19\test


100%|██████████| 1267/1267 [00:19<00:00, 64.40it/s]

→ Completed test: saved graphs to ..\GraphDataset\2C19\test






In [17]:
from glob import glob
import torch
paths = glob("../GraphDataset/3A4/train/*.pt")[:5]
for p in paths:
    d = torch.load(p)
    print(p, "x:", d.x.shape, "edge_index:", d.edge_index.shape, "edge_attr:", d.edge_attr.shape, "y:", d.y)


../GraphDataset/3A4/train\1001112_1.pt x: torch.Size([31, 94]) edge_index: torch.Size([2, 961]) edge_attr: torch.Size([961, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001133_1.pt x: torch.Size([30, 94]) edge_index: torch.Size([2, 900]) edge_attr: torch.Size([900, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001459_1.pt x: torch.Size([35, 94]) edge_index: torch.Size([2, 1225]) edge_attr: torch.Size([1225, 42]) y: tensor([1.])
../GraphDataset/3A4/train\100181_0.pt x: torch.Size([22, 94]) edge_index: torch.Size([2, 484]) edge_attr: torch.Size([484, 42]) y: tensor([0.])
../GraphDataset/3A4/train\100426_0.pt x: torch.Size([28, 94]) edge_index: torch.Size([2, 784]) edge_attr: torch.Size([784, 42]) y: tensor([0.])


  d = torch.load(p)


In [18]:
import torch
d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")
print("N nodes:", d.x.shape[0])
print("node feat dim:", d.x.shape[1])
print("first node features:", d.x[0])
print("edge_index first 10 cols:", d.edge_index[:, :10])
print("edge_attr first 5:", d.edge_attr[:5])
print("label:", d.y)


N nodes: 31
node feat dim: 94
first node features: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.])
edge_index first 10 cols: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
edge_attr first 5: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
      

  d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")


# CYP2D6

In [19]:
# Configuration
ISOFORM = "2D6"
SPLITS = ["train", "val", "test"]
CSV_DIR = os.path.join("..", "data", "processed")
OUT_ROOT = os.path.join("..", "GraphDataset")
os.makedirs(OUT_ROOT, exist_ok=True)

In [20]:
# Helper: ensure edge_index has shape [2, num_edges] for PyG
def to_edge_index_tensor(edge_index):
    # edge_index may be (num_edges, 2) or (2, num_edges). Convert to LongTensor [2, E].
    ei = torch.tensor(edge_index, dtype=torch.long)
    if ei.dim() == 2 and ei.shape[0] == 2:
        # already [2, E]
        return ei.contiguous()
    elif ei.dim() == 2 and ei.shape[1] == 2:
        # [E, 2] -> transpose
        return ei.t().contiguous()
    else:
        raise ValueError(f"Unexpected edge_index shape: {tuple(ei.shape)}")

In [21]:
# Loop over splits
for split in SPLITS:
    csv_path = os.path.join(CSV_DIR, f"{ISOFORM}_{split}.csv")
    out_dir = os.path.join(OUT_ROOT, ISOFORM, split)
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(csv_path)
    print(f"Processing {len(df)} molecules for {ISOFORM} [{split}] -> saving to {out_dir}")

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        smiles = row["Drug"]
        try:
            drug_id = int(float(row["Drug_ID"]))
        except Exception:
            drug_id = str(row["Drug_ID"])
        label = int(row["Y"])

        # parse SMILES
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"[WARN] invalid SMILES at index {idx}: {smiles}")
            continue

        # Use public API: featurize returns a list (even if one mol)
        try:
            feats = featurizer.featurize([mol])   # returns a list-like result
            if len(feats) == 0:
                print(f"[WARN] featurizer returned empty for {drug_id}")
                continue
            f = feats[0]
        except Exception as e:
            print(f"[ERROR] featurizer failed for {drug_id}: {e}")
            continue

        # f.node_features, f.edge_index, f.edge_features are expected attributes
        # convert to torch tensors and ensure correct shapes
        try:
            x = torch.tensor(f.node_features, dtype=torch.float)          # [N_nodes, node_feat_dim]
            edge_attr = torch.tensor(f.edge_features, dtype=torch.float)  # [num_edges, edge_feat_dim]
            edge_index = to_edge_index_tensor(f.edge_index)               # [2, num_edges]
        except Exception as e:
            print(f"[ERROR] Bad featurizer output for {drug_id}: {e}")
            continue

        # Build Data object; save label as float (for BCEWithLogitsLoss) but can be changed later
        data = Data(
            x=x,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=torch.tensor([label], dtype=torch.float)
        )

        # Save file with integer ID: "<drugid>_<label>.pt"
        fname = f"{drug_id}_{label}.pt"
        torch.save(data, os.path.join(out_dir, fname))

    print(f"→ Completed {split}: saved graphs to {out_dir}\n")


Processing 8176 molecules for 2D6 [train] -> saving to ..\GraphDataset\2D6\train


100%|██████████| 8176/8176 [04:41<00:00, 29.05it/s]


→ Completed train: saved graphs to ..\GraphDataset\2D6\train

Processing 1022 molecules for 2D6 [val] -> saving to ..\GraphDataset\2D6\val


100%|██████████| 1022/1022 [00:36<00:00, 27.87it/s]


→ Completed val: saved graphs to ..\GraphDataset\2D6\val

Processing 1022 molecules for 2D6 [test] -> saving to ..\GraphDataset\2D6\test


100%|██████████| 1022/1022 [00:34<00:00, 29.37it/s]

→ Completed test: saved graphs to ..\GraphDataset\2D6\test






In [22]:
from glob import glob
import torch
paths = glob("../GraphDataset/3A4/train/*.pt")[:5]
for p in paths:
    d = torch.load(p)
    print(p, "x:", d.x.shape, "edge_index:", d.edge_index.shape, "edge_attr:", d.edge_attr.shape, "y:", d.y)


../GraphDataset/3A4/train\1001112_1.pt x: torch.Size([31, 94]) edge_index: torch.Size([2, 961]) edge_attr: torch.Size([961, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001133_1.pt x: torch.Size([30, 94]) edge_index: torch.Size([2, 900]) edge_attr: torch.Size([900, 42]) y: tensor([1.])
../GraphDataset/3A4/train\1001459_1.pt x: torch.Size([35, 94]) edge_index: torch.Size([2, 1225]) edge_attr: torch.Size([1225, 42]) y: tensor([1.])
../GraphDataset/3A4/train\100181_0.pt x: torch.Size([22, 94]) edge_index: torch.Size([2, 484]) edge_attr: torch.Size([484, 42]) y: tensor([0.])
../GraphDataset/3A4/train\100426_0.pt x: torch.Size([28, 94]) edge_index: torch.Size([2, 784]) edge_attr: torch.Size([784, 42]) y: tensor([0.])


  d = torch.load(p)


In [23]:
import torch
d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")
print("N nodes:", d.x.shape[0])
print("node feat dim:", d.x.shape[1])
print("first node features:", d.x[0])
print("edge_index first 10 cols:", d.edge_index[:, :10])
print("edge_attr first 5:", d.edge_attr[:5])
print("label:", d.y)


N nodes: 31
node feat dim: 94
first node features: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.])
edge_index first 10 cols: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
edge_attr first 5: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
      

  d = torch.load("../GraphDataset/3A4/train/1001112_1.pt")
