In [2]:
# Standard library
from collections import defaultdict
from typing import List, Tuple
import importlib.resources as pkg_resources
from multiprocessing.pool import ThreadPool

# Third-party scientific stack
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import eigsh

# RDKit
from rdkit import Chem

# PyTorch core
import torch
import torch.nn as nn
import torch.nn.functional as F

# PyTorch Geometric
from torch_geometric.data import Data, Batch, DataLoader
from torch_geometric.nn import GCNConv, GINConv, BatchNorm, global_mean_pool
from torch_geometric.loader import DataLoader as PyGDataLoader

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def set_seed(seed):
    """
    Fix all random seeds for reproducibility across Python, NumPy, and PyTorch.
    """
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(59)

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool

class PolyatomicNet(nn.Module):
    def __init__(
        self,
        input_dim,
        graph_feat_dim=58,
        hidden_dim=64,
        output_dim=1,
        num_layers=2,
        heads=4,
        dropout=0.2
    ):
        super(PolyatomicNet, self).__init__()
        # Store graph feature dimension for reshaping
        self.graph_feat_dim = graph_feat_dim

        # Build GNN layers
        self.gnn_layers = nn.ModuleList()
        # First layer: multi-head
        self.gnn_layers.append(
            GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        )
        # Remaining layers: single-head
        for _ in range(num_layers - 1):
            self.gnn_layers.append(
                GATConv(hidden_dim * heads, hidden_dim, heads=1, dropout=dropout)
            )

        # Project graph-level features
        self.graph_feat_proj = nn.Sequential(
            nn.Linear(graph_feat_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        # Final MLP head combines GNN output + graph_feats
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        # Handle empty edge_index: use self-loops
        if edge_index.numel() == 0 or edge_index.size(1) == 0:
            N = x.size(0)
            edge_index = torch.stack([torch.arange(N, device=x.device), torch.arange(N, device=x.device)], dim=0)

        # GNN forward
        for gnn in self.gnn_layers:
            x = F.elu(gnn(x, edge_index))

        # Global pooling over nodes
        x = global_mean_pool(x, batch)  # [batch_size, hidden_dim]

        # Project graph-level features
        graph_feats_flat = data.graph_feats  # [batch_size, graph_feat_dim]
        batch_size = int(batch.max().item()) + 1
        g_feats = graph_feats_flat.view(batch_size, self.graph_feat_dim)
        g_feats = self.graph_feat_proj(g_feats)

        # Concatenate and final MLP
        out = torch.cat([x, g_feats], dim=1)
        out = self.fc(out)
        return out.view(-1)


In [6]:
def collate_with_graph_feats(batch_list):
    graph_feats = torch.stack([data.graph_feats for data in batch_list], dim=0)
    for data in batch_list:
        del data.graph_feats
    batched = Batch.from_data_list(batch_list)
    batched.graph_feats = graph_feats
    return batched

In [7]:
from torch.amp import autocast, GradScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

scaler_grad = GradScaler()

def train(model, loader, optimizer, loss_fn, scaler=scaler_grad, accum_steps=8):
    model.train()
    total_loss = 0.0
    optimizer.zero_grad()

    for i, batch in enumerate(loader):
        batch = batch.to(device)
        with autocast(device_type='cuda', dtype=torch.float16):
            output = model(batch)
            loss = loss_fn(output, batch.y.view(-1)) / accum_steps

        scaler_grad.scale(loss).backward()

        if (i + 1) % accum_steps == 0 or (i + 1 == len(loader)):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item() * batch.num_graphs * accum_steps

    return total_loss / len(loader.dataset)

def compute_metrics_with_ci(trues, preds, n_boot=2000, alpha=0.05, seed=42):
    trues = np.array(trues)
    preds = np.array(preds)
    mae = mean_absolute_error(trues, preds)
    rmse = np.sqrt(mean_squared_error(trues, preds))

    rng = np.random.RandomState(seed)
    mae_samples, rmse_samples = [], []
    n = len(trues)
    for _ in range(n_boot):
        idx = rng.randint(0, n, n)
        t, p = trues[idx], preds[idx]
        mae_samples.append(mean_absolute_error(t, p))
        rmse_samples.append(np.sqrt(mean_squared_error(t, p)))

    lower, upper = 100 * alpha / 2, 100 * (1 - alpha / 2)
    mae_ci = (np.percentile(mae_samples, lower), np.percentile(mae_samples, upper))
    rmse_ci = (np.percentile(rmse_samples, lower), np.percentile(rmse_samples, upper))
    return {'mae': mae, 'mae_ci': mae_ci, 'rmse': rmse, 'rmse_ci': rmse_ci}

def evaluate(model, loader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad(), autocast(device_type='cuda', dtype=torch.float16):
        for batch in loader:
            batch = batch.to(device)
            out = model(batch)
            preds.append(out.view(-1))
            trues.append(batch.y.view(-1))
    preds = torch.cat(preds)
    trues = torch.cat(trues)
    return torch.sqrt(torch.mean((preds - trues) ** 2)).item()

def evaluate_with_ci(model, loader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad(), autocast(device_type='cuda', dtype=torch.float16):
        for batch in loader:
            batch = batch.to(device)
            out = model(batch)
            y_true = batch.y.view(-1).cpu().tolist()
            y_pred = out.view(-1).cpu().tolist()
            trues.extend(y_true)
            preds.extend(y_pred)
    return compute_metrics_with_ci(trues, preds)


In [8]:
loading = True
if loading:
    all_data = torch.load('/content/drive/MyDrive/all_data_lipophil_enriched.pt', weights_only=False)
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    ys = np.array([d.y.item() for d in all_data]).reshape(-1, 1)
    ys_scaled = scaler.fit_transform(ys)
    for i, d in enumerate(all_data):
        d.y = torch.tensor([ys_scaled[i][0]], dtype=torch.float32)

In [9]:
from transformers import get_linear_schedule_with_warmup

In [10]:
import warnings
warnings.filterwarnings("ignore", message=".*An output with one or more elements was resized.*")
warnings.filterwarnings("ignore", message=".*FutureWarning:.*")
warnings.filterwarnings("ignore", category=UserWarning, module="torch.optim.lr_scheduler")

In [24]:
data_list = all_data
from torch_geometric.loader import DataLoader
train_n = int(0.8*len(data_list))
train_ds, test_ds = torch.utils.data.random_split(data_list, [train_n,len(data_list)-train_n])
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=8, pin_memory=True)
test_loader  = DataLoader(test_ds, batch_size=16)

input_dim = all_data[0].x.size(1)
model     = PolyatomicNet(input_dim=input_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', patience=10, factor=0.5, verbose=True
)
loss_fn   = nn.SmoothL1Loss(beta=0.5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [25]:
history = {}
N = 40
for epoch in range(1, N):
    tr_loss = train(model, train_loader, optimizer, loss_fn)
    metrics = evaluate_with_ci(model, test_loader)
    print(f"Epoch {epoch:02d} | Train Loss: {tr_loss:.4f} | "
          f"Test MAE: {metrics['mae']:.4f} (95% CI [{metrics['mae_ci'][0]:.4f}, {metrics['mae_ci'][1]:.4f}]) | "
          f"Test RMSE: {metrics['rmse']:.4f} (95% CI [{metrics['rmse_ci'][0]:.4f}, {metrics['rmse_ci'][1]:.4f}])")
    history[epoch] = metrics
    scheduler.step(metrics['mae'])

Epoch 01 | Train Loss: 0.6222 | Test MAE: 0.7606 (95% CI [0.7189, 0.8018]) | Test RMSE: 0.9735 (95% CI [0.9211, 1.0230])
Epoch 02 | Train Loss: 0.6076 | Test MAE: 0.7551 (95% CI [0.7178, 0.7941]) | Test RMSE: 0.9431 (95% CI [0.8970, 0.9887])
Epoch 03 | Train Loss: 0.5994 | Test MAE: 0.7489 (95% CI [0.7114, 0.7874]) | Test RMSE: 0.9372 (95% CI [0.8905, 0.9832])
Epoch 04 | Train Loss: 0.5988 | Test MAE: 0.7475 (95% CI [0.7098, 0.7868]) | Test RMSE: 0.9405 (95% CI [0.8930, 0.9875])
Epoch 05 | Train Loss: 0.5968 | Test MAE: 0.7461 (95% CI [0.7079, 0.7855]) | Test RMSE: 0.9391 (95% CI [0.8917, 0.9861])
Epoch 06 | Train Loss: 0.5979 | Test MAE: 0.7514 (95% CI [0.7140, 0.7901]) | Test RMSE: 0.9399 (95% CI [0.8934, 0.9861])
Epoch 07 | Train Loss: 0.5990 | Test MAE: 0.7454 (95% CI [0.7072, 0.7848]) | Test RMSE: 0.9384 (95% CI [0.8907, 0.9852])
Epoch 08 | Train Loss: 0.5987 | Test MAE: 0.7483 (95% CI [0.7106, 0.7871]) | Test RMSE: 0.9383 (95% CI [0.8910, 0.9847])
Epoch 09 | Train Loss: 0.5980 | 

In [26]:
final = history[N-1]
print("*"*20)
print(f"Test MAE: {final['mae']:.4f} (95% CI [{final['mae_ci'][0]:.4f}, {final['mae_ci'][1]:.4f}])")
print(f"Test RMSE: {final['rmse']:.4f} (95% CI [{final['rmse_ci'][0]:.4f}, {final['rmse_ci'][1]:.4f}])")
print("*"*20)

********************
Test MAE: 0.7468 (95% CI [0.7089, 0.7849])
Test RMSE: 0.9352 (95% CI [0.8884, 0.9800])
********************
