In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell executes (edits to local .py files take effect
# without restarting the kernel).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# LLM-based model

In [4]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from tdc.single_pred import Tox
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Load dataset
# Request the "hERG_Karim" dataset from TDC's single-prediction toxicity group.
data = Tox(name="hERG_Karim")
# The returned frame is read further down through its 'Drug' and 'Y' columns
# (presumably SMILES strings and binary labels -- TODO confirm against TDC docs).
df = data.get_data()

# Define a custom dataset class
class SMILESDataset(Dataset):
    """Map-style dataset that tokenizes one SMILES string per access.

    Args:
        smiles: Iterable of SMILES strings (list, tuple, NumPy array,
            pandas Series, ...).
        labels: Iterable of numeric labels aligned one-to-one with ``smiles``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)`` that
            returns a mapping of tensors with a leading batch dimension of 1.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``smiles`` and ``labels`` have different lengths.
    """

    def __init__(self, smiles, labels, tokenizer, max_length=128):
        # Materialise both inputs as plain lists so __getitem__ always indexes
        # by position. A pandas Series (e.g. a slice produced by
        # train_test_split) keeps its original index labels, and `series[idx]`
        # would look up by label and raise KeyError.
        self.smiles = list(smiles)
        self.labels = list(labels)
        if len(self.smiles) != len(self.labels):
            raise ValueError(
                f"smiles and labels must have the same length, "
                f"got {len(self.smiles)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of molecules in the dataset."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return ``(encoded_input, label)`` for the sample at position ``idx``.

        ``encoded_input`` is a dict of tensors with the tokenizer's batch
        dimension removed; ``label`` is a scalar ``torch.float`` tensor.
        """
        smile = self.smiles[idx]
        label = self.labels[idx]
        encoded = self.tokenizer(
            smile,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer returns tensors shaped (1, max_length); drop the batch
        # axis so the DataLoader's default collate can stack samples itself.
        encoded_input = {key: val.squeeze(0) for key, val in encoded.items()}
        return encoded_input, torch.tensor(label, dtype=torch.float)

    
"""    
# Load model and tokenizer from local paths; 
local_model_path = './huggingface/ChemBERTa-zinc-base-v1'  # Path where pytorch_model.bin and config.json are stored
model = AutoModelForSequenceClassification.from_pretrained(local_model_path, num_labels=1) 
# or online: model = AutoModelForSequenceClassification.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')
"""


# Load the SMILES tokenizer from a local directory.
local_tokenizer_path = './huggingface/PubChem10M_SMILES_BPE_450k'  # Path where tokenizer files are stored
tokenizer = AutoTokenizer.from_pretrained(local_tokenizer_path)
# or online: tokenizer = AutoTokenizer.from_pretrained('seyonec/PubChem10M_SMILES_BPE_450k')



from transformers import RobertaConfig, RobertaForSequenceClassification

# Load existing configuration and modify it
config_path = './huggingface/ChemBERTa-zinc-base-v1/config.json'
model_config = RobertaConfig.from_pretrained(config_path)
# Size the embedding table from the tokenizer that is actually loaded rather
# than a hard-coded number, so every token id it can emit (including any added
# special tokens) has an embedding row even if the tokenizer is swapped.
model_config.vocab_size = len(tokenizer)
# A single output logit; the notebook applies BCEWithLogitsLoss / sigmoid to it.
model_config.num_labels = 1

# Build the model from the configuration only.
# NOTE: constructing from a config creates freshly (randomly) initialised
# weights -- no pretrained ChemBERTa checkpoint is loaded here.
model = RobertaForSequenceClassification(config=model_config)




# Pull the model inputs (SMILES strings) and targets out of the frame.
smiles, labels = df['Drug'].tolist(), df['Y'].tolist()

# Hold out 10% of the molecules for validation; the seed keeps the split fixed.
train_smiles, val_smiles, train_labels, val_labels = train_test_split(
    smiles,
    labels,
    test_size=0.1,
    random_state=42,
)

# Wrap each split so samples are tokenized lazily on access.
train_dataset = SMILESDataset(train_smiles, train_labels, tokenizer)
val_dataset = SMILESDataset(val_smiles, val_labels, tokenizer)


# Batch the splits: training batches are reshuffled every epoch,
# validation batches keep a stable order.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
)


# Training
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")  # uncomment to force CPU execution

model.to(device)
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are spelled out because torch's defaults (1e-8, 0.01)
# differ from the transformers defaults (1e-6, 0.0) this notebook relied on.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# The model emits one raw logit per molecule; the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()


def train_epoch(model, data_loader, optimizer, device, criterion):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(**inputs)`` whose result exposes ``.logits``.
        data_loader: Iterable of ``(inputs_dict, labels)`` batches that supports ``len()``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device every batch tensor is moved to.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch_inputs, batch_labels in data_loader:
        on_device = {name: tensor.to(device) for name, tensor in batch_inputs.items()}
        # Labels arrive as (batch,); add a trailing axis to line up with (batch, 1) logits.
        targets = batch_labels.to(device).unsqueeze(1)

        logits = model(**on_device).logits
        batch_loss = criterion(logits, targets)

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    n_batches = len(data_loader)
    return running_loss / n_batches



def eval_model(model, data_loader, device):
    """Score ``model`` on ``data_loader`` and return the ROC-AUC.

    Args:
        model: Module called as ``model(**inputs)`` whose result exposes ``.logits``.
        data_loader: Iterable of ``(inputs_dict, labels)`` batches.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        ``roc_auc_score`` of the collected labels against the sigmoid of the logits.
    """
    model.eval()
    scores, targets = [], []
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch_inputs, batch_labels in data_loader:
            on_device = {name: tensor.to(device) for name, tensor in batch_inputs.items()}
            logits = model(**on_device).logits
            # Squash logits to probabilities and flatten to one score per sample.
            probabilities = torch.sigmoid(logits).flatten()
            scores.extend(probabilities.tolist())
            targets.extend(batch_labels.tolist())
    return roc_auc_score(targets, scores)



Found local copy...
Loading...
Done!


In [3]:
# Train the model
# Each epoch performs one optimisation pass over train_loader and then
# measures ROC-AUC on val_loader.
# NOTE(review): nothing is checkpointed and there is no early stopping here,
# so the model left in memory is the one from the final epoch.
num_epochs = 30
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device, criterion)
    val_auc = eval_model(model, val_loader, device)
    # epoch is 0-based; report it 1-based.
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val AUC: {val_auc:.4f}')


Epoch 1, Train Loss: 0.5799, Val AUC: 0.7816
Epoch 2, Train Loss: 0.5507, Val AUC: 0.7922
Epoch 3, Train Loss: 0.5265, Val AUC: 0.7896
Epoch 4, Train Loss: 0.4985, Val AUC: 0.8095
Epoch 9, Train Loss: 0.3870, Val AUC: 0.8307
Epoch 10, Train Loss: 0.3588, Val AUC: 0.8304
Epoch 11, Train Loss: 0.3360, Val AUC: 0.8326
Epoch 12, Train Loss: 0.3195, Val AUC: 0.8301
Epoch 13, Train Loss: 0.2934, Val AUC: 0.8262
Epoch 14, Train Loss: 0.2779, Val AUC: 0.8308
Epoch 15, Train Loss: 0.2622, Val AUC: 0.8315
Epoch 16, Train Loss: 0.2430, Val AUC: 0.8353
Epoch 17, Train Loss: 0.2284, Val AUC: 0.8294
Epoch 18, Train Loss: 0.2289, Val AUC: 0.8301
Epoch 19, Train Loss: 0.2714, Val AUC: 0.8283
Epoch 20, Train Loss: 0.2347, Val AUC: 0.8331
Epoch 21, Train Loss: 0.2291, Val AUC: 0.8328
Epoch 22, Train Loss: 0.2186, Val AUC: 0.8299
Epoch 23, Train Loss: 0.1849, Val AUC: 0.8384
Epoch 24, Train Loss: 0.1856, Val AUC: 0.8406
Epoch 25, Train Loss: 0.1710, Val AUC: 0.8298
Epoch 26, Train Loss: 0.1561, Val AUC: 