In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Import necessary libraries
import os
import sys
from pathlib import Path
# Ensure the src directory is accessible
sys.path.append('./src')

# Get data

In [2]:
# Load the datasets through the project's `get_data` helper.
# NOTE(review): the module is presumably resolved via the './src' entry appended
# to sys.path in the first cell -- confirm.
# NOTE(review): later cells index the result as data_sets['du'][split][column]
# with splits 'train'/'valid' and columns 'Drug'/'Y', so it is presumably a
# nested mapping of DataFrame-like tables -- confirm against get_data itself.
from get_data import get_data
data_sets = get_data() 

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
100%|██████████| 306893/306893 [01:43<00:00, 2962.10it/s]


# LLM-based model

In [3]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, RobertaConfig, RobertaForSequenceClassification


from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


# Define a custom dataset class
class SMILESDataset(Dataset):
    """Map-style dataset that tokenizes SMILES strings on access.

    Args:
        smiles: Indexable sequence of SMILES strings.
        labels: Indexable sequence of targets, aligned with ``smiles``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=..., padding='max_length', truncation=True)``
            that returns a mapping of tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, smiles, labels, tokenizer, max_length=128):
        self.smiles, self.labels = smiles, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the SMILES sequence."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for sample ``idx``.

        ``features`` maps every key produced by the tokenizer to its tensor with
        the leading dimension squeezed away; ``target`` is the label as a
        float tensor.
        """
        text, raw_target = self.smiles[idx], self.labels[idx]
        tokenized = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output carries a leading dimension of size one for the
        # single input string; drop it so the DataLoader can stack samples.
        features = {}
        for name, tensor in tokenized.items():
            features[name] = tensor.squeeze(0)
        target = torch.tensor(raw_target, dtype=torch.float)
        return features, target

    
# --- Tunable settings for this cell -----------------------------------------
SEED = 42             # seeds torch's RNG (weight init + DataLoader shuffling)
VOCAB_SIZE = 7924     # embedding size written into the model config below
BATCH_SIZE = 16
LEARNING_RATE = 5e-5

# Seed before the model is constructed and before the shuffling DataLoader is
# created so that a fresh "Restart & Run All" reproduces the same run.
torch.manual_seed(SEED)


# Load the tokenizer from a local directory of tokenizer files.
local_tokenizer_path = './huggingface/PubChem10M_SMILES_BPE_450k'  # Path where tokenizer files are stored
tokenizer = AutoTokenizer.from_pretrained(local_tokenizer_path)
# If online, the hub id can be used instead:
#   tokenizer = AutoTokenizer.from_pretrained('seyonec/PubChem10M_SMILES_BPE_450k')


# Load an existing configuration file and adapt it to this task.
config_path = './huggingface/ChemBERTa-zinc-base-v1/config.json'
model_config = RobertaConfig.from_pretrained(config_path)
# NOTE(review): VOCAB_SIZE is presumably the vocabulary size of the tokenizer
# loaded above -- confirm (e.g. against len(tokenizer)).
model_config.vocab_size = VOCAB_SIZE
# A single output logit; it is paired with BCEWithLogitsLoss below for binary
# classification.
model_config.num_labels = 1


# Build the model from the updated configuration.
# NOTE(review): constructing from a config (rather than from_pretrained on a
# weights directory) gives freshly initialised weights, i.e. training starts
# from scratch. The logged run shows a flat train loss and validation AUC
# around 0.5 -- confirm that training from scratch is intended.
model = RobertaForSequenceClassification(config=model_config)


# Prepare data: SMILES strings come from the 'Drug' column, targets from 'Y'.
train_smiles = data_sets['du']['train']['Drug'].tolist()
train_labels = data_sets['du']['train']['Y'].tolist()
valid_smiles = data_sets['du']['valid']['Drug'].tolist()
valid_labels = data_sets['du']['valid']['Y'].tolist()

train_dataset = SMILESDataset(train_smiles, train_labels, tokenizer)
valid_dataset = SMILESDataset(valid_smiles, valid_labels, tokenizer)


# Data loaders: only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)


# Training setup: use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()


def train_epoch(model, data_loader, optimizer, device, criterion):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(**inputs)`` whose result exposes ``.logits``.
        data_loader: Sized iterable yielding ``(inputs, labels)`` where ``inputs``
            is a mapping of tensors and ``labels`` is a tensor.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batch tensors are moved to.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        The sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []
    for batch, targets in data_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # Add a trailing dimension to the labels before they are compared
        # with the logits.
        targets = targets.to(device).unsqueeze(1)
        logits = model(**batch).logits
        batch_loss = criterion(logits, targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(data_loader)



In [5]:
from evaluate import eval

def eval_model(model, data_loader, device, verbose=False):
    """Score ``model`` on ``data_loader`` and hand the results to ``eval``.

    The model is switched to evaluation mode and run without gradient
    tracking. Sigmoid-transformed logits and the labels of all batches are
    collected into two flat lists.

    Args:
        model: Module called as ``model(**inputs)`` whose result exposes ``.logits``.
        data_loader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device the input tensors are moved to.
        verbose: Forwarded unchanged to ``eval``.

    Returns:
        Whatever ``eval(real_values, predictions, verbose=verbose)`` returns;
        ``eval`` here is the name imported from the project's ``evaluate``
        module, not the Python builtin.
    """
    model.eval()
    scores, targets = [], []
    with torch.no_grad():
        for batch, labels in data_loader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**on_device).logits
            scores += torch.sigmoid(logits).flatten().tolist()
            targets += labels.tolist()

    return eval(targets, scores, verbose=verbose)
    


In [None]:
# Train the model
checkpoint_dir = './src/checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)
epoch_start = 0        # first epoch index; raise it when resuming a run
num_epochs = 100       # exclusive upper bound of the epoch index
checkpoint_every = 1   # write a checkpoint every N epochs
for epoch in range(epoch_start, num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device, criterion)
    # NOTE(review): the result is indexed with 'roc_auc', so eval_model is
    # presumed to return a mapping with that key -- confirm against the
    # project's evaluate module.
    valid_metrics = eval_model(model, valid_loader, device)
    print(f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Validation AUC: {valid_metrics['roc_auc']:.4f}")

    # Save model and optimizer state every `checkpoint_every` epochs
    # (currently every epoch).
    if epoch % checkpoint_every == 0:
        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pt')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss,
        }, checkpoint_path)

Epoch 1, Train Loss: 0.1836, Validation AUC: 0.6143
Epoch 2, Train Loss: 0.1835, Validation AUC: 0.5809
Epoch 3, Train Loss: 0.1834, Validation AUC: 0.4781
Epoch 4, Train Loss: 0.1834, Validation AUC: 0.4689
Epoch 5, Train Loss: 0.1833, Validation AUC: 0.4249
Epoch 6, Train Loss: 0.1833, Validation AUC: 0.3361
Epoch 7, Train Loss: 0.1832, Validation AUC: 0.4912
Epoch 8, Train Loss: 0.1833, Validation AUC: 0.4917
Epoch 9, Train Loss: 0.1832, Validation AUC: 0.5010
Epoch 10, Train Loss: 0.1832, Validation AUC: 0.5421
Epoch 11, Train Loss: 0.1831, Validation AUC: 0.4102
Epoch 12, Train Loss: 0.1832, Validation AUC: 0.4956
Epoch 13, Train Loss: 0.1831, Validation AUC: 0.5271
Epoch 14, Train Loss: 0.1831, Validation AUC: 0.3513
Epoch 15, Train Loss: 0.1832, Validation AUC: 0.5000
Epoch 16, Train Loss: 0.1831, Validation AUC: 0.5458
Epoch 17, Train Loss: 0.1832, Validation AUC: 0.4449
Epoch 18, Train Loss: 0.1832, Validation AUC: 0.5000
Epoch 19, Train Loss: 0.1832, Validation AUC: 0.4973
Ep