In [8]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell executes, so edits to the project's helper modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Import necessary libraries
import os
import sys
from pathlib import Path
# Ensure the src directory is accessible so the project's helper modules can be
# imported by bare name. The membership guard keeps this cell idempotent:
# re-running it does not pile up duplicate './src' entries on sys.path.
if './src' not in sys.path:
    sys.path.append('./src')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Get data

In [2]:
# get_data is a project-local helper (presumably importable thanks to the
# './src' entry appended to sys.path earlier — confirm the module location).
from get_data import get_data
# Load the dataset already partitioned into splits. The result is indexed by
# split name ('train', 'valid', 'test') further down in this notebook, and each
# split is read through its 'Drug' and 'Y' columns.
data_split = get_data()

Found local copy...
Loading...
Done!
100%|██████████| 13445/13445 [00:05<00:00, 2300.59it/s]


Data loaded and split successfully.
Train data samples: 9411
Validation data samples: 1344
Test data samples: 2690


In [3]:
# Show which splits are available in the loaded data.
data_split.keys()

dict_keys(['train', 'valid', 'test'])

# LLM-based model

In [9]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, RobertaConfig, RobertaForSequenceClassification


from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


# Define a custom dataset class
class SMILESDataset(Dataset):
    """Map-style dataset that pairs SMILES strings with float labels.

    Tokenization happens lazily, one sample per ``__getitem__`` call.

    Args:
        smiles: Indexable collection of SMILES strings.
        labels: Indexable collection of labels, aligned with ``smiles``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``; it must
            return a mapping whose values support ``.squeeze(0)``.
        max_length: Sequence length forwarded to the tokenizer.
    """

    def __init__(self, smiles, labels, tokenizer, max_length=128):
        self.smiles, self.labels = smiles, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The dataset size is taken from the SMILES collection.
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the sample at position ``idx``.

        ``features`` is a dict with the tokenizer's keys; ``label`` is a
        scalar ``torch.float`` tensor.
        """
        text = self.smiles[idx]
        raw_label = self.labels[idx]
        tokens = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # squeeze(0) drops the leading size-1 dimension of each tokenizer
        # output so that samples can be batched by the DataLoader.
        features = {}
        for name, tensor in tokens.items():
            features[name] = tensor.squeeze(0)
        target = torch.tensor(raw_label, dtype=torch.float)
        return features, target

    
# get pretrained tokenizer
local_tokenizer_path = './huggingface/PubChem10M_SMILES_BPE_450k'  # Path where tokenizer files are stored
tokenizer = AutoTokenizer.from_pretrained(local_tokenizer_path)
# if online: tokenizer = AutoTokenizer.from_pretrained('seyonec/PubChem10M_SMILES_BPE_450k')


# Seed PyTorch's global RNG before any random work happens, so that the random
# weight initialisation below and the shuffling of the training loader are
# repeatable from one "Restart & Run All" to the next.
SEED = 42
torch.manual_seed(SEED)


# Load existing configuration and modify it
config_path = './huggingface/ChemBERTa-zinc-base-v1/config.json'
model_config = RobertaConfig.from_pretrained(config_path)
# NOTE(review): 7924 is presumably the vocabulary size of the tokenizer loaded
# above — confirm (it must cover every token id the tokenizer can emit).
model_config.vocab_size = 7924  # Updating the vocab size
model_config.num_labels = 1  # Single output logit, used for binary classification


# Build the model from the configuration only: the weights are freshly
# initialised here, no pretrained checkpoint is loaded.
model = RobertaForSequenceClassification(config=model_config)


# Prepare data: the 'Drug' column holds the SMILES strings and 'Y' the labels.
train_smiles = data_split['train']['Drug'].tolist()
train_labels = data_split['train']['Y'].tolist()
valid_smiles = data_split['valid']['Drug'].tolist()
valid_labels = data_split['valid']['Y'].tolist()

train_dataset = SMILESDataset(train_smiles, train_labels, tokenizer)
valid_dataset = SMILESDataset(valid_smiles, valid_labels, tokenizer)


# Data loaders: only the training data is shuffled; validation keeps its order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)


# Training: use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model.to(device)
# NOTE(review): this is the AdamW imported from transformers. Swapping it for
# torch.optim.AdamW would change defaults and the optimizer state-dict layout,
# so checkpoints saved with this optimizer would need care — confirm before changing.
optimizer = AdamW(model.parameters(), lr=5e-5)
# BCEWithLogitsLoss takes raw logits; the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()


def train_epoch(model, data_loader, optimizer, device, criterion):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(**inputs)``; its result must expose
            a ``.logits`` attribute.
        data_loader: Iterable of ``(inputs, labels)`` batches, where ``inputs``
            is a dict of tensors. Must support ``len()``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device the batch tensors are moved to.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []
    for batch_inputs, batch_labels in data_loader:
        batch_inputs = {name: tensor.to(device) for name, tensor in batch_inputs.items()}
        # Labels arrive as a 1-D batch; add a trailing dimension so they line
        # up with the logits handed to the criterion.
        targets = batch_labels.to(device).unsqueeze(1)
        logits = model(**batch_inputs).logits
        batch_loss = criterion(logits, targets)

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(data_loader)



In [39]:
from evaluate import eval

def eval_model(model, data_loader, device, verbose=False):
    """Score ``model`` on every batch of ``data_loader`` and evaluate the result.

    The model is switched to evaluation mode and run without gradient
    tracking. Logits are passed through a sigmoid and flattened into one list
    of scores; the batch labels are collected into a parallel list.

    Note that ``eval`` below is the function imported from the project's
    ``evaluate`` module (it shadows the builtin of the same name).

    Args:
        model: Module called as ``model(**inputs)``; its result must expose
            a ``.logits`` attribute.
        data_loader: Iterable of ``(inputs, labels)`` batches, where ``inputs``
            is a dict of tensors.
        device: Device the input tensors are moved to.
        verbose: Forwarded unchanged to ``eval``.

    Returns:
        Whatever ``eval(real_values, predictions, verbose=verbose)`` returns.
    """
    model.eval()
    y_true, y_score = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in data_loader:
            on_device = {name: tensor.to(device) for name, tensor in batch_inputs.items()}
            probabilities = torch.sigmoid(model(**on_device).logits)
            y_score += probabilities.flatten().tolist()
            y_true += batch_labels.tolist()

    return eval(y_true, y_score, verbose=verbose)
    


In [10]:
# Train the model
checkpoint_dir = './src/checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)
# First epoch index to run. Keep 0 for a fresh run; when resuming from a
# checkpoint, set it to the epoch after the restored one so that the epoch
# numbering continues and earlier checkpoint files are not overwritten.
epoch_start = 0
num_epochs = 100
for epoch in range(epoch_start, num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device, criterion)
    valid_auc = eval_model(model, valid_loader, device)
    print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Validation AUC: {valid_auc:.4f}')

    # Save the checkpoint every two epochs
    if epoch % 2 == 0:
        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pt')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss,
            # Stored as a plain Python float so the best epoch can later be
            # chosen from the checkpoint files themselves.
            'valid_auc': float(valid_auc),
        }, checkpoint_path)

Epoch 0, Train Loss: 0.7045, Validation AUC: 0.6686
Epoch 1, Train Loss: 0.6587, Validation AUC: 0.6814
Epoch 2, Train Loss: 0.6275, Validation AUC: 0.6957
Epoch 3, Train Loss: 0.6023, Validation AUC: 0.6974
Epoch 4, Train Loss: 0.5713, Validation AUC: 0.7160
Epoch 5, Train Loss: 0.5515, Validation AUC: 0.7141
Epoch 6, Train Loss: 0.5574, Validation AUC: 0.7082
Epoch 7, Train Loss: 0.5515, Validation AUC: 0.7004
Epoch 8, Train Loss: 0.5215, Validation AUC: 0.7176
Epoch 9, Train Loss: 0.4969, Validation AUC: 0.6972
Epoch 10, Train Loss: 0.4867, Validation AUC: 0.7060
Epoch 11, Train Loss: 0.4885, Validation AUC: 0.7150
Epoch 12, Train Loss: 0.4569, Validation AUC: 0.7243
Epoch 13, Train Loss: 0.4369, Validation AUC: 0.7203
Epoch 14, Train Loss: 0.4313, Validation AUC: 0.7155
Epoch 15, Train Loss: 0.4649, Validation AUC: 0.6933
Epoch 16, Train Loss: 0.4396, Validation AUC: 0.6959
Epoch 17, Train Loss: 0.5053, Validation AUC: 0.6674
Epoch 18, Train Loss: 0.5241, Validation AUC: 0.6733
Epo

KeyboardInterrupt: 

# Evaluation on Test Data

In [19]:
def load_checkpoint(model, optimizer, checkpoint_path):
    """Restore model and optimizer state from a checkpoint file.

    The file is expected to hold a dict with the keys ``'model_state_dict'``,
    ``'optimizer_state_dict'``, ``'epoch'`` and ``'loss'``. Both ``model`` and
    ``optimizer`` are updated in place.

    Args:
        model: Module whose parameters are overwritten.
        optimizer: Optimizer whose state is overwritten.
        checkpoint_path: Path of the checkpoint file to read.

    Returns:
        Tuple ``(model, optimizer, epoch, loss)`` where ``epoch`` and ``loss``
        are the values stored in the checkpoint.
    """
    # Map the stored tensors onto the device the model currently lives on, so a
    # checkpoint written on a GPU machine still loads on a CPU-only one.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    checkpoint = torch.load(checkpoint_path, map_location=target_device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    print(f'Loaded checkpoint from epoch {epoch} with loss {loss:.4f}')
    return model, optimizer, epoch, loss

# select the best model:
# NOTE(review): the epoch is hard-coded — confirm it was chosen from the
# validation metrics (not the test set) and that the corresponding file exists;
# the training loop only writes checkpoints for even epoch numbers.
epoch_n = 22
checkpoint_path = f'./src/checkpoints/checkpoint_epoch_{epoch_n}.pt'
# load_checkpoint updates `model` and `optimizer` in place and returns them;
# note that `train_loss` is rebound here to the loss stored in the checkpoint.
model, optimizer, start_epoch, train_loss = load_checkpoint(model, optimizer, checkpoint_path)


Loaded checkpoint from epoch 22 with loss 0.4981


In [46]:
# Build the held-out test set the same way as the train/validation sets:
# SMILES strings from the 'Drug' column, labels from the 'Y' column.
test_frame = data_split['test']
test_smiles, test_labels = test_frame['Drug'].tolist(), test_frame['Y'].tolist()
test_dataset = SMILESDataset(smiles=test_smiles, labels=test_labels, tokenizer=tokenizer)
# Keep the original order (no shuffling) while scoring.
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)
# verbose=True is forwarded to the evaluation helper.
test_performance = eval_model(model=model, data_loader=test_loader, device=device, verbose=True)

Accuracy: 0.7100371747211895
ROC AUC Score: 0.7865136360293936
Precision: 0.7118512780790085
Recall: 0.6925395629238885
F1 Score: 0.7020626432391139
Matthews Correlation Coefficient: 0.4199131877537424
Specificity (Negative Prediction Accuracy): 0.727072633895818
