In [1]:
#Multi-task Learning

In [1]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# Set random seed
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

# Load multi-task dataset
file_path = 'cti.csv'
multi_task_df = pd.read_csv(file_path)

# Check and process null values
multi_task_df.iloc[:, 1:] = multi_task_df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')  # Convert non-numeric data to NaN
labels = multi_task_df.iloc[:, 1:].values

# Create label masks, marking positions of non-null values
label_masks = ~pd.isna(labels)
labels = np.where(pd.isna(labels), -1, labels)  # Fill NaN values with -1
labels = labels.astype(int)  # Convert to integer type for classification tasks
labels = torch.tensor(labels, dtype=torch.long)  # Use long type as classification tasks typically use integer labels
label_masks = torch.tensor(label_masks, dtype=torch.float)

# Specify local model path
model_path = "DeepChem/ChemBERTa-77M-MTR"

# Load tokenizer and model configuration from local path
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
config.num_labels = labels.shape[1]  # Dynamically determine the number of labels
config.output_hidden_states = True
config.output_attentions = True  # Ensure attention weights are output

# Load model with updated configuration from local path
model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)

print("Tokenizer and model loaded successfully.")

# Tokenize input data
inputs = tokenizer(list(multi_task_df.iloc[:, 0]), padding=True, truncation=True, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = labels.to(device)
label_masks = label_masks.to(device)

# Split inputs into two parts
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Split into training and test sets
train_input_ids, test_input_ids, train_attention_mask, test_attention_mask, train_labels, test_labels, train_label_masks, test_label_masks = train_test_split(
    input_ids, attention_mask, labels, label_masks, test_size=0.2, random_state=seed
)

# Recombine into training and test set inputs
train_inputs = {'input_ids': train_input_ids, 'attention_mask': train_attention_mask}
test_inputs = {'input_ids': test_input_ids, 'attention_mask': test_attention_mask}

# Create dataset class
class MultiTaskDataset(Dataset):
    def __init__(self, inputs, labels, label_masks):
        self.inputs = inputs
        self.labels = labels
        self.label_masks = label_masks

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.inputs.items()}
        item['labels'] = self.labels[idx]
        item['label_masks'] = self.label_masks[idx]
        return item

# Create dataset objects for training and test sets
train_dataset = MultiTaskDataset(train_inputs, train_labels, train_label_masks)
test_dataset = MultiTaskDataset(test_inputs, test_labels, test_label_masks)

# Create data loaders
batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.to(device)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Use Cosine Annealing learning rate scheduler
scheduler = CosineAnnealingLR(optimizer, T_max=32)

# Improved Focal Loss implementation
def focal_loss_with_dynamic_alpha(outputs, labels, label_masks, gamma=2):
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='none')
    active_loss = labels != -1
    active_loss = active_loss & (label_masks > 0)  # Apply mask

    # Calculate the number of positive and negative samples
    num_positive = torch.sum(labels[active_loss] == 1, dim=0).float()
    num_negative = torch.sum(labels[active_loss] == 0, dim=0).float()

    # Dynamically calculate α value, higher α for fewer positive samples
    alpha = num_negative / (num_positive + num_negative + 1e-8)
    
    # Dynamic sample-level adjustment of alpha
    alpha_factor = labels[active_loss].float() * alpha + (1 - labels[active_loss].float()) * (1 - alpha)

    # Calculate basic cross-entropy loss
    losses = loss_fn(outputs[active_loss], labels[active_loss].float())

    # Calculate prediction probability p_t
    probas = torch.sigmoid(outputs[active_loss])

    # Adjust probability based on true labels
    pt = probas * labels[active_loss].float() + (1 - probas) * (1 - labels[active_loss].float())

    # Calculate Focal Loss term
    focal_weight = (1 - pt) ** gamma

    # Apply dynamically adjusted α value and focal weight
    focal_loss = alpha_factor * focal_weight * losses

    # Apply mask
    masked_losses = focal_loss * label_masks[active_loss]

    # Return average loss
    return masked_losses.sum() / label_masks.sum()

# Evaluation function, calculate loss, accuracy and AUC
def evaluate_model(model, dataloader):
    model.eval()
    total_loss = 0
    all_labels = []
    all_preds = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            label_masks = batch['label_masks']

            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
            loss = focal_loss_with_dynamic_alpha(outputs, labels, label_masks)
            total_loss += loss.item()
            
            preds = torch.sigmoid(outputs).cpu().numpy()  # Get prediction probabilities
            all_preds.append(preds)
            all_labels.append(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    
    binary_preds = (all_preds > 0.5).astype(int)
    accuracy = accuracy_score(all_labels[all_labels != -1], binary_preds[all_labels != -1])

    # Calculate AUC-ROC
    auc = roc_auc_score(all_labels[all_labels != -1], all_preds[all_labels != -1])

    return avg_loss, accuracy, auc

# Train model and evaluate
num_epochs = 100
for epoch in range(num_epochs):
    # Training step
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        label_masks = batch['label_masks']
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss = focal_loss_with_dynamic_alpha(outputs, labels, label_masks)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss}")
    
    # Validation step
    avg_val_loss, val_accuracy, val_auc = evaluate_model(model, test_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss}, Accuracy: {val_accuracy}, AUC: {val_auc}")

# Save model
model.save_pretrained("saved_model")
tokenizer.save_pretrained("saved_model")

# Get chemical molecule representations and predicted values
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.hidden_states[-1][:, 0, :]  # Get representation of [CLS] token
    predictions = torch.sigmoid(outputs.logits)  # Get prediction probability for each label

# Concatenate representations and predicted values
enhanced_embeddings = torch.cat((embeddings, predictions), dim=1)

print("Enhanced embeddings and predictions obtained successfully.")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer and model loaded successfully.
Epoch 1/100, Training Loss: 0.0452306866645813
Epoch 1/100, Validation Loss: 0.04944154433906078, Accuracy: 0.5821389195148843, AUC: 0.6111940590109183
Epoch 2/100, Training Loss: 0.046448223758488894
Epoch 2/100, Validation Loss: 0.04912199266254902, Accuracy: 0.607497243660419, AUC: 0.6551910646602873
Epoch 3/100, Training Loss: 0.045555304270237684
Epoch 3/100, Validation Loss: 0.04879484139382839, Accuracy: 0.6218302094818081, AUC: 0.6917124765295148
Epoch 4/100, Training Loss: 0.04427354782819748
Epoch 4/100, Validation Loss: 0.04845667444169521, Accuracy: 0.6306504961411246, AUC: 0.7229896616753968
Epoch 5/100, Training Loss: 0.045422644820064306
Epoch 5/100, Validation Loss: 0.048111945390701294, Accuracy: 0.643329658213892, AUC: 0.7464128460932568
Epoch 6/100, Training Loss: 0.044879944529384375
Epoch 6/100, Validation Loss: 0.047745537012815475, Accuracy: 0.6615214994487321, AUC: 0.7629259905932968
Epoch 7/100, Training Loss: 0.04411932