In [9]:
!/usr/local/bin/python3.10 -m pip install pandas
!/usr/local/bin/python3.10 -m pip install scikit-learn
!/usr/local/bin/python3.10 -m pip install torch
!/usr/local/bin/python3.10 -m pip install transformers
!/usr/local/bin/python3.10 -m pipinstall numpy

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/35/d3/83a3e7144da980604a20e27b6f1e8a2164ab324310d69a82f2cff1da6326/scikit_learn-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata
  Downloading scikit_learn-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.5.0 from https://files.pythonhosted.org/packages/b1/64/67efd36ed232b9b107ad8435d0f0ebec28e5e6f782ededbd1ab4a37a0100/scipy-1.11.1-cp310-cp310-macosx_10_9_x86_64.whl.metadata
  Downloading scipy-1.11.1-cp310-cp310-macosx_10_9_x86_64.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting joblib>=1.1.1 (from scikit-learn)
  Obtaining dependency information for joblib>=1.1.1 from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf

In [10]:

import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import os

from data.HIV_data import load_HIV
from data.tox21_data import load_tox21

  from .autonotebook import tqdm as notebook_tqdm


In [11]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using device {device}")


using device cpu


In [12]:


labels, df = load_HIV()
print(labels)
print("Data Loaded and Preprocessing Finished")

num_classes = 2
num_labels = len(df.columns) - 1
max_sequence_length = df['smiles'].apply(len).max()
#print(vocab_size)
batch_size = 32
num_epochs = 1
learning_rate = 0.001


                                              smiles  HIV_active_false  \
0  CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...              True   
1  C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...              True   
2                   CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21              True   
3    Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1              True   
4                             O=S(=O)(O)CCS(=O)(=O)O              True   

   HIV_active_true  
0            False  
1            False  
2            False  
3            False  
4            False  
['HIV_active']
Data Loaded and Preprocessing Finished


In [13]:



train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print("Train Test Split Finished")

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = num_labels)
model.to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print("Model and Tokenizer Initialized")

# Tokenize and preprocess data
encoded_data_train = tokenizer.batch_encode_plus(
    train_df['smiles'].tolist(),
    add_special_tokens=True,
    padding=True,
    return_attention_mask=True,
    max_length=max_sequence_length,
    return_tensors='pt',
    truncation=True
)
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_df.drop(columns=['smiles']).values, dtype=torch.float)
print("Data tokenization and preprocessing complete")

# Create DataLoader
train_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
print("Train DataLoader created")

# Define optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

label_accuracies = {label: 0.0 for label in labels}
label_sample_counts = {label: 0 for label in labels}



Train Test Split Finished


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and Tokenizer Initialized
Data tokenization and preprocessing complete
Train DataLoader created


In [14]:

# Training loop
print("STARTING TRAINING")
model.train()
for epoch in range(num_epochs):
    total_samples = 0
    for batch_input_ids, batch_attention_masks, batch_labels in train_loader:
        optimizer.zero_grad()

        batch_input_ids = batch_input_ids.to(device)
        batch_attention_masks = batch_attention_masks.to(device)
        batch_labels = batch_labels.to(device)  

        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        logits = outputs.logits

        batch_loss = 0.0
        for logit_idx in range(len(logits)):
            logit_loss = 0.0
            for i in range(0, num_labels, num_classes):
                subarray_logits = logits[logit_idx]
                subarray_logits = subarray_logits[i: i + num_classes]

                best_class = np.argmax(subarray_logits.detach().numpy(), axis=-1)
                subarray_logits[best_class] = 1
                for arg in range(len(subarray_logits)):
                    if arg == best_class:
                        continue
                    subarray_logits[arg] = 0
                
                subarray_labels = batch_labels[logit_idx][i: i + num_classes]

                loss = torch.nn.functional.cross_entropy(subarray_logits, subarray_labels)
                logit_loss += loss

                #Calculating accuracy
                temp = int((i)/num_classes)
                label_sample_counts[labels[temp]] += 1.0
                if subarray_labels[best_class] == 1:
                    label_accuracies[labels[temp]] += 1.0
            batch_loss += logit_loss
        print(f"Batch Loss: {batch_loss}")

        for label in labels:
            print(f"Accuracy for {label}: {label_accuracies[label]/label_sample_counts[label]}")
        
        
        
        batch_loss.backward()
        optimizer.step()

        label_accuracies = {label: 0.0 for label in labels}
        label_sample_counts = {label: 0 for label in labels}

torch.save(model.state_dict(), os.path.join(os.getcwd(), 'models', 'tox21.pth'))


STARTING TRAINING
Batch Loss: 10.512185096740723
Accuracy for HIV_active: 0.65625
