In [1]:
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
df = pd.read_csv("../../data/HIV.csv")
print(f"size: {df.shape[0]}")
df.head(1)

size: 41127


Unnamed: 0,smiles,activity,HIV_active
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0


In [3]:
from torch.utils.data import random_split

[0.8 *len(df)], [0.1 *len(df)], [0.1 *len(df)]

([32901.6], [4112.7], [4112.7])

In [4]:
np.savetxt("../../data/HIV_smiles.npy", np.array(df.smiles), fmt='%s')
np.savetxt("../../data/HIV_labels.npy", np.array(df.HIV_active), fmt='%s')

In [5]:
def tokenizer(smiles):
    ## https://github.com/topazape/LSTM_Chem/blob/master/lstm_chem/utils/smiles_tokenizer2.py

    atoms = [
        'Al', 'As', 'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'K', 'Li', 'N',
        'Na', 'O', 'P', 'S', 'Se', 'Si', 'Te'
    ]
    special = [
        '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
        '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
    ]
    padding = ['<eos>', '<sos>', '<pad>']

    vocab = sorted(atoms, key=len, reverse=True) + special + padding

    table_2_chars = list(filter(lambda x: len(x) == 2, vocab))
    table_1_chars = list(filter(lambda x: len(x) == 1, vocab))

    smiles = smiles + ' '
    N = len(smiles)
    token = []
    i = 0
    while (i < N):
        c1 = smiles[i]
        c2 = smiles[i:i + 2]

        if c2 in table_2_chars:
            token.append(c2)
            i += 2
            continue

        if c1 in table_1_chars:
            token.append(c1)
            i += 1
            continue

        i += 1
    
    vocab_dict = {}
    for i, tok in enumerate(vocab):
        vocab_dict[tok] = i

    encoded = [vocab_dict[t] for t in token]
    return encoded

In [6]:
if __name__ == "__main__":
    input_ids = []
    for line in tqdm(df.smiles):
        ids = tokenizer(line)
        input_ids.append(ids)

  0%|          | 0/41127 [00:00<?, ?it/s]

In [7]:
len(input_ids)

41127

In [8]:
## dirs
smiles_dir = "../../data/HIV_smiles.npy"
labels_dir = "../../data/HIV_labels.npy"

In [9]:
class CustomDataset(Dataset):
    def __init__(self, data, label, tokenizer):
        self.data = data
        self.label = label
        self.tokenizer = tokenizer
        
        self.data = np.loadtxt(data, dtype=object)    
        self.label = np.loadtxt(label, dtype=float)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.label[idx]
        input_id = tokenizer(text)
        return torch.tensor(input_id, dtype=torch.long), torch.tensor(label, dtype=torch.float)

In [10]:
if __name__ == "__main__":
    dataset = CustomDataset(smiles_dir, labels_dir, tokenizer)

In [11]:
from torch.utils.data import random_split

[0.8 *len(dataset)], [0.1 *len(dataset)], [0.1 *len(dataset)]

([32901.6], [4112.7], [4112.7])

In [12]:
train_dataset, test_dataset, val_dataset = random_split(dataset, [32903, 4112, 4112]) # 80, 10, 10
# len(train_dataset + test_dataset + val_dataset)

In [13]:
for smiles, labels in train_dataset:
    print("Input ID:\n " ,smiles)
    print("Label:\n" ,labels)
    break

Input ID:
  tensor([10, 18, 40, 27, 41, 40, 19, 15, 20, 40, 19, 15, 10, 28, 16, 10, 19, 10,
        16, 20, 10, 19, 16, 20, 10, 28, 16, 20, 40, 19, 16, 20, 41, 27])
Label:
 tensor(1.)


In [14]:
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for data, label in batch:
        ## smiles (copy smiles without...)
        processed_text = data.clone().detach() 
        text_list.append(processed_text)
        ## label
        label_list.append(label)
        ## length
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)

    lengths = torch.tensor(lengths)
        ## padding
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
                       
    return padded_text_list, label_list, lengths

In [15]:
collate_batch(val_dataset)

(tensor([[10, 16, 40,  ...,  0,  0,  0],
         [15,  0,  0,  ...,  0,  0,  0],
         [16, 23, 18,  ...,  0,  0,  0],
         ...,
         [10, 16, 40,  ...,  0,  0,  0],
         [10, 16, 40,  ...,  0,  0,  0],
         [10, 10, 10,  ...,  0,  0,  0]]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 tensor([74,  1, 51,  ..., 49, 54, 35]))

In [16]:
train_dataloader = DataLoader(train_dataset, batch_size=4,shuffle=False, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=4,shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=4,shuffle=False, collate_fn=collate_batch)

In [17]:
text_batch, label_batch, length_batch = next(iter(train_dataloader))
print(text_batch)

tensor([[10, 18, 40, 27, 41, 40, 19, 15, 20, 40, 19, 15, 10, 28, 16, 10, 19, 10,
         16, 20, 10, 19, 16, 20, 10, 28, 16, 20, 40, 19, 16, 20, 41, 27,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [10, 10, 16, 10, 19, 23, 16, 20, 10, 19, 23, 10, 15, 10, 19, 23, 18, 20,
         15, 40, 27, 40, 40, 19, 10, 20, 40, 40, 40, 27, 10, 20, 10, 19, 23, 16,
         20, 16, 10, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [10, 16, 10, 19, 23, 16, 20, 10, 19, 10, 20, 15, 10, 19, 23, 16, 20, 10,
         19, 10, 10, 18, 10, 20, 15, 10, 19, 23, 16, 20, 10, 19, 10, 40, 27, 40,
         40, 40, 40, 40, 27, 20, 15, 10, 19, 23, 16, 20, 16, 10, 19, 10, 20, 19,
         10, 20, 10,  0,  0,  0,  0,  0,  0,  0],
        [10, 10, 19, 23, 16, 20, 16, 40, 27, 40, 40, 28, 40, 19, 40, 40, 27, 16,
         10, 19, 10, 20, 23, 16, 20, 10,

In [18]:
print(label_batch)

tensor([1., 0., 0., 0.])


In [19]:
print(length_batch)

tensor([34, 40, 57, 64])


In [20]:
print(text_batch.shape)

torch.Size([4, 64])


In [21]:
## building an rnn model
## many to one classificaton 

In [26]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size,
                 num_layers, fc_hidden_size, output_size):
        super(). __init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, input_dim (embedding))
        # batch_dim = number of samples per batch
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, num_layers, batch_first=True) 
        # only use the hidden_size, since its a many to one
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)  
        
        self.relu = nn.ReLU()
        # output_size, use 1 for binary classification
        self.fc2 = nn.Linear(fc_hidden_size, output_size) 
        
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        # text dim: [sentence length, batch size]
        # text_length = [batch size]
        
        # [sentence len, batch size] => [sentence len, batch size, embedding size]
        out = self.embedding(text)
        
        # pack sequence to avoid using <paddings> during computations (saves computations)
        # lengths needs to be on cpu
        out = nn.utils.rnn.pack_padded_sequence(out, text_lengths.cpu().numpy(),
                                                enforce_sorted=False, batch_first=True)
        # =>  [batch_size, sentence len,  embedding dim]
        
        # Propagate input through LSTM
        out, (hidden, cell) = self.rnn(out) ## lstm with input, hidden, and internal (cell) state
        # out dim: [batch size, sentence length, hidden dim]
        # cell dim: [num layers, batch size, hidden dim]
        # hidden dim: [num_layers, batch_size, hidden dim]
        
        # use final hidden state from the last layer as an input to fc1
        # Index hidden state of last time step
        # hidden.size() --> [num_layers, batch_size, hidden dim]
        out = hidden[-1, :, :]  
        # new out.size() --> batch_size, hidden dim
        out = self.fc1(out) ## first dense layer
        out = self.relu(out)
        out = self.fc2(out) ## final layer
        out = self.sigmoid(out) 
        return out

In [27]:
vocab_size = 60 ## len of vocab size
embed_dim = 70 ## input size, usually around 50-500 dimensions
num_layers = 1 ## number of recurrent layers, 2 would mean stacking 2 layers to form stacked LSTM
rnn_hidden_size = 100 ## usually around 100-500 dimensions
fc_hidden_size = 100 ## usually around 100-500 dimensions
output_size = 1 ## since the output is between 0 and 1, thus 1 dimension

In [28]:
## Now we will instantiate the class LSTM1 object.

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size,
            num_layers, fc_hidden_size, output_size)

In [29]:
model

RNN(
  (embedding): Embedding(60, 70, padding_idx=0)
  (rnn): LSTM(70, 100, batch_first=True)
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [30]:
def train(dataloader):
    # model training mode (gradient computation)
    model.train()
    # initailiz acc, and loss at zero 
    total_acc, total_loss = 0, 0
    for idx, (text, label, length) in enumerate (dataloader):
        # reset gradients to zero before each instance
        optimizer.zero_grad()
        # label predictions (forward papagation)
        # squeeze(1) => drop superficial one dimensional from a tensor
        predicted_label = model(text, length).squeeze(1) # or [:,0]
        # loss calculation
        loss = loss_fn(predicted_label, label)
        # compute gradients (backward propagation) 
        # to minimize loss functions with gradient descent
        loss.backward()
        # update parameters based on the computed gradients
        optimizer.step()
        ## logging
        if not idx % 1500:
            print('| Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f'
                  %(epoch +1, num_epochs, idx, len(dataloader), loss))
        # compute total accuracy
        # if pred label is >=0.5 to probability of true truth accuracy count increases
        # summation of the largest value which yields predicted class label
        total_acc += ((predicted_label >= 0.5).float() == label).float().sum().item()
        # compute total loss after back prop and parameter update
        total_loss += loss.item() * label.size(0)
    # compare true labels with the predicted labels to compute accuracy
    return total_acc/len(dataloader.dataset), \
            total_loss/len(dataloader.dataset)

In [31]:
def evaluate(dataloader):
    # model evaluation mode (no gradient computation)
    model.eval()
    # initailize acc, and loss at zero 
    total_acc, total_loss = 0, 0
    # disabling gradient calculation
    with torch.no_grad():
        for text, label, length in dataloader:
            # label predictions (forward papagation)
            # squeeze(1) => drop superficial one dimensional from a tensor
            predicted_label = model(text, length).squeeze(1) # reshape
            # loss calculation
            loss = loss_fn(predicted_label, label)
            # compute total accuracy
            # if pred label is >=0.5 to probability of true truth accuracy count increases
            # summation of the largest value which yields predicted class label
            total_acc += ((predicted_label >= 0.5).float() == label).float().sum().item()
            # compute total loss after back prop and parameter update
            total_loss += loss.item() * label.size(0)
            # compare true labels with the predicted labels to compute accuracy
        # compare true labels with the predicted labels to compute accuracy
        return total_acc/len(dataloader.dataset), \
                total_loss/len(dataloader.dataset)

In [32]:
# for binnary classification we use
# single class membership probability output (binary cross entropy loss)
loss_fn = nn.BCELoss()
# Adam Optimizer to update parameters based on the computed gradients
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [34]:
torch.manual_seed(1)
num_epochs = 3

start_time = time.time()
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dataloader)
    acc_val , loss_val = evaluate(val_dataloader)
    print('| Epoch: %03d/%03d | Train Acc: %.2f%% | Valid. Acc: %.2f%%'
          %(epoch + 1, num_epochs, 100 * acc_train, 100 * acc_val))
    print(f'| Time elapsed: {(time.time() - start_time) / 60:.2f} min')

| Epoch: 001/003 | Batch 000/8226 | Loss: 0.0043
| Epoch: 001/003 | Batch 1500/8226 | Loss: 0.0303
| Epoch: 001/003 | Batch 3000/8226 | Loss: 0.0505
| Epoch: 001/003 | Batch 4500/8226 | Loss: 0.0167
| Epoch: 001/003 | Batch 6000/8226 | Loss: 0.0315
| Epoch: 001/003 | Batch 7500/8226 | Loss: 0.0291
| Epoch: 001/003 | Train Acc: 96.52% | Valid. Acc: 96.64%
| Time elapsed: 4.88 min
| Epoch: 002/003 | Batch 000/8226 | Loss: 0.0369
| Epoch: 002/003 | Batch 1500/8226 | Loss: 0.0259
| Epoch: 002/003 | Batch 3000/8226 | Loss: 0.0477
| Epoch: 002/003 | Batch 4500/8226 | Loss: 0.0265
| Epoch: 002/003 | Batch 6000/8226 | Loss: 0.0268
| Epoch: 002/003 | Batch 7500/8226 | Loss: 0.0392
| Epoch: 002/003 | Train Acc: 96.53% | Valid. Acc: 96.64%
| Time elapsed: 9.58 min
| Epoch: 003/003 | Batch 000/8226 | Loss: 0.0536
| Epoch: 003/003 | Batch 1500/8226 | Loss: 0.0228
| Epoch: 003/003 | Batch 3000/8226 | Loss: 0.0311
| Epoch: 003/003 | Batch 4500/8226 | Loss: 0.0146
| Epoch: 003/003 | Batch 6000/8226 | 

In [35]:
acc_test, _ = evaluate(test_dataloader)
print('Test Acc: %.2f'
      %(100 * acc_test))

Test Acc: 96.06
