In [145]:
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import OrderedDict

In [146]:
cuda_id = torch.cuda.current_device()
print(f"Name of CUDA device:\
      {torch.cuda.get_device_name(cuda_id)}")

Name of CUDA device:      NVIDIA GeForce GT 740M


In [147]:
device = torch.device("cpu" if torch.cuda.is_available() else "cuda")

In [188]:
device

device(type='cpu')

In [459]:
df = pd.read_csv("../../data/her_molecules.csv")
print(f"size: {df.shape[0]}")
df.head()

size: 2593


Unnamed: 0,name,smiles,IC50,units,activity,pIC50
0,CHEMBL477,Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,1600000.0,nM,low,15.20412
1,CHEMBL433520,CCN(CC)CC(O)CNc1cc2c(Nc3cccc(Br)c3)ncnc2cn1,1100000.0,nM,low,15.041393
2,CHEMBL440298,N#C/C(=C\c1ccc(O)c(O)c1)C(=O)NCCCCc1ccccc1,501187.23,nM,low,14.7
3,CHEMBL162034,CN(C)CCNc1cc2c(Nc3cccc(Br)c3)ncnc2cn1,379000.0,nM,low,14.578639
4,CHEMBL56319,CCOC(=O)C(Cc1ccccc1)NC(=O)/C(C#N)=C/c1ccc(O)c(...,331131.12,nM,low,14.52


In [151]:
df.activity.unique()

array(['low', 'medium', 'high'], dtype=object)

In [152]:
df.activity.value_counts()

low       1097
high      1048
medium     448
Name: activity, dtype: int64

In [153]:
class CustomDataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]

In [154]:
if __name__ == "__main__":
    dataset = CustomDataset(df.smiles, df.activity)

In [155]:
from torch.utils.data import random_split

[0.8 *len(dataset)], [0.1 *len(dataset)], [0.1 *len(dataset)]

([2074.4], [259.3], [259.3])

In [156]:
train_dataset, test_dataset, val_dataset = random_split(dataset, [2075, 259, 259]) # 80, 10, 10
len(train_dataset + test_dataset + val_dataset)

2593

In [157]:
for smiles, labels in train_dataset:
    print("Input ID:\n " ,smiles)
    print("Label:\n" ,labels)
    break

Input ID:
  CO/N=C/c1c(N)ncnc1Nc1cccc(Br)c1
Label:
 medium


In [158]:
## https://github.com/topazape/LSTM_Chem/blob/master/lstm_chem/utils/smiles_tokenizer2.py

class SmilesTokenizer(object):
    def __init__(self):
        atoms = [
            'Al', 'As', 'B', 'Br', 'C', 'Cl', 'F', 'H', 'I', 'K', 'Li', 'N',
            'Na', 'O', 'P', 'S', 'Se', 'Si', 'Te'
        ]
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
            '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]

        self.table = sorted(atoms, key=len, reverse=True) + special 

        self.table_2_chars = list(filter(lambda x: len(x) == 2, self.table))
        self.table_1_chars = list(filter(lambda x: len(x) == 1, self.table))
        self.vocab_dict = {}

    def tokenize(self, smiles):
        smiles = smiles + ' '
        N = len(smiles)
        token = []
        i = 0
        while (i < N):
            c1 = smiles[i]
            c2 = smiles[i:i + 2]

            if c2 in self.table_2_chars:
                token.append(c2)
                i += 2
                continue

            if c1 in self.table_1_chars:
                token.append(c1)
                i += 1
                continue

            i += 1

        return np.asarray(token, dtype=object)
        
    def vocaburaly(self):
        vocab_dict = {}
        for i, tok in enumerate(self.table):
            vocab_dict[tok] = i
        return vocab_dict
    
    def index_encode(self, tokenized_smiles):
        vocab_dict = {}
        for i, tok in enumerate(self.table):
            vocab_dict[tok] = i
        encoded = [vocab_dict[t] for t in tokenized_smiles ]
        return encoded

In [159]:
tokenizer = SmilesTokenizer()
tokens = [tokenizer.tokenize(x) for x in df.smiles]
vocabulary = tokenizer.vocaburaly()
indexed_smiles = [tokenizer.index_encode(x) for x in tokens]

In [160]:
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(vocabulary.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab1 = vocab(ordered_dict)
vocab1.insert_token("<pad>", 0)
vocab1.insert_token("<unk>", 1)
vocab1.set_default_index(1)

In [161]:
sorted_by_freq_tuples[:7]

[('s', 43), ('o', 42), ('n', 41), ('c', 40), ('te', 39), ('se', 38), ('-', 37)]

In [162]:
vocab1.get_itos()[:10]

['<pad>', '<unk>', 's', 'o', 'n', 'c', 'te', 'se', '-', '+']

In [163]:
print(f"Vocabulary size: {len(vocab1)}")

Vocabulary size: 45


In [164]:
num_class = len(set([label for (text, label) in train_dataset]))
print(f"Number of classes: {num_class}")

Number of classes: 3


In [165]:
text_pipeline = lambda x: [vocab1[token] for token in tokenizer.tokenize(x)]
text_pipeline("O=S(=O)")

[29, 22, 27, 26, 22, 29, 25]

In [166]:
label_pipeline = lambda x: 0 if x == "low" else (1 if x == "medium" else 2)
label_pipeline("low")

0

In [215]:
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for   _text, _label in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text),
                                      dtype=torch.long)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list,
                                 dtype=torch.long)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list,
                                                     batch_first=True)  
    return padded_text_list, label_list

In [238]:
collate_batch(val_dataset)

(tensor([[35, 22, 35,  ...,  0,  0,  0],
         [35,  5, 18,  ...,  0,  0,  0],
         [35, 35, 29,  ...,  0,  0,  0],
         ...,
         [30, 21, 35,  ...,  0,  0,  0],
         [ 5, 18,  5,  ...,  0,  0,  0],
         [35, 29,  5,  ...,  0,  0,  0]]),
 tensor([2, 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 0, 0, 2, 2, 0, 2, 2, 2, 0, 1, 2, 2,
         0, 0, 2, 0, 1, 0, 0, 1, 2, 0, 2, 2, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 2, 2,
         2, 0, 2, 2, 2, 0, 2, 1, 0, 2, 1, 0, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 2, 2,
         0, 2, 2, 2, 0, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 1, 1, 0, 0, 1, 2, 1, 1, 0,
         0, 2, 0, 2, 1, 0, 2, 0, 2, 2, 0, 1, 1, 0, 0, 2, 2, 2, 0, 1, 1, 0, 1, 1,
         0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 1, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 1, 1, 2,
         0, 0, 0, 1, 0, 2, 0, 2, 1, 1, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2,
         2, 1, 0, 2, 0, 2, 2, 0, 2, 1, 2, 2, 2, 2, 2, 2, 1, 0, 0, 0, 1, 0, 0, 2,
         2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 0, 0,
         

In [169]:
train_dataloader = DataLoader(train_dataset,batch_size=20, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset,batch_size=20, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=20, shuffle=True, collate_fn=collate_batch)

In [240]:
text_batch, label_batch = next(iter(test_dataloader))
print(text_batch[:2])
print(label_batch)
print(text_batch.shape)

tensor([[35, 24, 35, 33, 23, 26, 30,  5, 18,  4,  5,  4,  5, 17, 24,  4, 33, 23,
          5, 26,  8,  5, 16,  5,  5,  5, 26, 30, 35, 26, 22, 29, 25, 35, 35, 35,
         35, 35, 35, 35, 26, 22, 29, 25, 30, 29, 25,  5,  5, 16, 25,  5,  5, 18,
         17, 25,  5, 18,  5,  5,  5,  5,  5, 18,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [29, 22, 35, 26, 30,  5, 18,  5,  5,  5, 26, 43, 25,  5,  5, 18, 25,  5,
         18,  5,  5, 26, 43, 25,  5,  5, 26, 43, 25,  5, 18, 29,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
tensor([2, 0, 0, 2, 2, 0, 2, 2, 0, 0, 2, 1, 2, 2, 1, 2, 0, 2, 0, 

In [171]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # in_channels; num of channels, we use 1 since its text only
        # out_channels; output feature map
        # kernel_size; the size of filters i.e. [n-grams size x emb_dim]
        # if 3 filters are provided, we get 3 convolution layers
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (filt, embedding_dim)) 
                                    for filt in filter_sizes
                                    ])
        
        # dropout for regularization
        self.dropout = nn.Dropout(dropout)
        
        # if we have 3 filter sizes, we get e linear layers
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        # text = [batch size, sent len]
    def forward(self, text):
        
        # rearange the original tensor to 1 dimensional tensor
        # text dim ==> # text = [batch size, sent len]
#         text = text.permute(1,0)
        
        # add embedding dimension to text
        # [batch size, sent len] ==> [batch size, sent len, emb dim]
        embedded = self.embedding(text)
        
        # add 1 dimensional 
        # [batch size, sent len, emb dim]==> [batch size, 1, sent len, emb dim]
        embedded = embedded.unsqueeze(1)
        
        # application of Relu activation function
        # foward pass of embedd text to convolution layers
        # squeeze() to drop superficial 3 dimensional from a tensor
        # conved dim ==> [batch size, n_filters, sent len * len(filter_sizes)]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        
        # max pooling to decrease num of features (sub-sampling)
        # squeeze() to drop superficial 2 dimensional from a tensor
        # pooled dim ==> [batch size, n_filters * len(filter_sizes)]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        # concatenate feature outputs
        # cat dim ==> [batch size, n_filters * len(filter_sizes)]
        cat = torch.cat(pooled, dim = 1)
        
        # dropout on concatenated output filters
        # out dim ==> [batch size, n_filters * len(filter_sizes)]
        out = self.dropout(cat)
        
        # pass filter outputs to linear layer for prediction
        # pred dim ==> [batch size, n_filters * len(filter_sizes)]
        pred = self.fc(out)
        return pred

#### Here, I'll instantiate the network. First up, defining the hyperparameters.
    vocab_size: Size of our vocabulary or the range of values for our input, word tokens.
    output_size: Size of our desired output; the number of class scores we want to output.
    embedding_dim: Number of columns in the embedding lookup table; size of our embeddings.
    num_filters: Number of filters that each convolutional layer produces as output.
    filter_sizes: A list of kernel sizes; one convolutional layer will be created for each kernel size.


In [172]:
INPUT_DIM = 60 ## len of vocab size
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = len(df.activity.unique()) ## num_classes 3
DROPOUT = 0.5
PAD_IDX = 0

torch.manual_seed(1)
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [173]:
model

CNN(
  (embedding): Embedding(60, 100, padding_idx=0)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=3, bias=True)
)

In [179]:
def train(dataloader):
    # model training mode (gradient computation)
    model.train()
    # initailiz acc, and loss at zero 
    total_acc, total_loss = 0, 0
    for idx, (text, label) in enumerate (dataloader):
        text = text.to(device)
        label = label.to(device)
        # reset gradients to zero before each instance
        optimizer.zero_grad()
        # label predictions (forward papagation)
        # squeeze(1) => drop superficial one dimensional from a tensor
        predicted_label = model(text).squeeze(1) # or [:,0]
        # loss calculation
        loss = loss_fn(predicted_label, label)
        # compute gradients (backward propagation) 
        # to minimize loss functions with gradient descent
        loss.backward()
        # update parameters based on the computed gradients
        optimizer.step()
        # logging
        if not idx % 50:
            print(f"Epoch: {epoch + 1:04d}/{num_epochs:0d} | "
                  f"Batch {idx:03d}/{len(dataloader):03d} | "
                  f"Loss: {loss:.4f}")
        # compute total accuracy
        # return an indice of the max value of all elements
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        # compute total loss after back prop and parameter update
        total_loss += loss.item() * label.size(0)
    # compare true labels with the predicted labels to compute accuracy
    return total_acc/len(dataloader.dataset), \
            total_loss/len(dataloader.dataset)

In [180]:
def evaluate(dataloader):
    # model evaluation mode (no gradient computation)
    model.eval()
    # initailize acc, and loss at zero 
    total_acc, total_loss = 0, 0
    # disabling gradient calculation
    with torch.no_grad():
        for text, label in dataloader:
            text = text.to(device)
            label = label.to(device)
            # label predictions (forward papagation)
            # squeeze(1) => drop superficial one dimensional from a tensor
            predicted_label = model(text).squeeze(1) # reshape
            # loss calculation
            loss = loss_fn(predicted_label, label)
            # compute total accuracy
            # return an indice of the max value of all elements
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            # compute total loss after back prop and parameter update
            total_loss += loss.item() * label.size(0)
        # compare true labels with the predicted labels to compute accuracy
        return total_acc/len(dataloader.dataset), \
                total_loss/len(dataloader.dataset)

In [181]:
# for multiclass classification we use
loss_fn = nn.CrossEntropyLoss()
# Adam Optimizer to update parameters based on the computed gradients
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [182]:
model = model.to(device)
loss_fn = loss_fn.to(device)

In [183]:
torch.manual_seed(1)
num_epochs = 20

start_time = time.time()
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dataloader)
    acc_val , loss_val = evaluate(val_dataloader)
    print(f"Train Acc.: {100 * acc_train:.2f}%"
          f"\nValid Acc.: {100 * acc_val:.2f}%")
    print(f'Time elapsed: {(time.time() - start_time) / 60:.2f} min')

Epoch: 0001/20 | Batch 000/104 | Loss: 0.3934
Epoch: 0001/20 | Batch 050/104 | Loss: 0.4900
Epoch: 0001/20 | Batch 100/104 | Loss: 0.6955
Train Acc.: 72.53%
Valid Acc.: 73.36%
Time elapsed: 0.06 min
Epoch: 0002/20 | Batch 000/104 | Loss: 0.5976
Epoch: 0002/20 | Batch 050/104 | Loss: 0.4894
Epoch: 0002/20 | Batch 100/104 | Loss: 0.7770
Train Acc.: 74.22%
Valid Acc.: 73.36%
Time elapsed: 0.12 min
Epoch: 0003/20 | Batch 000/104 | Loss: 0.6237
Epoch: 0003/20 | Batch 050/104 | Loss: 0.7085
Epoch: 0003/20 | Batch 100/104 | Loss: 0.4727
Train Acc.: 73.49%
Valid Acc.: 69.11%
Time elapsed: 0.20 min
Epoch: 0004/20 | Batch 000/104 | Loss: 0.5131
Epoch: 0004/20 | Batch 050/104 | Loss: 0.6451
Epoch: 0004/20 | Batch 100/104 | Loss: 0.6127
Train Acc.: 74.46%
Valid Acc.: 71.04%
Time elapsed: 0.26 min
Epoch: 0005/20 | Batch 000/104 | Loss: 0.4203
Epoch: 0005/20 | Batch 050/104 | Loss: 0.2641
Epoch: 0005/20 | Batch 100/104 | Loss: 0.5239
Train Acc.: 74.07%
Valid Acc.: 68.73%
Time elapsed: 0.33 min
Epoch

In [184]:
acc_test, _ = evaluate(test_dataloader)
print(f"Test Acc.: {100 * acc_test:.2f}%")

Test Acc.: 76.06%


In [None]:
# The only difference here is that instead of using a sigmoid function to squash the input between 0 and 1,
# we use the argmax to get the highest predicted class index. 
# or we use sofmax activation function and squash the input to range 0 and 1 and sum them to 1
# in this case we get both the class label pred and the predicted probability

In [454]:
## https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb
## https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/2_lstm.ipynb

sentiment_label = {0: "low",
                   1: "medium",
                   2: "high"}

def predict(text, text_pipeline):
    # evaluation mode (no gradients)
    with torch.no_grad():
        # tokenize and index the tokens
        processed_text = torch.tensor(text_pipeline(text))
        # add a batch dimension
        processed_text = processed_text.unsqueeze(0).to(device)
        # prediction
        prediction = model(processed_text) # logits
        # reduction real numbers to values between 0 and 1 and sum to 1
        probability = torch.softmax(prediction, dim=1)
        # get the max value of all elements in the input tensor
        predicted_probability, predicted_class, = torch.max(probability, dim=1)
        # convert tensor holding a single value into an integer
        return predicted_class.item(), predicted_probability.item()

In [456]:
text = "Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O"
pred_class, pred_proba = predict(text, text_pipeline)

print(f'Predicted Class: {pred_class} = {sentiment_label[pred_class]}')
print(f'Probability: {pred_proba}')

Predicted Class: 0 = low
Probability: 0.8340631723403931


In [458]:

text = "O=C(C#CCCN1CCOCC1)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncn"
pred_class, pred_proba = predict(text, text_pipeline)

print(f'Predicted Class: {pred_class} = {sentiment_label[pred_class]}')
print(f'Probability: {pred_proba}')

Predicted Class: 2 = high
Probability: 0.7394805550575256
