In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
from rdkit import Chem

## Transformer components

In [None]:
# transformer class components source: https://www.datacamp.com/tutorial/building-a-transformer-with-py-torch

In [23]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each passed through their own linear layer,
    split into ``num_heads`` heads of width ``d_model // num_heads``, attended
    per head, merged back to ``d_model`` and sent through an output linear
    layer.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Each head receives an equal slice of the model dimension.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # width of a single head

        # Projections for queries, keys, values and the merged output.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return softmax(Q K^T / sqrt(d_k)) V, optionally masking scores.

        Positions where ``mask == 0`` get a score of -1e9 before the softmax.
        """
        scale = math.sqrt(self.d_k)
        scores = (Q @ K.transpose(-2, -1)) / scale

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = scores.softmax(dim=-1)
        return weights @ V

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        n_batch, n_tokens, _ = x.size()
        per_head = x.view(n_batch, n_tokens, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Reshape (batch, num_heads, seq, d_k) back into (batch, seq, d_model)."""
        n_batch, _, n_tokens, _ = x.size()
        merged = x.transpose(1, 2).contiguous()
        return merged.view(n_batch, n_tokens, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head and project the merged result."""
        q_heads = self.split_heads(self.W_q(Q))
        k_heads = self.split_heads(self.W_k(K))
        v_heads = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

        return self.W_o(self.combine_heads(context))

In [24]:
class PositionWiseFeedForward(nn.Module):
    """Feed-forward sub-layer: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model).

    The linear layers act on the last dimension of the input, so every
    position is transformed independently.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expand to the inner width
        self.fc2 = nn.Linear(d_ff, d_model)   # project back to the model width
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply fc1, ReLU and fc2 in sequence."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [25]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of sequences.

    A table of shape (1, max_seq_length, d_model) is precomputed and stored as
    the buffer ``pe``: even feature columns hold sines, odd feature columns
    hold cosines of ``position * div_term``.

    Args:
        d_model: Size of the feature dimension (even or odd).
        max_seq_length: Number of positions for which encodings are computed.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # One frequency per even column: ceil(d_model / 2) entries.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        # There are only floor(d_model / 2) odd columns, so for an odd d_model
        # the last frequency has no cosine partner and must be dropped.
        # (Slicing with (d_model + 1) // 2 kept it and raised a shape error.)
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])

        # Buffer: follows the module across devices, is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

In [26]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads (must divide ``d_model``).
        d_ff: Inner width of the feed-forward sub-layer.
        dropout: Dropout probability applied to both sub-layer outputs.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``.

        Args:
            x: Input tensor; used as query, key and value of the self-attention.
            mask: Optional attention mask forwarded to the attention module,
                which blocks positions where ``mask == 0``. ``None`` (the
                default) attends everywhere, matching the previous behaviour.

        Returns:
            Tensor produced by the second layer normalisation.
        """
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

## SMILES Autoencoder

In [8]:
class bidirectional_GRU_AE(nn.Module):
    """Bidirectional GRU autoencoder skeleton for integer-encoded sequences.

    ``forward`` only runs the embedding and the encoder GRU; the decoder GRU
    and the ``fc`` head are constructed here but are not called by ``forward``.
    """

    def __init__(self, input_size, hidden_size, num_layers=3, embedding_dim=10):
        super().__init__()

        # Lookup table turning integer indices into dense float vectors.
        self.embedding = nn.Embedding(input_size, embedding_dim)
        # Empty list; no method of this class reads or writes it.
        self.encoder_ouput = []

        # A bidirectional GRU concatenates forward and backward states, so its
        # output feature size is twice the hidden size.
        bidirectional_width = hidden_size * 2

        # batch_first=True: tensors are laid out as (batch, seq_len, features).
        self.encoder = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )

        # The decoder consumes vectors as wide as the encoder output.
        self.decoder = nn.GRU(
            input_size=bidirectional_width,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )

        # Output head: linear map onto ``input_size`` values followed by Tanh.
        self.fc = nn.Sequential(
            nn.Linear(bidirectional_width, input_size),
            nn.Tanh(),
        )

    def prob_to_char_out(self, output):
        """Return the index of the largest value along dimension 2."""
        return output.max(dim=2).indices

    def forward(self, x):
        """Embed the integer indices and return the encoder GRU output.

        The result has shape (batch, seq_len, hidden_size * 2); the final
        hidden state returned by the GRU is discarded.
        """
        embedded = self.embedding(x)
        encoder_output, _ = self.encoder(embedded)
        return encoder_output

In [9]:
# Read the dataset and pull out the SMILES and spectrum columns.
# NOTE: binding the frame to `data` shadows the `torch.utils.data` alias
# imported as `data` in the first cell.
data = pd.read_csv('dataset/filtered_gc_spec.csv')
output_SMILES = pd.Series(data["SMILES"])
input_MS = pd.Series(data["Spectrum"])

# Keep only the SMILES strings shorter than 77 characters.
smiles_is_short = output_SMILES.str.len() < 77
output_SMILES_filtered = output_SMILES[smiles_is_short]

# Keep the spectra that share an index label with a retained SMILES string.
input_MS_filtered = input_MS.loc[output_SMILES_filtered.index]

In [11]:
import random

# Fixed length every encoded SMILES is padded to.
max_length = 77
# Index reserved for padding. Real characters are numbered from PAD_INDEX + 1,
# so a padded position can never be mistaken for a SMILES character.
PAD_INDEX = 0

# Space-padded copy of the SMILES strings. Nothing below consumes it; it is
# kept only so the name stays defined for any other cell that may use it.
output_smiles_filtered = output_SMILES_filtered.apply(lambda x: x.ljust(max_length, ' '))

# Split every (unpadded) SMILES string into single-character tokens.
output_smiles_tokens = output_SMILES_filtered.apply(list)
flattened_token_list = [item for sublist in output_smiles_tokens for item in sublist]
unique_characters = set(flattened_token_list)

# Mapping from characters to integers. Iterating a set of strings can yield a
# different order in every Python process, so the characters are sorted to get
# the same mapping on every kernel restart.
char_to_int = {
    char: i for i, char in enumerate(sorted(unique_characters), start=PAD_INDEX + 1)
}

# Encode every SMILES and right-pad it with PAD_INDEX up to max_length.
int_list_smiles = [
    [char_to_int[char] for char in sublist] + [PAD_INDEX] * (max_length - len(sublist))
    for sublist in output_smiles_tokens
]

# Take a small fraction of the total data for testing purposes. The inner
# lists are copied so that shuffling below does not also reorder the entries
# of int_list_smiles.
smiles_list = [list(entry) for entry in int_list_smiles[0:5]]

# Shuffle the tokens of each sequence with a private, seeded generator so the
# result is reproducible and the global `random` state is left untouched.
# NOTE(review): this permutes individual characters (padding included), which
# is not the same as generating an alternative SMILES for the same molecule.
# Confirm this is the intended augmentation.
shuffle_rng = random.Random(0)
for sub_list in smiles_list:
    shuffle_rng.shuffle(sub_list)

# Input to the SMILES autoencoder: integer indices, as an embedding lookup expects.
smiles_tensor = torch.tensor(smiles_list, dtype=torch.long)

In [14]:
# Build the bidirectional GRU AE and encode the SMILES subset.
# (No optimisation step is performed in this cell.)

# Hyperparameters
seq_len = 77
input_size = 64     # number of embedding rows; every token index must be < input_size
embed_dim = 10
hidden_dim = 10
num_layers = 3
batch_size = 1
learning_rate = 0.0001
#dropout = 0.1


# instantiate model
model = bidirectional_GRU_AE(input_size, hidden_dim, num_layers, embed_dim)

# Integer indices for the embedding lookup. Named `smiles_indices` rather than
# `input` so the builtin `input()` is not shadowed.
smiles_indices = smiles_tensor.long()

# Fail early with a readable message instead of an index error inside the
# embedding layer when the hard-coded vocabulary size is too small.
if smiles_indices.numel() > 0:
    lowest_index = int(smiles_indices.min())
    highest_index = int(smiles_indices.max())
    if lowest_index < 0 or highest_index >= input_size:
        raise ValueError(
            f"token indices span [{lowest_index}, {highest_index}] "
            f"but input_size is {input_size}"
        )

# Call the module itself rather than .forward() so registered hooks also run.
smiles_encoder_output = model(smiles_indices)


In [45]:
# Display the shape of the SMILES encoder output tensor.
smiles_encoder_output.shape

torch.Size([5, 77, 20])

## Prepare MS Data

In [17]:
# Re-read the dataset; this repeats the load and filter steps of the SMILES
# section above.
# NOTE: binding the frame to `data` shadows the `torch.utils.data` alias
# imported as `data` in the first cell.
data = pd.read_csv('dataset/filtered_gc_spec.csv')
output_SMILES = pd.Series(data["SMILES"])
input_MS = pd.Series(data["Spectrum"])

# Keep only the SMILES strings shorter than 77 characters.
short_smiles_mask = output_SMILES.str.len() < 77
output_SMILES_filtered = output_SMILES[short_smiles_mask]

# Keep the spectra that share an index label with a retained SMILES string.
input_MS_filtered = input_MS.loc[output_SMILES_filtered.index]

In [18]:
# Token used to right-pad every spectrum to a common length.
# NOTE(review): if a spectrum can contain a literal '0' token, padding and
# data share one vocabulary index. Confirm against the dataset.
pad_token = '0'

# Tokenize each spectrum on whitespace. Tokens are lower-cased here, once, so
# the vocabulary and the sequences are built from exactly the same strings
# (lower-casing only the vocabulary makes the lookup below raise KeyError for
# any token containing an upper-case character).
tokenized_input_MS = [
    [peak.lower() for peak in spec.split()]
    for spec in input_MS_filtered
]

# Flat list of all tokens of all spectra, plus the padding token.
tokens = [peak for spec in tokenized_input_MS for peak in spec]
tokens.append(pad_token)

# Create a set of unique tokens
unique_tokens = set(tokens)

# Vocabulary size is the number of unique tokens
vocab_size = len(unique_tokens)

# Mapping from tokens to integers. The tokens are sorted because iterating a
# set of strings can yield a different order in every Python process.
token_mapping = {token: i for i, token in enumerate(sorted(unique_tokens))}

# Number of tokens in each spectrum before padding.
spectra_lengths = [len(spec) for spec in tokenized_input_MS]

# Pad every tokenized spectrum to the length of the longest one. A list of pad
# tokens is used so this keeps working if pad_token is longer than one character.
padded_length = max(spectra_lengths)
for spec in tokenized_input_MS:
    spec.extend([pad_token] * (padded_length - len(spec)))

# Convert the tokenized spectra into lists of integer indices.
input_token_mapping = [
    [token_mapping[token] for token in sublist]
    for sublist in tokenized_input_MS]

In [19]:
# Work with the first five encoded spectra only.
test_list = input_token_mapping[:5]

# Integer index tensor built from the nested lists.
spec_tensor = torch.tensor(test_list)

# One-hot encode every index over the whole vocabulary and cast to float32.
spec_embeddings = torch.nn.functional.one_hot(
    spec_tensor, num_classes=vocab_size
).to(torch.float32)

In [20]:
# Display the shape of the one-hot spectrum tensor.
spec_embeddings.shape

torch.Size([5, 524, 506844])

In [21]:
import torch.nn.functional as F

# Step 1: shrink the sequence axis (524 -> 77). adaptive_avg_pool1d pools over
# the last axis, so the sequence axis is moved last, pooled, and moved back.
pooled_sequence_last = F.adaptive_avg_pool1d(
    spec_embeddings.transpose(1, 2),  # (5, 506844, 524)
    output_size=77,
)
spec_embeddings_reduced = pooled_sequence_last.transpose(1, 2)  # (5, 77, 506844)

# Step 2: shrink the last axis (506844 -> 20) with the same pooling.
spec_final_embeddings = F.adaptive_avg_pool1d(spec_embeddings_reduced, output_size=20)  # (5, 77, 20)
print(f'Spec embedding shape: {spec_final_embeddings.shape}')

Spec embedding shape: torch.Size([5, 77, 20])


## Train Transformer Encoder

In [38]:
class Transformer(nn.Module):
    """Encoder-only transformer: positional encoding, dropout and a stack of
    ``EncoderLayer`` blocks.

    Args:
        d_model: Feature size of the embedded input.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        d_ff: Inner width of each feed-forward sub-layer.
        max_seq_length: Number of positions covered by the positional encoding.
        dropout: Dropout probability.
        encoder_embeddings: Either an ``nn.Module`` that embeds ``src`` or a
            tensor of already-embedded inputs used when ``forward`` is called
            without ``src``.
    """

    def __init__(self, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, encoder_embeddings):
        super(Transformer, self).__init__()
        self.encoder_embedding = encoder_embeddings
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.dropout = nn.Dropout(dropout)

    def forward(self, src=None):
        """Encode ``src``.

        Previously ``src`` was ignored and the tensor captured at construction
        time was always encoded, so the model could never see another batch.

        Args:
            src: Input batch. It is passed through ``encoder_embedding`` when
                that is an ``nn.Module``; otherwise it is treated as already
                embedded. If ``None``, the stored embedding tensor is used.

        Returns:
            Output of the last encoder layer.
        """
        embedding = self.encoder_embedding
        if isinstance(embedding, nn.Module):
            embedded = embedding(src)
        elif src is None:
            embedded = embedding
        else:
            embedded = src

        src_embedded = self.dropout(self.positional_encoding(embedded))
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output)

        return enc_output

In [39]:
# Encoder hyperparameters.
d_model = 20            # feature size; 20 is the embedding size used in the SMILES AE
num_heads = 2           # must divide d_model
d_ff = 2048             # inner width of the feed-forward network (from original paper)
num_layers = 6          # number of encoder layers (from original paper)
max_seq_length = 77     # maximum sequence length for positional encoding
dropout = 0.1

# Number of unique tokens in the spectra data.
src_vocab_size = vocab_size
# The SMILES token mapping had 44 unique tokens (tgt_vocab_size); it is not set here.

transformer = Transformer(
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    max_seq_length=max_seq_length,
    dropout=dropout,
    encoder_embeddings=spec_final_embeddings,
)

In [51]:
def RMSE(y_pred, y):
    """Root-mean-square error between ``y_pred`` and ``y`` over all elements."""
    squared_error = (y_pred - y).pow(2)
    return squared_error.mean().sqrt()


# Loss function and optimiser for the encoder stack.
criterion = RMSE
optimizer = optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

# Training mode, so the dropout layers are active.
transformer.train()

# NOTE(review): the target passed to the criterion is the same tensor that is
# fed to the model, so the loss measures how closely the encoder reproduces
# its own input. Confirm this is the intended objective.
losses = []
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()

    output = transformer(spec_final_embeddings)
    loss = criterion(output.contiguous(), spec_final_embeddings)

    loss.backward()
    optimizer.step()

    # Record and report the loss computed before this update step.
    losses.append(loss.item())
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")


Epoch: 1, Loss: 0.9561964869499207
Epoch: 2, Loss: 0.9560401439666748
Epoch: 3, Loss: 0.9558040499687195
Epoch: 4, Loss: 0.9556138515472412
Epoch: 5, Loss: 0.955420196056366
Epoch: 6, Loss: 0.955214262008667
Epoch: 7, Loss: 0.9550080299377441
Epoch: 8, Loss: 0.9548062086105347
Epoch: 9, Loss: 0.9546112418174744
Epoch: 10, Loss: 0.9544100761413574
Epoch: 11, Loss: 0.9542122483253479
Epoch: 12, Loss: 0.9540103077888489
Epoch: 13, Loss: 0.9538094997406006
Epoch: 14, Loss: 0.953611433506012
Epoch: 15, Loss: 0.9534114599227905
Epoch: 16, Loss: 0.9532116055488586
Epoch: 17, Loss: 0.953013002872467
Epoch: 18, Loss: 0.9528116583824158
Epoch: 19, Loss: 0.9526110887527466
Epoch: 20, Loss: 0.9524118304252625
Epoch: 21, Loss: 0.9522145986557007
Epoch: 22, Loss: 0.9520140290260315
Epoch: 23, Loss: 0.9518147706985474
Epoch: 24, Loss: 0.9516147375106812
Epoch: 25, Loss: 0.9514146447181702
Epoch: 26, Loss: 0.9512162804603577
Epoch: 27, Loss: 0.9510171413421631
Epoch: 28, Loss: 0.9508178234100342
Epoch