## Spectra Encoder: Transformer

Primary reference: https://www.nature.com/articles/s42004-023-00932-3#Sec19

Using this paper as a framework, the purpose of this transformer is to take input GC-MS spectral data and output embeddings to be passed to the SMILES decoder. The reference used images of GC-MS data and implemented a CNN; we intend to use a transformer instead.  

#### Supplemental references:
https://jalammar.github.io/illustrated-transformer/ (Illustrated overview of Transformer function)

https://nlp.seas.harvard.edu/2018/04/03/attention.html (Harvard coding annotation of original Transformation paper)

https://www.datacamp.com/tutorial/building-a-transformer-with-py-torch (Datacamp Transformer tutorial)

Notebook overview:
1. Define model building blocks
2. Encoding
3. Decoding
4. Training
5. Evaluation


## Preparing the input data

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import pandas as pd
import numpy as np
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
import csv

In [2]:
# finding unique characters in the SMILES column of training data 

unique_characters = set() 

with open('dataset/filtered_gc_spec.csv', 'r') as f:
    reader = csv.DictReader(f)  
    for row in reader:
        for char in row["SMILES"]:
            unique_characters.add(char)  # Add each character to the set

print(len(unique_characters))  


45


In [3]:
# finding unique tuples in the spectral training data
unique_tuples = set()  

with open('dataset/filtered_gc_spec.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        spectrum_data = row["Spectrum"]
        tuples = spectrum_data.split() 
        for tup in tuples:
            if ':' in tup and tup.count(':') == 1: 
                unique_tuples.add(tup)

print(len(unique_tuples))  


517627


So we need to go from a "vocabulary" of 517627 unique tuples to 45 unique characters

In [4]:
# load input and output datasets for training

# load dataset
data = pd.read_csv('dataset/filtered_gc_spec.csv')
input_MS = pd.Series(data["Spectrum"])
output_SMILES = pd.Series(data["SMILES"])

assert len(input_MS)==len(output_SMILES) #sanity check to ensure correct loading

In [5]:
#filter input by length of SMILES (<77 as per SMILES encoder)

output_SMILES_filtered = output_SMILES[output_SMILES.str.len() < 77]

#filter input by the same indices
input_MS_filtered = input_MS.loc[output_SMILES_filtered.index]

assert len(input_MS_filtered)==len(output_SMILES_filtered) #sanity check to ensure correct filtering

print(f"Number of GC-MS Spectra for input: {len(input_MS_filtered)}")
print(f"Number of SMILES sequences for output: {len(output_SMILES_filtered)}")

Number of GC-MS Spectra for input: 18792
Number of SMILES sequences for output: 18792


# Defining model components

In [6]:
#defining the multihead attention class 

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
       
        # model parameters
        self.d_model = d_model 
        self.num_heads = num_heads # number of attention heads
        self.d_k = d_model // num_heads # dimension of each head's key, query, and value
        
        #transformation of inputs
        self.W_q = nn.Linear(d_model, d_model) # query transformation
        self.W_k = nn.Linear(d_model, d_model) # key transformation
        self.W_v = nn.Linear(d_model, d_model) # value transformation
        self.W_o = nn.Linear(d_model, d_model) # output transformation
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # attention scores
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        # apply mask if necessary (useful for preventing attention to certain parts like padding)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        
        # softmax to convert to probabilities
        attn_probs = torch.softmax(attn_scores, dim=-1)
        
        # output = matrix multiplication of attention probabilities x values
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        # reshape the input to have num_heads for multi-head attention
        #allows us to process multiple heads at the same time: parallel computing
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        # combine the multiple heads back to original shape
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        # apply linear transformations to inputs, split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        # scaled dot-product attention on split heads - mask
        # default mask is none - we can change this if we want to mask out certain values
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        
        # combine heads and apply output linear transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [7]:
#each layers contains feed-forward network - applied to each position
# two linear transformations and a reLU activation

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff): #inputs - dimensions and inner-layer dimensions
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [8]:
#embeddings class converts input/output tokens to vectors specified by the dimensions of our model
#softmax converts the output to probabilities

class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [9]:
#positional encoding is used to inject token position info into the input
#otherwise transformer has no info about token position in the input sequence
#essentially uses offset sin/cos graphs based on position. freq/offset is different for each dimension

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]
    #adds positional info to the input

## Implementing model

In [10]:
#defining encoder layer for class
#steps: multiattention, position feed forward, 2x layer normalization, dropout

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads) #self attention mechanism
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff) #attention output appended to original input
        self.norm1 = nn.LayerNorm(d_model) #normalization
        self.norm2 = nn.LayerNorm(d_model) #normalization
        self.dropout = nn.Dropout(dropout) #dropout
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [76]:
# Full Transformer class

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model) #takes in source vocab size 
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length) #max sequence length for positional encoding

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout) #dropout

    def forward(self, src, src_mask=None):
        x = self.encoder_embedding(src)  
        x = self.positional_encoding(x)  
        
        for layer in self.encoder_layers:
            x = layer(x, src_mask)  

            x = self.dropout(x)

        return x #output is fully embedded spectra

## Training

In [49]:
# create a list of of all tokens for the MS strings-- every token consists as one entry in the list
tokens = [peak.lower() for spec in input_MS_filtered for peak in spec.split()]

# tokenize each entry in the MS data
tokenized_input_MS = [spec.split() for spec in input_MS_filtered]


# Create a set of unique tokens
unique_tokens = set(tokens)

# Vocabulary size is the number of unique tokens
vocab_size = len(unique_tokens)

# mapping from characters to integers
token_mapping = {char: i for i, char in enumerate(unique_tokens)}

# convert SMILES tokenized lists into integer lists
input_token_mapping = [
    [token_mapping[char] for char in sublist]
    for sublist in tokenized_input_MS]


In [None]:
# determine the length of the longest list
lengths = []
for spec in input_token_mapping:
    length = len(spec)
    lengths.append(length)

max_length = max(lengths)

# add padding according to the longest list
# input to the model will be input_token_mapping
for entry in input_token_mapping:
    entry.extend([0] * (max_length - len(entry)))

In [93]:
max_length

524

In [95]:
src_vocab_size = vocab_size # determined from the unique number of tokens
tgt_vocab_size = 5000 #need to specify
d_model = max_length # determined from the embedding size 
num_heads = 4 # d_model must be divisible by the number of heads-- d_model is 524
num_layers = 6 #from original paper
d_ff = 2048 #from original paper-- dimensionality of the inner layer in the feed-forward network 
max_seq_length = max_length # maximum sequence length for positional encoding 
dropout = 0.1
transformer = Transformer(src_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

: 

In [94]:
test_list = input_token_mapping[0:6]
token_tensor = torch.tensor(test_list) # Create a tensor from the list
transformer.forward(token_tensor)

RuntimeError: shape '[6, 524, 8, 65]' is invalid for input of size 1647456