# Model Description
- Apply a transformer based model to pfam/unirep_50 data and extract the embedding features
> In this tutorial, we train nn.TransformerEncoder model on a language modeling task. The language modeling task is to assign a probability for the likelihood of a given word (or a sequence of words) to follow a sequence of words. A sequence of tokens are passed to the embedding layer first, followed by a positional encoding layer to account for the order of the word (see the next paragraph for more details). The nn.TransformerEncoder consists of multiple layers of nn.TransformerEncoderLayer. Along with the input sequence, a square attention mask is required because the self-attention layers in nn.TransformerEncoder are only allowed to attend the earlier positions in the sequence. For the language modeling task, any tokens on the future positions should be masked. To have the actual words, the output of nn.TransformerEncoder model is sent to the final Linear layer, which is followed by a log-Softmax function.

## Math and model formulation and code reference:
- Attention is all you need https://arxiv.org/abs/1706.03762
- ResNet https://towardsdatascience.com/understanding-and-visualizing-resnets-442284831be8
- MIT Visualization http://jalammar.github.io/illustrated-transformer/
- An Annotated transformer http://nlp.seas.harvard.edu/2018/04/03/attention.html#a-real-world-example

In [1]:
import math
import torch.nn as nn
import argparse
import random
import warnings
import numpy as np
import torch
import torch.nn.functional as F
from torch import optim
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.autograd import Variable
import itertools
import pandas as pd
# seed = 7
# torch.manual_seed(seed)
# np.random.seed(seed)

# pfamA_motors = pd.read_csv("../data/pfamA_motors.csv")
# df_dev = pd.read_csv("../data/df_dev.csv")
# pfamA_motors = pfamA_motors.iloc[:,1:]
# clan_train_dat = pfamA_motors.groupby("clan").head(4000)
# clan_train_dat = clan_train_dat.sample(frac=1).reset_index(drop=True)
# clan_test_dat = pfamA_motors.loc[~pfamA_motors["id"].isin(clan_train_dat["id"]),:].groupby("clan").head(400)

# clan_train_dat.shape

# def df_to_tup(dat):
#     data = []
#     for i in range(dat.shape[0]):
#         row = dat.iloc[i,:]
#         tup = (row["seq"],row["clan"])
#         data.append(tup)
#     return data

# clan_training_data = df_to_tup(clan_train_dat)
# clan_test_data = df_to_tup(clan_test_dat)
# for seq,clan in clan_training_data:
#     print(seq)
#     print(clan)
#     break


aminoacid_list = [
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
    'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'
]
clan_list = ["actin_like","tubulin_c","tubulin_binding","p_loop_gtpase"]
        
aa_to_ix = dict(zip(aminoacid_list, np.arange(1, 21)))
clan_to_ix = dict(zip(clan_list, np.arange(0, 4)))

def word_to_index(seq,to_ix):
    "Returns a list of indices (integers) from a list of words."
    return [to_ix.get(word, 0) for word in seq]

ix_to_aa = dict(zip(np.arange(1, 21), aminoacid_list))
ix_to_clan = dict(zip(np.arange(0, 4), clan_list))

def index_to_word(ixs,ix_to): 
    "Returns a list of words, given a list of their corresponding indices."
    return [ix_to.get(ix, 'X') for ix in ixs]

def prepare_sequence(seq):
    idxs = word_to_index(seq[0:-1],aa_to_ix)
    return torch.tensor(idxs, dtype=torch.long)

def prepare_labels(seq):
    idxs = word_to_index(seq[1:],aa_to_ix)
    return torch.tensor(idxs, dtype=torch.long)

prepare_labels('YCHXXXXX')



tensor([2, 7, 0, 0, 0, 0, 0])

In [2]:
uniref50_01 = pd.read_csv("../data/uniref50_01.tsv",sep = "\t",header=None)
uniref50_01.shape

(846396, 2)

In [34]:
# set device
device  = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=2)

In [35]:
class PositionalEncoding(nn.Module):
    """
    PositionalEncoding module injects some information about the relative or absolute position of
    the tokens in the sequence. The positional encodings have the same dimension as the embeddings 
    so that the two can be summed. Here, we use sine and cosine functions of different frequencies.
    """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0)
        
        self.register_buffer('pe', pe)

    def forward(self, x):
#         x = x + self.pe[:x.size(0), :]
#         print("x.size() : ", x.size())
#         print("self.pe.size() :", self.pe[:x.size(0),:,:].size())
        x = torch.add(x ,Variable(self.pe[:x.size(0),:,:], requires_grad=False))
        return self.dropout(x)

In [36]:

    
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
            device = src.device
            mask = self._generate_square_subsequent_mask(src.size(0)).to(device = device)
            self.src_mask = mask
#         print("src.device: ", src.device)
        src = self.encoder(src) * math.sqrt(self.ninp)
#         print("self.encoder(src) size: ", src.size())
        src = self.pos_encoder(src)
#         print("elf.pos_encoder(src) size: ", src.size())
        output = self.transformer_encoder(src, self.src_mask)
#         print("output size: ", output.size())
        output = self.decoder(output)
        return output

In [37]:
ntokens = len(aminoacid_list) + 1 # the size of vocabulary
emsize = 768 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 12 # the number of heads in the multiheadattention models
dropout = 0.1 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout)

In [38]:
import time

In [39]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

model.to(device)
model.train() # Turn on the train mode

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (linear1): Linear(in_features=768, out_features=200, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=200, out_features=768, bias=True)
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (linear1): Linear(in_features=768, out_features=20

In [40]:
start_time = time.time()
print_every = 1
# loss_vector = []

for epoch in np.arange(0, uniref50_01.shape[0]): 
    seq = uniref50_01.iloc[epoch, 1]
    if(len(seq)>4000):
        continue
    sentence_in = prepare_sequence(seq)
    targets = prepare_labels(seq)
#     sentence_in = sentence_in.to(device = device)
    sentence_in = sentence_in.unsqueeze(1).to(device = device)
    targets = targets.to(device = device)
    
    optimizer.zero_grad()
    output = model(sentence_in)
    
    print("targets size: ", targets.size())
    loss = criterion(output.view(-1, ntokens), targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    if epoch % print_every == 0:
        print(f"At Epoch: %.1f"% epoch)
        print(f"Loss %.4f"% loss)
#     loss_vector.append(loss)
    break
  


targets size:  torch.Size([188])
At Epoch: 3.0
Loss 4.3162


In [41]:
start_time = time.time()
print_every = 1000
# loss_vector = []

thresh = 0
for epoch in np.arange(0, uniref50_01.shape[0]): 
    seq = uniref50_01.iloc[epoch, 1]
    if(len(seq)>4000):
        continue 
    sentence_in = prepare_sequence(seq)
    targets = prepare_labels(seq)
#     sentence_in = sentence_in.to(device = device)
    sentence_in = sentence_in.unsqueeze(1).to(device = device)
    targets = targets.to(device = device)
    
    optimizer.zero_grad()
    output = model(sentence_in)
    
#     print("targets size: ", targets.size())
    loss = criterion(output.view(-1, ntokens), targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    if epoch > thresh:
        print(f"At Epoch: %.1f"% epoch)
        print(f"Loss %.4f"% loss)
        elapsed = time.time() - start_time
        print(f"time elapsed %.4f"% elapsed)
        thresh+=print_every
#     loss_vector.append(loss)

  

At Epoch: 3.0
Loss 12.0238
time elapsed 0.0535


RuntimeError: CUDA out of memory. Tried to allocate 718.00 MiB (GPU 2; 15.90 GiB total capacity; 14.76 GiB already allocated; 141.75 MiB free; 320.53 MiB cached)

In [None]:
torch.save(model.state_dict(), "../data/transformer_encoder_uniref_201012.pt")