# Documentation
> The following code is adapted from transformer_encoder_ver2 (a smaller embedding and feed-forward network, in the hope of getting less concentrated features). It is further expanded by applying a masked language model (MLM) objective.

> In the previous model, a mask is applied so that each word sees only the words prior to it (when predicting word #2, it sees only word #1, and all the rest are masked to -INF), so that the model behaves in an auto-regressive manner.

> In the MLM setting, a certain proportion of the sentence is randomly masked (15% in BERT), and those positions are masked throughout the training process. The loss is computed only on the correctness of those positions. In the TransformerEncoderLayer, the mask passed in should be changed so that it sets the masked positions to -INF while keeping all the rest at 0.


In [1]:
import math
import torch.nn as nn
import argparse
import random
import warnings
import numpy as np
import torch
import torch.nn.functional as F
from torch import optim
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.autograd import Variable
import itertools
import pandas as pd
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import math

# Seed both PyTorch and NumPy so that parameter initialisation, the shuffles
# below and the random MLM masks are repeatable.
seed = 7
torch.manual_seed(seed)
np.random.seed(seed)


# Input tables. Only pfamA_motors is used by the cells visible in this
# notebook; df_dev and motor_toolkit are loaded but not referenced below.
pfamA_motors = pd.read_csv("../../data/pfamA_motors.csv")
df_dev = pd.read_csv("../../data/df_dev.csv")
motor_toolkit = pd.read_csv("../../data/motor_tookits.csv")

# Balance the clans: draw 4500 rows from every 'clan' group with a fixed
# random_state, then rebuild a plain 0..N-1 index.
# NOTE(review): sample() without replace=True raises if a clan has fewer than
# 4500 rows -- confirm every clan is large enough.
pfamA_motors_balanced = pfamA_motors.groupby('clan').apply(lambda _df: _df.sample(4500,random_state=1))
pfamA_motors_balanced = pfamA_motors_balanced.apply(lambda x: x.reset_index(drop = True))

# Pfam accessions of the families used for the second training stage.
pfamA_target_name = ["PF00349","PF00022","PF03727","PF06723",\
                       "PF14450","PF03953","PF12327","PF00091","PF10644",\
                      "PF13809","PF14881","PF00063","PF00225","PF03028"]

# Keep only the rows whose 'pfamA_acc' is one of the target accessions.
pfamA_target = pfamA_motors.loc[pfamA_motors["pfamA_acc"].isin(pfamA_target_name),:]


# shuffle pfamA_target and pfamA_motors_balanced
# No random_state is passed to sample(), so the row order depends on NumPy's
# global RNG, which was seeded above.
pfamA_target = pfamA_target.sample(frac = 1)
# First column of the frame (the recorded output names it "Unnamed: 0").
pfamA_target_ind = pfamA_target.iloc[:,0]
print(pfamA_target_ind[0:5])
# NOTE: this prints the shape of the *balanced* frame, not of pfamA_target.
print(pfamA_motors_balanced.shape)

pfamA_motors_balanced = pfamA_motors_balanced.sample(frac = 1) 
pfamA_motors_balanced_ind = pfamA_motors_balanced.iloc[:,0]
print(pfamA_motors_balanced_ind[0:5])
# NOTE: this prints the shape of the *target* frame, not of the balanced one.
print(pfamA_target.shape)



179519      179519
1414859    1414859
12920        12920
1415258    1415258
13385        13385
Name: Unnamed: 0, dtype: int64
(18000, 6)
13493    180756
1539     166414
2688     131988
1691      37094
188      130155
Name: Unnamed: 0, dtype: int64
(59149, 6)


In [2]:
# The 20 standard amino acids as one-letter codes, in alphabetical order.
aminoacid_list = list("ACDEFGHIKLMNPQRSTVWY")
clan_list = ["actin_like", "tubulin_c", "tubulin_binding", "p_loop_gtpase"]

# Symbol -> integer code. Amino acids start at 1 so that 0 stays free for
# symbols outside the vocabulary; clans are numbered from 0.
aa_to_ix = {aa: ix for aa, ix in zip(aminoacid_list, np.arange(1, 21))}
clan_to_ix = {clan: ix for clan, ix in zip(clan_list, np.arange(0, 4))}

def word_to_index(seq,to_ix):
    """Translate each symbol of `seq` into its integer code from `to_ix`.

    Symbols that are missing from `to_ix` are encoded as 0.
    """
    lookup = to_ix.get
    return [lookup(symbol, 0) for symbol in seq]

# Inverse lookup tables: integer code -> symbol.
ix_to_aa = {ix: aa for ix, aa in zip(np.arange(1, 21), aminoacid_list)}
ix_to_clan = {ix: clan for ix, clan in zip(np.arange(0, 4), clan_list)}

def index_to_word(ixs,ix_to):
    """Translate each integer code of `ixs` back into its symbol from `ix_to`.

    Codes that are missing from `ix_to` are decoded as 'X'.
    """
    decode = ix_to.get
    return [decode(code, 'X') for code in ixs]



In [3]:
def prepare_sequence(seq):
    """Encode an amino-acid sequence as a 1-D LongTensor of vocabulary indices.

    Residues that are not keys of `aa_to_ix` are encoded as 0.
    """
    encoded = [aa_to_ix.get(residue, 0) for residue in seq[:]]
    return torch.tensor(encoded, dtype=torch.long)

# def prepare_labels(seq):
#     idxs = word_to_index(seq[1:],aa_to_ix)
#     return torch.tensor(idxs, dtype=torch.long)

def prepare_eval(seq):
    """Encode a sequence for evaluation as a 1-D LongTensor of vocabulary indices.

    Performs the same encoding as `prepare_sequence`: residues that are not
    keys of `aa_to_ix` become 0.
    """
    return torch.tensor(word_to_index(seq[:], aa_to_ix), dtype=torch.long)

# Sanity check of the encoder: 'X' is not a key of aa_to_ix, so every 'X'
# is encoded as 0.
prepare_sequence('YCHXXXXX')

tensor([20,  2,  7,  0,  0,  0,  0,  0])

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [5]:
class PositionalEncoding(nn.Module):
    """
    PositionalEncoding module injects some information about the relative or absolute position of
    the tokens in the sequence. The positional encodings have the same dimension as the embeddings
    so that the two can be summed. Here, we use sine and cosine functions of different frequencies.

    Args:
        d_model: width of the embeddings the encoding is added to (odd widths are supported).
        dropout: dropout probability applied to the sum of input and encoding.
        max_len: number of positions that are precomputed. Inputs longer than
            `max_len` along dimension 0 cannot be encoded.
    """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        # Even columns carry the sines, odd columns the cosines.
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine
        # columns, so drop the surplus frequency instead of failing on a
        # shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so the table broadcasts
        # over dimension 1 of the input.
        pe = pe.unsqueeze(1)

        # A buffer follows the module across devices and is saved in the
        # state dict, but it is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding of the first `x.size(0)` positions to `x`, then apply dropout.

        `x` is indexed as (position, ..., d_model): dimension 0 selects the
        rows of the precomputed table.
        """
        # Buffers never require gradients, so no Variable wrapper is needed.
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

    
    
class TransformerModel(nn.Module):
    """Transformer encoder trained with a masked-language-model (MLM) objective.

    The model embeds the token indices, adds positional encodings, runs a
    stack of `nn.TransformerEncoderLayer`s and projects every position back
    onto the vocabulary.

    In training mode every forward pass draws a fresh random set of masked
    positions and hands the encoder an additive attention mask in which the
    masked positions are -inf (they cannot be attended to) and all other
    positions are 0.

    NOTE(review): the attention mask only stops positions from attending to
    the masked ones. A masked position's own token embedding still reaches its
    own output through the layer input, so with the attention mask alone the
    logits at a masked position can depend on the very token they are scored
    against. Pass `mask_token_id` to additionally replace the masked input
    tokens before embedding and close that leak.

    Args:
        ntoken: vocabulary size (rows of the embedding, width of the logits).
        ninp: embedding width.
        nhead: number of attention heads per encoder layer.
        nhid: width of the feed-forward sub-layer.
        nlayers: number of encoder layers.
        dropout: dropout probability of the encoder layers. The positional
            encoder is built with its own default dropout.
        mask_token_id: optional vocabulary index written into the masked
            input positions during training. `None` (the default) leaves the
            input tokens untouched.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, mask_token_id=None):
        super(TransformerModel, self).__init__()

        self.model_type = 'Transformer'
        # Mask state of the most recent forward pass. Initialised here so that
        # an evaluation-mode call on a fresh model does not hit a missing
        # attribute.
        self.src_mask = None
        self.src_mask_ind = None
        self.mask_token_id = mask_token_id
        self.pos_encoder = PositionalEncoding(ninp)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout,activation='gelu')
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return the (sz, sz) causal mask: 0 on and below the diagonal, -inf above it."""
        future = torch.triu(torch.ones(sz, sz), diagonal=1) == 1
        return torch.zeros(sz, sz).masked_fill(future, float('-inf'))

    def _generate_square_mlm_mask(self, sz, mask_frac=0.15):
        """Draw a random MLM mask for a sequence of length `sz`.

        Args:
            sz: sequence length.
            mask_frac: fraction of positions to mask (rounded down).

        Returns:
            mask: float tensor (sz, sz); the columns of masked positions are
                -inf and all other entries are 0, identically for every row.
            masked_ind: bool tensor (sz,), True at the masked positions.
        """
        zeros_num = int(sz * mask_frac)
        if mask_frac > 0:
            # Short sequences would otherwise round down to zero masked
            # positions, which leaves the loss with no targets (NaN).
            zeros_num = max(zeros_num, 1)
        # Never mask every position: a fully -inf attention row has no valid
        # softmax.
        zeros_num = max(min(zeros_num, sz - 1), 0)
        ones_num = sz - zeros_num
        # 0's are the masked positions; the permutation scatters them.
        lm_mask = torch.cat([torch.zeros(zeros_num), torch.ones(ones_num)])
        lm_mask = lm_mask[torch.randperm(sz)]
        masked_ind = lm_mask.eq(0)
        mask = torch.zeros(sz).masked_fill(masked_ind, float('-inf')).repeat(sz, 1)
        return mask,masked_ind

    def init_weights(self):
        """Initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """Run the encoder on `src` and return per-position vocabulary logits.

        Args:
            src: LongTensor of token indices with the sequence on dimension 0
                (the training cells pass shape (seq_len, 1)).

        Returns:
            output: logits with a trailing dimension of size `ntoken`.
            masked_ind: in training mode, a bool tensor (seq_len,) marking the
                positions masked in this pass; in evaluation mode `None`,
                because no mask is applied.
        """
        # Use this module's own mode; the module must not depend on a global
        # variable that happens to be called `model`.
        if self.training:
            mask, masked_ind = self._generate_square_mlm_mask(src.size(0))
            self.src_mask = mask.to(src.device)
            self.src_mask_ind = masked_ind
            if self.mask_token_id is not None:
                # Broadcast the per-position flag over the remaining
                # dimensions. masked_fill returns a new tensor, so the
                # caller's `src` still holds the original target tokens.
                hide = masked_ind.to(src.device).view(-1, *([1] * (src.dim() - 1)))
                src = src.masked_fill(hide, self.mask_token_id)
        else:
            # Evaluation sees the whole sequence. Drop any mask left over from
            # an earlier training call, which would belong to a different
            # sequence (and usually a different length).
            self.src_mask = None
            self.src_mask_ind = None

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output, self.src_mask_ind

In [6]:
# Model hyper-parameters.
ntokens = 1 + len(aminoacid_list)  # vocabulary size: the amino acids plus index 0 for unknown residues
emsize = 12  # embedding width
nhid = 100  # width of the feed-forward sub-layer in each encoder layer
nlayers = 6  # number of stacked encoder layers
nhead = 12  # attention heads per encoder layer
dropout = 0.1  # dropout probability
model = TransformerModel(
    ntoken=ntokens,
    ninp=emsize,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    dropout=dropout,
)

In [7]:
# Training objective and optimisation set-up.
criterion = nn.CrossEntropyLoss()
lr = 3.0  # initial learning rate for plain SGD
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)
# Multiplies the learning rate by 0.95 on each scheduler step.
# NOTE: none of the training cells visible in this notebook call
# scheduler.step(), so the learning rate stays at its initial value there.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1.0, gamma=0.95)

In [8]:
# Move the model's parameters and buffers to the selected device.
model.to(device)
# Training mode enables dropout; TransformerModel.forward also only draws a
# new MLM mask when the training flag is set.
model.train() # Turn on the train mode

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=12, out_features=12, bias=True)
        )
        (linear1): Linear(in_features=12, out_features=100, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=100, out_features=12, bias=True)
        (norm1): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=12, out_features=12, bias=True)
        )
        (linear1): Linear(in_features=12, out_features=100, bias=T

In [9]:
import time

In [10]:
# Smoke test: run exactly one optimisation step (note the `break` at the end)
# and print the intermediate shapes before launching the full training cells.
start_time = time.time()
print_every = 1000
# loss_vector = []

# `epoch` indexes a single sequence (one optimisation step), not a full pass
# over the data.
for epoch in np.arange(0, pfamA_motors_balanced.shape[0]): 
    # Column 3 is presumably the sequence string -- confirm against the CSV.
    seq = pfamA_motors_balanced.iloc[epoch, 3]
    print(len(seq))
    sentence_in = prepare_sequence(seq)
#     sentence_in = sentence_in.to(device = device)
    # (seq_len,) -> (seq_len, 1): add a batch axis of size 1.
    sentence_in = sentence_in.unsqueeze(1).to(device = device)

    optimizer.zero_grad()
    # mask_ind marks the positions the model masked in this forward pass.
    output,mask_ind = model(sentence_in)
#     print(mask_ind)
    # The loss targets are the original tokens at the masked positions.
    targets = sentence_in[mask_ind]
    targets = targets.to(device = device)
    
    # Per the recorded output: (n_masked,) targets vs (n_masked, ntokens) logits.
    print(targets.squeeze(1).size())
    print(output[mask_ind].squeeze(1).size())

    # Cross-entropy is computed over the masked positions only.
    loss = criterion(output[mask_ind].squeeze(1), targets.squeeze(1))
    loss.backward()
    # Cap the global gradient norm at 0.5 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    if epoch % print_every == 0:
        print(f"At Epoch: %.1f"% epoch)
        print(f"Loss %.4f"% loss)
        elapsed = time.time() - start_time
        print(f"time elapsed %.4f"% elapsed)
#         torch.save(model.state_dict(), "../../data/transformer_encoder_201025.pt")
#     loss_vector.append(loss)
    # Stop after the first sequence: this cell only checks that a step runs.
    break

137
torch.Size([20])
torch.Size([20, 21])
At Epoch: 0.0
Loss 2.9990
time elapsed 0.0545


In [11]:
# MLM training, stage 1: one pass over the shuffled, clan-balanced set with
# one sequence per optimisation step.
# NOTE: `epoch` indexes a single sequence, not a full pass over the data.
start_time = time.time()
print_every = 1000

for epoch in np.arange(0, pfamA_motors_balanced.shape[0]):
    # Column 3 is presumably the sequence string -- confirm against the CSV.
    seq = pfamA_motors_balanced.iloc[epoch, 3]
    # (seq_len,) -> (seq_len, 1): add a batch axis of size 1.
    sentence_in = prepare_sequence(seq).unsqueeze(1).to(device=device)

    optimizer.zero_grad()
    # mask_ind marks the positions the model masked in this forward pass.
    output, mask_ind = model(sentence_in)
    # The loss targets are the original tokens at the masked positions; they
    # already live on `device` because they are sliced from sentence_in.
    targets = sentence_in[mask_ind]
    if len(targets) == 0:
        # Nothing was masked (very short sequence): the mean cross-entropy
        # over zero targets is NaN, so there is nothing to learn from.
        continue

    # Cross-entropy is computed over the masked positions only.
    loss = criterion(output[mask_ind].squeeze(1), targets.squeeze(1))
    loss.backward()
    # Cap the global gradient norm at 0.5 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    if epoch % print_every == 0:
        print(f"At Epoch: {float(epoch):.1f}")
        print(f"Loss {loss.item():.4f}")
        elapsed = time.time() - start_time
        print(f"time elapsed {elapsed:.4f}")
        # Periodic checkpoint, overwritten at every report.
        torch.save(model.state_dict(), "../../data/transformer_encoder_mlm_201025.pt")

At Epoch: 0.0
Loss 2.7891
time elapsed 0.0447
At Epoch: 1000.0
Loss 0.0325
time elapsed 20.6619
At Epoch: 2000.0
Loss 0.0132
time elapsed 40.8814
At Epoch: 3000.0
Loss 0.2113
time elapsed 60.8712
At Epoch: 4000.0
Loss 0.0011
time elapsed 80.6062
At Epoch: 5000.0
Loss 0.0002
time elapsed 100.3380
At Epoch: 6000.0
Loss 0.0116
time elapsed 119.9446
At Epoch: 7000.0
Loss 0.0004
time elapsed 139.4941
At Epoch: 8000.0
Loss 0.0005
time elapsed 158.8835
At Epoch: 9000.0
Loss 0.0002
time elapsed 177.8303
At Epoch: 10000.0
Loss 0.0009
time elapsed 197.3433
At Epoch: 11000.0
Loss 0.0004
time elapsed 217.1073
At Epoch: 12000.0
Loss 0.0002
time elapsed 236.8729
At Epoch: 13000.0
Loss 0.0001
time elapsed 256.5046
At Epoch: 14000.0
Loss 0.2801
time elapsed 276.4871
At Epoch: 15000.0
Loss 0.0349
time elapsed 296.3113
At Epoch: 16000.0
Loss 0.0001
time elapsed 316.0182
At Epoch: 17000.0
Loss 0.0001
time elapsed 335.5370


In [12]:
# Save the weights (state dict only) reached after training on the balanced set.
torch.save(model.state_dict(), "../../data/mini_transformer_encoder_mlm_balanced.pt")

In [13]:
# Completion marker printed once the preceding cells have run.
print('done')

done


## Proceed with weight updates using the entire pfam_motor set

In [14]:
# MLM training, stage 2: continue updating the same model and optimizer on
# the shuffled target-family rows (pfamA_target), one sequence per step.
# NOTE: `epoch` indexes a single sequence, not a full pass over the data.
start_time = time.time()
print_every = 1000

for epoch in np.arange(0, pfamA_target.shape[0]):
    # Column 3 is presumably the sequence string -- confirm against the CSV.
    seq = pfamA_target.iloc[epoch, 3]
    # (seq_len,) -> (seq_len, 1): add a batch axis of size 1.
    sentence_in = prepare_sequence(seq).unsqueeze(1).to(device=device)

    optimizer.zero_grad()
    # mask_ind marks the positions the model masked in this forward pass.
    output, mask_ind = model(sentence_in)
    # The loss targets are the original tokens at the masked positions; they
    # already live on `device` because they are sliced from sentence_in.
    targets = sentence_in[mask_ind]
    if len(targets) == 0:
        # Nothing was masked (very short sequence): the mean cross-entropy
        # over zero targets is NaN, so there is nothing to learn from.
        continue

    # Cross-entropy is computed over the masked positions only.
    loss = criterion(output[mask_ind].squeeze(1), targets.squeeze(1))
    loss.backward()
    # Cap the global gradient norm at 0.5 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    if epoch % print_every == 0:
        print(f"At Epoch: {float(epoch):.1f}")
        print(f"Loss {loss.item():.4f}")
        elapsed = time.time() - start_time
        print(f"time elapsed {elapsed:.4f}")

At Epoch: 0.0
Loss 0.0004
time elapsed 0.0398
At Epoch: 1000.0
Loss 0.0180
time elapsed 19.9810
At Epoch: 2000.0
Loss 0.3997
time elapsed 39.8189
At Epoch: 3000.0
Loss 0.0003
time elapsed 59.7628
At Epoch: 4000.0
Loss 0.0957
time elapsed 79.6739
At Epoch: 5000.0
Loss 0.0000
time elapsed 99.5836
At Epoch: 6000.0
Loss 0.0004
time elapsed 119.5286
At Epoch: 7000.0
Loss 0.0001
time elapsed 139.5472
At Epoch: 8000.0
Loss 0.1972
time elapsed 159.5110
At Epoch: 9000.0
Loss 0.0060
time elapsed 179.5563
At Epoch: 10000.0
Loss 0.0038
time elapsed 199.4711
At Epoch: 11000.0
Loss 0.0000
time elapsed 219.6897
At Epoch: 12000.0
Loss 0.0001
time elapsed 239.8744
At Epoch: 13000.0
Loss 0.0002
time elapsed 259.8663
At Epoch: 14000.0
Loss 0.0014
time elapsed 279.3699
At Epoch: 15000.0
Loss 0.0005
time elapsed 300.5702
At Epoch: 16000.0
Loss 0.0007
time elapsed 321.4982
At Epoch: 17000.0
Loss 0.0000
time elapsed 341.3100
At Epoch: 18000.0
Loss 0.0000
time elapsed 360.9946
At Epoch: 19000.0
Loss 0.0696
ti

In [15]:
# Save the weights (state dict only) after the additional pass over pfamA_target.
torch.save(model.state_dict(), "../../data/mini_transformer_encoder_mlm_balanced_target.pt")

In [16]:
# Completion marker printed once the preceding cells have run.
print("done")

done
