# Documentation
> 201013: This notebook generates embedding vectors for `pfamA_motors`, `df_dev`, and `motor_toolkit` using the models that have finished training so far:
    - lstm5
    - transformer_encoder
    - seq2seq_attention_mini



In [10]:
import torch
import torch.nn as nn 
import torch.optim as optim 

import torchvision 
import torchvision.transforms as transforms 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, IterableDataset, DataLoader
# import tqdm
import numpy as np
import pandas as pd

import math

## Load datasets

In [3]:

# Read the three sequence tables used throughout the notebook, in a fixed order.
pfamA_motors, df_dev, motor_toolkit = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/pfamA_motors.csv",
        "../data/df_dev.csv",
        # NOTE(review): the file name is spelled "tookits" — confirm it matches the file on disk.
        "../data/motor_tookits.csv",
    )
)


In [4]:
# Peek at row 0, column 7 of motor_toolkit — the column the embedding loops below read sequences from.
motor_toolkit.iloc[0,7]

'MASQPNSSAKKKEEKGKNIQVVVRCRPFNLAERKASAHSIVECDPVRKEVSVRTGGLADKSSRKTYTFDMVFGASTKQIDVYRSVVCPILDEVIMGYNCTIFAYGQTGTGKTFTMEGERSPNEEYTWEEDPLAGIIPRTLHQIFEKLTDNGTEFSVKVSLLEIYNEELFDLLNPSSDVSERLQMFDDPRNKRGVIIKGLEEITVHNKDEVYQILEKGAAKRTTAATLMNAYSSRSHSVFSVTIHMKETTIDGEELVKIGKLNLVDLAGSENIGRSGAVDKRAREAGNINQSLLTLGRVITALVERTPHVPYRESKLTRILQDSLGGRTRTSIIATISPASLNLEETLSTLEYAHRAKNILNKPEVNQKLTKKALIKEYTEEIERLKRDLAAAREKNGVYISEENFRVMSGKLTVQEEQIVELIEKIGAVEEELNRVTELFMDNKNELDQCKSDLQNKTQELETTQKHLQETKLQLVKEEYITSALESTEEKLHDAASKLLNTVEETTKDVSGLHSKLDRKKAVDQHNAEAQDIFGKNLNSLFNNMEELIKDGSSKQKAMLEVHKTLFGNLLSSSVSALDTITTVALGSLTSIPENVSTHVSQIFNMILKEQSLAAESKTVLQELINVLKTDLLSSLEMILSPTVVSILKINSQLKHIFKTSLTVADKIEDQKKELDGFLSILCNNLHELQENTICSLVESQKQCGNLTEDLKTIKQTHSQELCKLMNLWTERFCALEEKCENIQKPLSSVQENIQQKSKDIVNKMTFHSQKFCADSDGFSQELRNFNQEGTKLVEESVKHSDKLNGNLEKISQETEQRCESLNTRTVYFSEQWVSSLNEREQELHNLLEVVSQCCEASSSDITEKSDGRKAAHEKQHNIFLDQMTIDEDKLIAQNLELNETIKIGLTKLNCFLEQDLKLDIPTGTTPQRKSYLYPSTLVRTEPREHLLDQLKRKQPELLMMLNCSENNKEETIPDVDVEEAVLGQYTEEPLSQEPSVDA

In [25]:
# Peek at row 1, column 3 of pfamA_motors — the column the embedding loops below read sequences from.
pfamA_motors.iloc[1,3]

'LVLVLNCGSSSLKFAIVDAETGAEHLTGLAECLGLPEARMKWKLDGKHEAQLGAGAAHEEALSFMVETILASKPELKANLGAIGHRIVHGGEQFTQSALICDQVLKGIQDAATFAPLHNPAHLIGIEAAKHNFPELQNVAVFDTAFHQTMPEESFLYALPYNLYKEHGIRRYGMHGTSHLFITREVAGLLNKPVEEVNIINCHLGNGASVCAIKNGQSVDTSMGLTPLEGLVMGTRCGDIDPAIIFHLHDALGYSVEQINNMLTKESGLQGLTEVTSDCRFVEDNYGEKEEATRAMDVFCHRLAKYVAGYTASLEGRLDAITFTGGIGENSGPIREMVLNRLAIFGIEVDSEANLKARFGGEGTITTANSRIPAMVISTNEELVIAE'

## Load helper functions for modeling evaluation


In [5]:
# Vocabulary: 20 single-letter amino-acid codes, in alphabetical order.
aminoacid_list = list('ACDEFGHIKLMNPQRSTVWY')
# Clan labels, in the order that fixes their integer ids.
clan_list = [
    "actin_like",
    "tubulin_c",
    "tubulin_binding",
    "p_loop_gtpase",
]

# Amino acids are numbered 1..20 (word_to_index uses 0 for anything not listed);
# clans are numbered 0..3.
aa_to_ix = {aa: ix for aa, ix in zip(aminoacid_list, np.arange(1, 21))}
clan_to_ix = {clan: ix for clan, ix in zip(clan_list, np.arange(0, 4))}

def word_to_index(seq,to_ix):
    """Translate each token of ``seq`` into its integer index from ``to_ix``.

    Tokens that have no entry in ``to_ix`` are encoded as 0.
    Returns a list with one index per token, in order.
    """
    indices = []
    for token in seq:
        indices.append(to_ix.get(token, 0))
    return indices

# Inverse lookup tables: integer id -> amino-acid letter / clan name.
ix_to_aa = {ix: aa for ix, aa in zip(np.arange(1, 21), aminoacid_list)}
ix_to_clan = {ix: clan for ix, clan in zip(np.arange(0, 4), clan_list)}

def index_to_word(ixs,ix_to): 
    """Translate each index of ``ixs`` back into its token via ``ix_to``.

    Indices that have no entry in ``ix_to`` are decoded as 'X'.
    Returns a list with one token per index, in order.
    """
    tokens = []
    for index in ixs:
        tokens.append(ix_to.get(index, 'X'))
    return tokens

def prepare_sequence(seq):
    """Encode all but the last residue of ``seq`` as a 1-D ``torch.long`` tensor.

    Residues are mapped through ``aa_to_ix``; unknown residues become 0.
    """
    leading_residues = seq[:-1]
    return torch.tensor(word_to_index(leading_residues, aa_to_ix), dtype=torch.long)

def prepare_labels(seq):
    """Encode all but the first residue of ``seq`` as a 1-D ``torch.long`` tensor.

    Residues are mapped through ``aa_to_ix``; unknown residues become 0.
    """
    trailing_residues = seq[1:]
    return torch.tensor(word_to_index(trailing_residues, aa_to_ix), dtype=torch.long)

def prepare_eval(seq):
    """Encode the complete ``seq`` as a 1-D ``torch.long`` tensor.

    Residues are mapped through ``aa_to_ix``; unknown residues become 0.
    """
    all_residues = seq[:]
    return torch.tensor(word_to_index(all_residues, aa_to_ix), dtype=torch.long)

# Sanity check: the first residue is dropped and 'X' (absent from aa_to_ix) encodes to 0.
prepare_labels('YCHXXXXX')

tensor([2, 7, 0, 0, 0, 0, 0])

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Define LSTM5 model with pre-trained weights

In [1]:
# Hyperparameters for the bidirectional LSTM defined below.
# Vocabulary size: one slot per amino acid plus index 0 (used for unknown residues).
input_size = output_size = len(aminoacid_list) + 1
embedding_size = 10    # width of each amino-acid embedding vector
hidden_size = 128      # LSTM hidden size (per direction)
num_layers = 1
learning_rate = 0.001  # not referenced by the inference code in this notebook

# Create Bidirectional LSTM
class BRNN(nn.Module):
    """Bidirectional LSTM that scores every position of a single index-encoded sequence.

    Args:
        input_size: number of distinct token indices (rows of the embedding table).
        embedding_size: width of each token embedding.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        output_size: number of classes scored at each position.
    """
    def __init__(self,input_size, embedding_size, hidden_size, num_layers, output_size):
        super(BRNN,self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        # Not used by forward() (which calls F.log_softmax); kept so the module
        # layout stays the same as the one the checkpoint was saved with.
        self.log_softmax = nn.LogSoftmax(dim= 1)
        self.aa_embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(input_size = embedding_size, 
                            hidden_size = hidden_size,
                            num_layers = num_layers, 
                            bidirectional = True)
        # The forward and backward outputs are concatenated, hence hidden_size*2.
        self.fc = nn.Linear(hidden_size*2, output_size)
    
    def aa_encoder(self, input): 
        "Helper function to map single aminoacids to the embedding space."
        # The embedding layer is registered as `aa_embedding`; there is no
        # `self.embedding` attribute on this module.
        projected = self.aa_embedding(input)
        return projected 
    

    def forward(self,seq):
        """Run one sequence through the network.

        Args:
            seq: 1-D LongTensor of token indices (one sequence, no batch axis).

        Returns:
            decoded_scores: (len(seq), output_size) log-probabilities per position.
            hn: final LSTM hidden state, (num_layers * 2, 1, hidden_size).
        """
        # embed each aa to the embedded space
        embedding_tensor = self.aa_embedding(seq)

        # h0 / c0 are not passed, so the LSTM starts from its default zero state.
        # Input is reshaped to (seq_len, batch=1, embedding_size).
        # out has shape (seq_len, batch, num_directions * hidden_size).
        out, (hn, cn) = self.lstm(embedding_tensor.view(len(seq), 1, -1))
        decoded_space = self.fc(out.view(len(seq), -1))
        decoded_scores = F.log_softmax(decoded_space, dim=1)
        return decoded_scores, hn


NameError: name 'aminoacid_list' is not defined

In [8]:
# Build the bidirectional LSTM from the hyperparameters and move it onto `device`.
model = BRNN(
    input_size=input_size,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
)
model = model.to(device)

In [10]:
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU also loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load("../data/bidirectional_lstm_5_201008.pt", map_location=device))
# Switch to inference mode; as the last expression it also displays the module summary.
model.eval()

BRNN(
  (log_softmax): LogSoftmax()
  (aa_embedding): Embedding(21, 10)
  (lstm): LSTM(10, 128, bidirectional=True)
  (fc): Linear(in_features=256, out_features=21, bias=True)
)

#### output hidden vector for motor toolkit

In [3]:
# One forward pass per motor_toolkit row (sequence in column 7); keep the LSTM's
# final hidden state flattened to a single (1, N) row.
print_every = 100
hn_vector = []
with torch.no_grad():
    for row_idx in range(motor_toolkit.shape[0]):
        encoded = prepare_eval(motor_toolkit.iloc[row_idx, 7]).to(device=device)
        _scores, final_hidden = model(encoded)
        hn_vector.append(final_hidden.cpu().detach().numpy().reshape(1, -1))
        # Progress marker (the counter is a row index, despite the "Epoch" label).
        if row_idx % print_every == 0:
            print("At Epoch: %.2f" % row_idx)


NameError: name 'motor_toolkit' is not defined

In [21]:
# Stack the per-sequence (1, N) rows and drop the singleton axis -> (n_sequences, N).
hn_vector = np.asarray(hn_vector).squeeze(axis=1)
print(hn_vector.shape)

(3255, 256)


In [22]:
# Persist the LSTM hidden-state matrix for motor_toolkit.
np.save(file="../data/hn_lstm5_motortoolkit.npy", arr=hn_vector)

In [23]:
# Round-trip check: reload the file that was just written and show its shape.
hn_lstm_toolkit = np.load(file="../data/hn_lstm5_motortoolkit.npy")
hn_lstm_toolkit.shape

(3255, 256)

#### Output for kaggle pfam

In [26]:
# One forward pass per df_dev row (sequence in column 6); keep the LSTM's
# final hidden state flattened to a single (1, N) row.
print_every = 10000
hn_vector = []
with torch.no_grad():
    for row_idx in range(df_dev.shape[0]):
        encoded = prepare_eval(df_dev.iloc[row_idx, 6]).to(device=device)
        _scores, final_hidden = model(encoded)
        hn_vector.append(final_hidden.cpu().detach().numpy().reshape(1, -1))
        # Progress marker (the counter is a row index, despite the "Epoch" label).
        if row_idx % print_every == 0:
            print("At Epoch: %.2f" % row_idx)

At Epoch: 0.00
At Epoch: 10000.00
At Epoch: 20000.00
At Epoch: 30000.00
At Epoch: 40000.00
At Epoch: 50000.00
At Epoch: 60000.00
At Epoch: 70000.00
At Epoch: 80000.00
At Epoch: 90000.00
At Epoch: 100000.00
At Epoch: 110000.00
At Epoch: 120000.00
At Epoch: 130000.00
At Epoch: 140000.00
At Epoch: 150000.00
At Epoch: 160000.00
At Epoch: 170000.00
At Epoch: 180000.00
At Epoch: 190000.00
At Epoch: 200000.00
At Epoch: 210000.00
At Epoch: 220000.00
At Epoch: 230000.00
At Epoch: 240000.00
At Epoch: 250000.00
At Epoch: 260000.00
At Epoch: 270000.00
At Epoch: 280000.00
At Epoch: 290000.00
At Epoch: 300000.00
At Epoch: 310000.00
At Epoch: 320000.00
At Epoch: 330000.00
At Epoch: 340000.00
At Epoch: 350000.00
At Epoch: 360000.00
At Epoch: 370000.00
At Epoch: 380000.00
At Epoch: 390000.00
At Epoch: 400000.00
At Epoch: 410000.00
At Epoch: 420000.00
At Epoch: 430000.00
At Epoch: 440000.00
At Epoch: 450000.00
At Epoch: 460000.00
At Epoch: 470000.00
At Epoch: 480000.00
At Epoch: 490000.00
At Epoch: 5000

In [27]:
# Stack the per-sequence (1, N) rows, drop the singleton axis, report the shape and save.
hn_vector = np.asarray(hn_vector).squeeze(axis=1)
print(hn_vector.shape)
np.save(file="../data/hn_lstm5_dfdev.npy", arr=hn_vector)

(1212912, 256)


#### Output for pfam molecular motors clans

In [28]:
# One forward pass per pfamA_motors row (sequence in column 3); keep the LSTM's
# final hidden state flattened to a single (1, N) row.
print_every = 10000
hn_vector = []
with torch.no_grad():
    for row_idx in range(pfamA_motors.shape[0]):
        encoded = prepare_eval(pfamA_motors.iloc[row_idx, 3]).to(device=device)
        _scores, final_hidden = model(encoded)
        hn_vector.append(final_hidden.cpu().detach().numpy().reshape(1, -1))
        # Progress marker (the counter is a row index, despite the "Epoch" label).
        if row_idx % print_every == 0:
            print("At Epoch: %.2f" % row_idx)

At Epoch: 0.00
At Epoch: 10000.00
At Epoch: 20000.00
At Epoch: 30000.00
At Epoch: 40000.00
At Epoch: 50000.00
At Epoch: 60000.00
At Epoch: 70000.00
At Epoch: 80000.00
At Epoch: 90000.00
At Epoch: 100000.00
At Epoch: 110000.00
At Epoch: 120000.00
At Epoch: 130000.00
At Epoch: 140000.00
At Epoch: 150000.00
At Epoch: 160000.00
At Epoch: 170000.00
At Epoch: 180000.00
At Epoch: 190000.00
At Epoch: 200000.00
At Epoch: 210000.00
At Epoch: 220000.00
At Epoch: 230000.00
At Epoch: 240000.00
At Epoch: 250000.00
At Epoch: 260000.00
At Epoch: 270000.00
At Epoch: 280000.00
At Epoch: 290000.00
At Epoch: 300000.00
At Epoch: 310000.00
At Epoch: 320000.00
At Epoch: 330000.00
At Epoch: 340000.00
At Epoch: 350000.00
At Epoch: 360000.00
At Epoch: 370000.00
At Epoch: 380000.00
At Epoch: 390000.00
At Epoch: 400000.00
At Epoch: 410000.00
At Epoch: 420000.00
At Epoch: 430000.00
At Epoch: 440000.00
At Epoch: 450000.00
At Epoch: 460000.00
At Epoch: 470000.00
At Epoch: 480000.00
At Epoch: 490000.00
At Epoch: 5000

In [29]:
# Stack the per-sequence (1, N) rows, drop the singleton axis, report the shape and save.
hn_vector = np.asarray(hn_vector).squeeze(axis=1)
print(hn_vector.shape)
np.save(file="../data/hn_lstm5_pfammotors.npy", arr=hn_vector)

(1914831, 256)


In [30]:
# Completion marker for the LSTM embedding exports above.
print("done")

done


## Define Seq2Seq Encoder with pre-trained weights

In [45]:
# Hyperparameters for the seq2seq encoder defined below.
# Vocabulary size: one slot per amino acid plus index 0 (used for unknown residues).
input_size = output_size = len(aminoacid_list) + 1
embedding_size = 10    # width of each amino-acid embedding vector
hidden_size = 64       # LSTM hidden size (per direction)
num_layers = 1
learning_rate = 0.001  # not referenced by the inference code in this notebook

class s2s_Encoder(nn.Module):
    """LSTM encoder: embeds one index-encoded sequence and runs it through an LSTM.

    Args:
        input_size: number of distinct token indices (rows of the embedding table).
        embedding_size: width of each token embedding.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        output_size: stored on the module; not used by ``forward``.
        batch_first: forwarded to ``nn.LSTM``.
        bidirectional: forwarded to ``nn.LSTM``.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size, batch_first=False, bidirectional=True):
        super().__init__()
        # Plain configuration attributes.
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        # Registered on the module but never called in forward().
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.batch_first = batch_first
        # Learnable layers.
        self.aa_embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=batch_first,
            bidirectional=bidirectional,
        )

    def forward(self, seq):
        """Return ``(outputs, (h_n, c_n))`` from the LSTM for one sequence of token indices."""
        n_tokens = len(seq)
        # Reshape the embeddings to (seq_len, batch=1, embedding_size) before the LSTM.
        embedded = self.aa_embedding(seq).view(n_tokens, 1, -1)
        return self.rnn(embedded)

In [46]:
# Build the seq2seq encoder from the hyperparameters and move it onto `device`.
model = s2s_Encoder(
    input_size,
    embedding_size,
    hidden_size,
    num_layers,
    output_size,
)
model = model.to(device)

In [47]:
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU also loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load("../data/seq2seq_encoder_df_dev_201012_230k.pt", map_location=device))
# Switch to inference mode; as the last expression it also displays the module summary.
model.eval()

s2s_Encoder(
  (log_softmax): LogSoftmax()
  (aa_embedding): Embedding(21, 10)
  (rnn): LSTM(10, 64, bidirectional=True)
)

#### output hidden vector for motor toolkit

In [49]:
# One forward pass per motor_toolkit row (sequence in column 7); keep the encoder's
# final hidden state h_n flattened to a single (1, N) row.
print_every = 100
hn_vector = []
with torch.no_grad():
    for row_idx in range(motor_toolkit.shape[0]):
        encoded = prepare_eval(motor_toolkit.iloc[row_idx, 7]).to(device=device)
        _outputs, (final_hidden, _final_cell) = model(encoded)
        hn_vector.append(final_hidden.cpu().detach().numpy().reshape(1, -1))
        # Progress marker (the counter is a row index, despite the "Epoch" label).
        if row_idx % print_every == 0:
            print("At Epoch: %.2f" % row_idx)


At Epoch: 0.00
At Epoch: 100.00
At Epoch: 200.00
At Epoch: 300.00
At Epoch: 400.00
At Epoch: 500.00
At Epoch: 600.00
At Epoch: 700.00
At Epoch: 800.00
At Epoch: 900.00
At Epoch: 1000.00
At Epoch: 1100.00
At Epoch: 1200.00
At Epoch: 1300.00
At Epoch: 1400.00
At Epoch: 1500.00
At Epoch: 1600.00
At Epoch: 1700.00
At Epoch: 1800.00
At Epoch: 1900.00
At Epoch: 2000.00
At Epoch: 2100.00
At Epoch: 2200.00
At Epoch: 2300.00
At Epoch: 2400.00
At Epoch: 2500.00
At Epoch: 2600.00
At Epoch: 2700.00
At Epoch: 2800.00
At Epoch: 2900.00
At Epoch: 3000.00
At Epoch: 3100.00
At Epoch: 3200.00


In [50]:
# Stack the per-sequence (1, N) rows and drop the singleton axis -> (n_sequences, N).
hn_vector = np.asarray(hn_vector).squeeze(axis=1)
print(hn_vector.shape)

(3255, 128)


In [51]:
# Persist the seq2seq-encoder hidden-state matrix for motor_toolkit.
np.save(file="../data/hn_s2sencoder_motortoolkit.npy", arr=hn_vector)

In [52]:
# Round-trip check: reload the file that was just written and show its shape.
# NOTE(review): the variable is still called hn_lstm_toolkit although it now holds
# the seq2seq-encoder embeddings.
hn_lstm_toolkit = np.load(file="../data/hn_s2sencoder_motortoolkit.npy")
hn_lstm_toolkit.shape

(3255, 128)

#### Output for kaggle pfam

In [53]:
# One forward pass per df_dev row (sequence in column 6); keep the encoder's
# final hidden state h_n flattened to a single (1, N) row.
print_every = 10000
hn_vector = []
with torch.no_grad():
    for row_idx in range(df_dev.shape[0]):
        encoded = prepare_eval(df_dev.iloc[row_idx, 6]).to(device=device)
        _outputs, (final_hidden, _final_cell) = model(encoded)
        hn_vector.append(final_hidden.cpu().detach().numpy().reshape(1, -1))
        # Progress marker (the counter is a row index, despite the "Epoch" label).
        if row_idx % print_every == 0:
            print("At Epoch: %.2f" % row_idx)

At Epoch: 0.00
At Epoch: 10000.00
At Epoch: 20000.00
At Epoch: 30000.00
At Epoch: 40000.00
At Epoch: 50000.00
At Epoch: 60000.00
At Epoch: 70000.00
At Epoch: 80000.00
At Epoch: 90000.00
At Epoch: 100000.00
At Epoch: 110000.00
At Epoch: 120000.00
At Epoch: 130000.00
At Epoch: 140000.00
At Epoch: 150000.00
At Epoch: 160000.00
At Epoch: 170000.00
At Epoch: 180000.00
At Epoch: 190000.00
At Epoch: 200000.00
At Epoch: 210000.00
At Epoch: 220000.00
At Epoch: 230000.00
At Epoch: 240000.00
At Epoch: 250000.00
At Epoch: 260000.00
At Epoch: 270000.00
At Epoch: 280000.00
At Epoch: 290000.00
At Epoch: 300000.00
At Epoch: 310000.00
At Epoch: 320000.00
At Epoch: 330000.00
At Epoch: 340000.00
At Epoch: 350000.00
At Epoch: 360000.00
At Epoch: 370000.00
At Epoch: 380000.00
At Epoch: 390000.00
At Epoch: 400000.00
At Epoch: 410000.00
At Epoch: 420000.00
At Epoch: 430000.00
At Epoch: 440000.00
At Epoch: 450000.00
At Epoch: 460000.00
At Epoch: 470000.00
At Epoch: 480000.00
At Epoch: 490000.00
At Epoch: 5000

In [54]:
# Stack the per-sequence (1, N) rows, drop the singleton axis, report the shape and save.
hn_vector = np.asarray(hn_vector).squeeze(axis=1)
print(hn_vector.shape)
np.save(file="../data/hn_s2sencoder_dfdev.npy", arr=hn_vector)

(1212912, 128)


#### Output for pfam molecular motors clans

In [55]:
# One forward pass per pfamA_motors row (sequence in column 3); keep the encoder's
# final hidden state h_n flattened to a single (1, N) row.
print_every = 10000
hn_vector = []
with torch.no_grad():
    for row_idx in range(pfamA_motors.shape[0]):
        encoded = prepare_eval(pfamA_motors.iloc[row_idx, 3]).to(device=device)
        _outputs, (final_hidden, _final_cell) = model(encoded)
        hn_vector.append(final_hidden.cpu().detach().numpy().reshape(1, -1))
        # Progress marker (the counter is a row index, despite the "Epoch" label).
        if row_idx % print_every == 0:
            print("At Epoch: %.2f" % row_idx)

At Epoch: 0.00
At Epoch: 10000.00
At Epoch: 20000.00
At Epoch: 30000.00
At Epoch: 40000.00
At Epoch: 50000.00
At Epoch: 60000.00
At Epoch: 70000.00
At Epoch: 80000.00
At Epoch: 90000.00
At Epoch: 100000.00
At Epoch: 110000.00
At Epoch: 120000.00
At Epoch: 130000.00
At Epoch: 140000.00
At Epoch: 150000.00
At Epoch: 160000.00
At Epoch: 170000.00
At Epoch: 180000.00
At Epoch: 190000.00
At Epoch: 200000.00
At Epoch: 210000.00
At Epoch: 220000.00
At Epoch: 230000.00
At Epoch: 240000.00
At Epoch: 250000.00
At Epoch: 260000.00
At Epoch: 270000.00
At Epoch: 280000.00
At Epoch: 290000.00
At Epoch: 300000.00
At Epoch: 310000.00
At Epoch: 320000.00
At Epoch: 330000.00
At Epoch: 340000.00
At Epoch: 350000.00
At Epoch: 360000.00
At Epoch: 370000.00
At Epoch: 380000.00
At Epoch: 390000.00
At Epoch: 400000.00
At Epoch: 410000.00
At Epoch: 420000.00
At Epoch: 430000.00
At Epoch: 440000.00
At Epoch: 450000.00
At Epoch: 460000.00
At Epoch: 470000.00
At Epoch: 480000.00
At Epoch: 490000.00
At Epoch: 5000

In [56]:
# Stack the per-sequence (1, N) rows, drop the singleton axis, report the shape and save.
hn_vector = np.asarray(hn_vector).squeeze(axis=1)
print(hn_vector.shape)
np.save(file="../data/hn_s2sencoder_pfammotors.npy", arr=hn_vector)

(1914831, 128)


In [57]:
# Completion marker for the seq2seq-encoder embedding exports above.
print("done")

done


## Define transformer encoder with pre-trained weights

In [7]:
class PositionalEncoding(nn.Module):
    """
    PositionalEncoding module injects some information about the relative or absolute position of
    the tokens in the sequence. The positional encodings have the same dimension as the embeddings 
    so that the two can be summed. Here, we use sine and cosine functions of different frequencies.

    Args:
        d_model: embedding width (even or odd).
        dropout: dropout probability applied after adding the encodings.
        max_len: number of positions precomputed; longer inputs cannot be encoded.
    """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        # Even columns carry the sine terms, odd columns the cosine terms.
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns, so
        # the cosine block is trimmed to fit; for an even d_model this is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model): a singleton batch axis in the middle.
        pe = pe.unsqueeze(0).transpose(0, 1)

        # A buffer is saved in the state_dict and moved by .to(device), but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add positional encodings to ``x`` and apply dropout.

        ``x`` is used in this notebook as a (seq_len, d_model) tensor; the result
        has shape (seq_len, 1, d_model).
        """
        # Insert the singleton batch axis so x lines up with the stored encodings.
        x = x.unsqueeze(0).transpose(0, 1)
        # Buffers never require grad, so the slice is added directly (no Variable wrapper).
        x = x + self.pe[:x.size(0), :, :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Transformer encoder over token indices with a linear decoder head.

    Args:
        ntoken: vocabulary size (embedding rows and decoder outputs).
        ninp: embedding / model width.
        nhead: number of attention heads per encoder layer.
        nhid: width of the feed-forward sub-layer in each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability passed to the encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        # No attention mask is applied in forward(); the attribute is kept for
        # code that may still read it.
        self.src_mask = None
        # NOTE(review): PositionalEncoding is built with its own default dropout,
        # not the `dropout` argument — confirm this is intended.
        self.pos_encoder = PositionalEncoding(ninp)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        """Initialise embedding and decoder weights uniformly in [-0.1, 0.1]; zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """Encode one sequence of token indices.

        Returns:
            output: decoder scores computed from the encoded sequence.
            output_encoded: the encoder stack's output (what the notebook pools
                into per-sequence embeddings).
        """
        # Scale the embeddings by sqrt(ninp) before adding positional encodings.
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        # Run the encoder stack once, unmasked. A second identical pass would only
        # double the compute, because just one result is ever returned.
        output_encoded = self.transformer_encoder(src)
        output = self.decoder(output_encoded)
        return output,output_encoded

In [11]:
# Transformer hyperparameters.
ntokens = len(aminoacid_list) + 1  # the size of vocabulary
emsize = 768                       # embedding dimension
nhead = 12                         # the number of heads in the multiheadattention models
nhid = 200                         # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6                        # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
dropout = 0.1                      # the dropout value

# Build the model and move it onto `device`.
model = TransformerModel(
    ntoken=ntokens,
    ninp=emsize,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    dropout=dropout,
)
model = model.to(device)


In [12]:
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU also loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load("../data/transformer_encoder_201012.pt", map_location=device))
# Switch to inference mode (disables dropout); as the last expression it also
# displays the module summary.
model.eval()

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (linear1): Linear(in_features=768, out_features=200, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=200, out_features=768, bias=True)
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (linear1): Linear(in_features=768, out_features=20

#### Output hidden states for motor toolkit dataset

In [22]:
# One forward pass per motor_toolkit row (sequence in column 7). The encoder
# output is summed over the sequence axis, giving one (1, N) row per sequence.
hn_vector = []
# Row positions left out of hn_vector because the sequence is too long. Without
# this record the saved matrix cannot be matched back to motor_toolkit rows.
skipped_rows = []
print_every = 100
# PositionalEncoding precomputes max_len=5000 positions by default, so longer
# sequences cannot be encoded and are skipped.
max_len = 5000
with torch.no_grad():
    for epoch in range(motor_toolkit.shape[0]):
        seq = motor_toolkit.iloc[epoch, 7]
        if len(seq) > max_len:
            skipped_rows.append(epoch)
            continue
        sentence_in = prepare_eval(seq).to(device=device)
        _, hn = model(sentence_in)
        # Pool over sequence positions (axis 0).
        hn_vector.append(hn.sum(dim=0).cpu().detach().numpy())
        if epoch % print_every == 0:
            print("At Epoch: %.2f" % epoch)
if skipped_rows:
    print("Skipped %d of %d sequences longer than %d residues (first row positions: %s); "
          "hn_vector rows are not aligned 1:1 with motor_toolkit rows."
          % (len(skipped_rows), motor_toolkit.shape[0], max_len, skipped_rows[:10]))

At Epoch: 0.00
At Epoch: 100.00
At Epoch: 200.00
At Epoch: 300.00
At Epoch: 400.00
At Epoch: 500.00
At Epoch: 600.00
At Epoch: 700.00
At Epoch: 800.00
At Epoch: 900.00
At Epoch: 1000.00
At Epoch: 1100.00
At Epoch: 1200.00
At Epoch: 1300.00
At Epoch: 1400.00
At Epoch: 1500.00
At Epoch: 1600.00
At Epoch: 1700.00
At Epoch: 1800.00
At Epoch: 1900.00
At Epoch: 2000.00
At Epoch: 2100.00
At Epoch: 2200.00
At Epoch: 2300.00
At Epoch: 2400.00
At Epoch: 2500.00
At Epoch: 2600.00
At Epoch: 2700.00
At Epoch: 2800.00
At Epoch: 2900.00
At Epoch: 3000.00
At Epoch: 3100.00
At Epoch: 3200.00


In [24]:
# Stack the per-sequence (1, N) rows, drop the singleton axis, report the shape and save.
hn_vector = np.asarray(hn_vector).squeeze(axis=1)
print(hn_vector.shape)
np.save(file="../data/hn_transformerencoder_motortoolkit.npy", arr=hn_vector)

(3235, 768)


In [27]:
# Compare against the saved embedding matrix: the table has 3255 rows but only 3235
# embeddings were written, because sequences longer than max_len were skipped.
motor_toolkit.shape

(3255, 9)

#### Output hidden states for df_dev dataset

In [25]:
# One forward pass per df_dev row (sequence in column 6). The encoder output is
# summed over the sequence axis, giving one (1, N) row per sequence.
hn_vector = []
# Row positions left out of hn_vector because the sequence is too long. Without
# this record the saved matrix cannot be matched back to df_dev rows.
skipped_rows = []
print_every = 1000
# PositionalEncoding precomputes max_len=5000 positions by default, so longer
# sequences cannot be encoded and are skipped.
max_len = 5000
with torch.no_grad():
    for epoch in range(df_dev.shape[0]):
        seq = df_dev.iloc[epoch, 6]
        if len(seq) > max_len:
            skipped_rows.append(epoch)
            continue
        sentence_in = prepare_eval(seq).to(device=device)
        _, hn = model(sentence_in)
        # Pool over sequence positions (axis 0).
        hn_vector.append(hn.sum(dim=0).cpu().detach().numpy())
        if epoch % print_every == 0:
            print("At Epoch: %.2f" % epoch)
if skipped_rows:
    print("Skipped %d of %d sequences longer than %d residues (first row positions: %s); "
          "hn_vector rows are not aligned 1:1 with df_dev rows."
          % (len(skipped_rows), df_dev.shape[0], max_len, skipped_rows[:10]))

At Epoch: 0.00
At Epoch: 1000.00
At Epoch: 2000.00
At Epoch: 3000.00
At Epoch: 4000.00
At Epoch: 5000.00
At Epoch: 6000.00
At Epoch: 7000.00
At Epoch: 8000.00
At Epoch: 9000.00
At Epoch: 10000.00
At Epoch: 11000.00
At Epoch: 12000.00
At Epoch: 13000.00
At Epoch: 14000.00
At Epoch: 15000.00
At Epoch: 16000.00
At Epoch: 17000.00
At Epoch: 18000.00
At Epoch: 19000.00
At Epoch: 20000.00
At Epoch: 21000.00
At Epoch: 22000.00
At Epoch: 23000.00
At Epoch: 24000.00
At Epoch: 25000.00
At Epoch: 26000.00
At Epoch: 27000.00
At Epoch: 28000.00
At Epoch: 29000.00
At Epoch: 30000.00
At Epoch: 31000.00
At Epoch: 32000.00
At Epoch: 33000.00
At Epoch: 34000.00
At Epoch: 35000.00
At Epoch: 36000.00
At Epoch: 37000.00
At Epoch: 38000.00
At Epoch: 39000.00
At Epoch: 40000.00
At Epoch: 41000.00
At Epoch: 42000.00
At Epoch: 43000.00
At Epoch: 44000.00
At Epoch: 45000.00
At Epoch: 46000.00
At Epoch: 47000.00
At Epoch: 48000.00
At Epoch: 49000.00
At Epoch: 50000.00
At Epoch: 51000.00
At Epoch: 52000.00
At Epo

At Epoch: 416000.00
At Epoch: 417000.00
At Epoch: 418000.00
At Epoch: 419000.00
At Epoch: 420000.00
At Epoch: 421000.00
At Epoch: 422000.00
At Epoch: 423000.00
At Epoch: 424000.00
At Epoch: 425000.00
At Epoch: 426000.00
At Epoch: 427000.00
At Epoch: 428000.00
At Epoch: 429000.00
At Epoch: 430000.00
At Epoch: 431000.00
At Epoch: 432000.00
At Epoch: 433000.00
At Epoch: 434000.00
At Epoch: 435000.00
At Epoch: 436000.00
At Epoch: 437000.00
At Epoch: 438000.00
At Epoch: 439000.00
At Epoch: 440000.00
At Epoch: 441000.00
At Epoch: 442000.00
At Epoch: 443000.00
At Epoch: 444000.00
At Epoch: 445000.00
At Epoch: 446000.00
At Epoch: 447000.00
At Epoch: 448000.00
At Epoch: 449000.00
At Epoch: 450000.00
At Epoch: 451000.00
At Epoch: 452000.00
At Epoch: 453000.00
At Epoch: 454000.00
At Epoch: 455000.00
At Epoch: 456000.00
At Epoch: 457000.00
At Epoch: 458000.00
At Epoch: 459000.00
At Epoch: 460000.00
At Epoch: 461000.00
At Epoch: 462000.00
At Epoch: 463000.00
At Epoch: 464000.00
At Epoch: 465000.00


At Epoch: 826000.00
At Epoch: 827000.00
At Epoch: 828000.00
At Epoch: 829000.00
At Epoch: 830000.00
At Epoch: 831000.00
At Epoch: 832000.00
At Epoch: 833000.00
At Epoch: 834000.00
At Epoch: 835000.00
At Epoch: 836000.00
At Epoch: 837000.00
At Epoch: 838000.00
At Epoch: 839000.00
At Epoch: 840000.00
At Epoch: 841000.00
At Epoch: 842000.00
At Epoch: 843000.00
At Epoch: 844000.00
At Epoch: 845000.00
At Epoch: 846000.00
At Epoch: 847000.00
At Epoch: 848000.00
At Epoch: 849000.00
At Epoch: 850000.00
At Epoch: 851000.00
At Epoch: 852000.00
At Epoch: 853000.00
At Epoch: 854000.00
At Epoch: 855000.00
At Epoch: 856000.00
At Epoch: 857000.00
At Epoch: 858000.00
At Epoch: 859000.00
At Epoch: 860000.00
At Epoch: 861000.00
At Epoch: 862000.00
At Epoch: 863000.00
At Epoch: 864000.00
At Epoch: 865000.00
At Epoch: 866000.00
At Epoch: 867000.00
At Epoch: 868000.00
At Epoch: 869000.00
At Epoch: 870000.00
At Epoch: 871000.00
At Epoch: 872000.00
At Epoch: 873000.00
At Epoch: 874000.00
At Epoch: 875000.00


In [26]:
# Stack the per-sequence (1, N) rows, drop the singleton axis, report the shape and save.
hn_vector = np.asarray(hn_vector).squeeze(axis=1)
print(hn_vector.shape)
np.save(file="../data/hn_transformerencoder_dfdev.npy", arr=hn_vector)

(1212912, 768)


In [28]:
# Compare against the saved embedding matrix: both have 1212912 rows, so no df_dev
# sequence was skipped for exceeding max_len.
df_dev.shape 

(1212912, 8)

#### Output hidden states for molecular motors dataset

In [29]:
# One forward pass per pfamA_motors row (sequence in column 3). The encoder output
# is summed over the sequence axis, giving one (1, N) row per sequence.
hn_vector = []
# Row positions left out of hn_vector because the sequence is too long. Without
# this record the saved matrix cannot be matched back to pfamA_motors rows.
skipped_rows = []
print_every = 10000
# PositionalEncoding precomputes max_len=5000 positions by default, so longer
# sequences cannot be encoded and are skipped.
max_len = 5000
with torch.no_grad():
    for epoch in range(pfamA_motors.shape[0]):
        seq = pfamA_motors.iloc[epoch, 3]
        if len(seq) > max_len:
            skipped_rows.append(epoch)
            continue
        sentence_in = prepare_eval(seq).to(device=device)
        _, hn = model(sentence_in)
        # Pool over sequence positions (axis 0).
        hn_vector.append(hn.sum(dim=0).cpu().detach().numpy())
        if epoch % print_every == 0:
            print("At Epoch: %.2f" % epoch)
if skipped_rows:
    print("Skipped %d of %d sequences longer than %d residues (first row positions: %s); "
          "hn_vector rows are not aligned 1:1 with pfamA_motors rows."
          % (len(skipped_rows), pfamA_motors.shape[0], max_len, skipped_rows[:10]))

At Epoch: 0.00
At Epoch: 10000.00
At Epoch: 20000.00
At Epoch: 30000.00
At Epoch: 40000.00
At Epoch: 50000.00
At Epoch: 60000.00
At Epoch: 70000.00
At Epoch: 80000.00
At Epoch: 90000.00
At Epoch: 100000.00
At Epoch: 110000.00
At Epoch: 120000.00
At Epoch: 130000.00
At Epoch: 140000.00
At Epoch: 150000.00
At Epoch: 160000.00
At Epoch: 170000.00
At Epoch: 180000.00
At Epoch: 190000.00
At Epoch: 200000.00
At Epoch: 210000.00
At Epoch: 220000.00
At Epoch: 230000.00
At Epoch: 240000.00
At Epoch: 250000.00
At Epoch: 260000.00
At Epoch: 270000.00
At Epoch: 280000.00
At Epoch: 290000.00
At Epoch: 300000.00
At Epoch: 310000.00
At Epoch: 320000.00
At Epoch: 330000.00
At Epoch: 340000.00
At Epoch: 350000.00
At Epoch: 360000.00
At Epoch: 370000.00
At Epoch: 380000.00
At Epoch: 390000.00
At Epoch: 400000.00
At Epoch: 410000.00
At Epoch: 420000.00
At Epoch: 430000.00
At Epoch: 440000.00
At Epoch: 450000.00
At Epoch: 460000.00
At Epoch: 470000.00
At Epoch: 480000.00
At Epoch: 490000.00
At Epoch: 5000

In [30]:
# Stack the per-sequence (1, N) rows, drop the singleton axis, report the shape and save.
hn_vector = np.asarray(hn_vector).squeeze(axis=1)
print(hn_vector.shape)
np.save(file="../data/hn_transformerencoder_pfammotors.npy", arr=hn_vector)

(1914831, 768)
