# Documentation
> 201025: This notebook generates embedding vectors for pfam_motors, df_dev, and motor_toolkit from the models that have finished training so far:
    - lstm5
        - evotune_lstm_5_balanced.pt
        - evotune_lstm_5_balanced_target.pt
        - mini_lstm_5_balanced.pt
        - mini_lstm_5_balanced_target.pt
    - transformer_encoder
        - evotune_seq2seq_encoder_balanced.pt
        - evotune_seq2seq_encoder_balanced_target.pt
        - mini_seq2seq_encoder_balanced.pt
        - mini_seq2seq_encoder_balanced_target.pt
    - seq2seq_attention_mini
        - transformer_encoder_201025.pt
        - evotune_transformerencoder_balanced.pt
        - evotune_transformerencoder_balanced_target.pt
        - mini_evotune_transformerencoder_balanced.pt
        - mini_evotune_transformerencoder_balanced_target.pt
        

- This run produces the output for motor_toolkit.

In [4]:
import torch
import torch.nn as nn 
import torch.optim as optim 

import torchvision 
import torchvision.transforms as transforms 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, IterableDataset, DataLoader
# import tqdm
import numpy as np
import pandas as pd

import math

In [5]:
# Load the pfamA_motors, df_dev and motor_toolkit tables from CSV.
# Paths are relative to the working directory the notebook is run from.
pfamA_motors = pd.read_csv("../../data/pfamA_motors.csv")
df_dev = pd.read_csv("../../data/df_dev.csv")
# NOTE(review): the file name is spelled "motor_tookits" — confirm it matches
# the file on disk before "correcting" it.
motor_toolkit = pd.read_csv("../../data/motor_tookits.csv")

# One-letter amino-acid codes that make up the model vocabulary.
aminoacid_list = [
    'A', 'C', 'D', 'E', 'F',
    'G', 'H', 'I', 'K', 'L',
    'M', 'N', 'P', 'Q', 'R',
    'S', 'T', 'V', 'W', 'Y',
]
# Clan labels, in the order that defines their integer ids.
clan_list = [
    "actin_like",
    "tubulin_c",
    "tubulin_binding",
    "p_loop_gtpase",
]

# Amino acids are numbered from 1; index 0 is left free for symbols that are
# not in the vocabulary. Clans are numbered from 0.
aa_to_ix = {
    aa: ix
    for aa, ix in zip(aminoacid_list, np.arange(1, len(aminoacid_list) + 1))
}
clan_to_ix = {
    clan: ix
    for clan, ix in zip(clan_list, np.arange(len(clan_list)))
}

def word_to_index(seq,to_ix):
    """Map every symbol of ``seq`` to its integer id in ``to_ix``.

    Symbols missing from ``to_ix`` are encoded as 0.
    """
    indices = []
    for symbol in seq:
        indices.append(to_ix.get(symbol, 0))
    return indices

# Reverse lookup tables: integer id -> symbol, obtained by inverting the
# forward vocabularies.
ix_to_aa = {ix: aa for aa, ix in aa_to_ix.items()}
ix_to_clan = {ix: clan for clan, ix in clan_to_ix.items()}

def index_to_word(ixs,ix_to):
    """Map every integer id in ``ixs`` back to its symbol via ``ix_to``.

    Ids missing from ``ix_to`` are decoded as 'X'.
    """
    symbols = []
    for index in ixs:
        symbols.append(ix_to.get(index, 'X'))
    return symbols

def prepare_sequence(seq):
    """Encode ``seq`` without its last symbol as a 1-D long tensor.

    This is the input side of a next-symbol pair; ``prepare_labels`` yields
    the matching targets.
    """
    model_input = seq[:-1]
    return torch.tensor(word_to_index(model_input, aa_to_ix), dtype=torch.long)

def prepare_labels(seq):
    """Encode ``seq`` without its first symbol as a 1-D long tensor.

    This is the target side of a next-symbol pair; ``prepare_sequence``
    yields the matching inputs.
    """
    targets = seq[1:]
    return torch.tensor(word_to_index(targets, aa_to_ix), dtype=torch.long)

def prepare_eval(seq):
    """Encode the whole of ``seq`` as a 1-D long tensor (nothing trimmed)."""
    full_sequence = seq[:]
    encoded = word_to_index(full_sequence, aa_to_ix)
    return torch.tensor(encoded, dtype=torch.long)

# Quick smoke call: 'X' is not in the vocabulary, so it encodes as 0.
prepare_labels('YCHXXXXX')

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [6]:
# Hyperparameters
# Vocabulary size: one slot per amino acid plus index 0, which word_to_index
# uses for symbols outside the vocabulary. Input and output share it.
output_size = len(aminoacid_list) + 1
input_size = len(aminoacid_list) + 1
# Network shape.
embedding_size = 10
hidden_size = 128
num_layers = 1
# Optimiser step size.
learning_rate = 0.001

# Create Bidirectional LSTM
class BRNN(nn.Module):
    """Bidirectional LSTM over one sequence of embedded symbol indices.

    The model embeds each index, runs the embeddings through a bidirectional
    LSTM with a batch size of one, and projects every time step onto
    ``output_size`` log-probabilities.

    Args:
        input_size: Number of distinct input indices (embedding rows).
        embedding_size: Dimension of each symbol embedding.
        hidden_size: Hidden units per LSTM direction.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of classes scored at every position.
    """

    def __init__(self,input_size, embedding_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        # Parameter-free module, so it does not appear in the state dict.
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.aa_embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(input_size=embedding_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            bidirectional=True)
        # The forward and backward outputs are concatenated, hence the
        # doubled input width of the projection layer.
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def aa_encoder(self, input):
        """Map symbol indices to their vectors in the embedding space.

        Args:
            input: Long tensor of symbol indices.

        Returns:
            Tensor of embeddings with a trailing dimension of
            ``embedding_size``.
        """
        # The embedding layer is registered as ``aa_embedding``; the former
        # ``self.embedding`` lookup raised AttributeError on every call.
        return self.aa_embedding(input)

    def forward(self,seq):
        """Score every position of one sequence and return the final hidden state.

        Args:
            seq: Long tensor of symbol indices for a single sequence
                (assumed 1-D, length L — the reshape below fixes the batch
                size at one).

        Returns:
            Tuple ``(decoded_scores, hn)``: ``decoded_scores`` holds
            log-probabilities of shape ``(L, output_size)``; ``hn`` is the
            LSTM's final hidden state, one slice per layer and direction.
        """
        seq_len = len(seq)
        # Embed each index, then lay the sequence out as
        # (seq_len, batch=1, embedding_size), which is what nn.LSTM expects
        # by default.
        embedded = self.aa_embedding(seq).view(seq_len, 1, -1)

        # h0 and c0 are omitted on purpose: nn.LSTM defaults both to zeros.
        # ``out`` has shape (seq_len, 1, 2 * hidden_size).
        out, (hn, cn) = self.lstm(embedded)

        # Drop the batch dimension and project each step onto the classes.
        decoded_space = self.fc(out.view(seq_len, -1))
        decoded_scores = self.log_softmax(decoded_space)
        return decoded_scores, hn


In [7]:
# initialize network
# Build the bidirectional LSTM from the hyperparameters above, then move its
# parameters onto the selected device.
model = BRNN(
    input_size,
    embedding_size,
    hidden_size,
    num_layers,
    output_size,
)
model = model.to(device)

In [None]:
# Restore the trained weights. map_location remaps the stored tensors onto
# the active device, so a checkpoint saved on a GPU still loads when this
# notebook falls back to the CPU.
# NOTE(review): this cell reads and writes "../data/" while the CSVs are
# loaded from "../../data/" — confirm the directory is intended.
state_dict = torch.load("../data/bidirectional_lstm_5_201008.pt", map_location=device)
model.load_state_dict(state_dict)
model.eval()

hn_vector = []
print_every = 100
# Inference only: one no_grad context covers the whole pass.
with torch.no_grad():
    for epoch in range(motor_toolkit.shape[0]):
        # Column 7 is presumably the sequence column — verify against the CSV.
        seq = motor_toolkit.iloc[epoch, 7]
        sentence_in = prepare_eval(seq).to(device=device)
        decoded_scores, hn = model(sentence_in)
        # Flatten the final hidden state (all layers and directions) into
        # one embedding vector per sequence.
        hn_vector.append(hn.cpu().numpy().reshape(-1))
        if epoch % print_every == 0:
            print(f"At Epoch: {epoch:.2f}")

# Stack the per-sequence vectors into a (n_sequences, n_features) array.
hn_vector = np.stack(hn_vector)
print(hn_vector.shape)
np.save("../data/hn_lstm5_motortoolkit.npy", hn_vector)