# Documentation
> 201025: This notebook generates embedding vectors for pfam_motors, df_dev, and motor_toolkit from the models that have finished training so far:
    - lstm5  
          - evotune_lstm_5_balanced.pt  
          - evotune_lstm_5_balanced_target.pt  
          - mini_lstm_5_balanced.pt   
          - mini_lstm_5_balanced_target.pt   
    - transformer_encoder   
          - evotune_seq2seq_encoder_balanced.pt   
          - evotune_seq2seq_encoder_balanced_target.pt  
          - mini_seq2seq_encoder_balanced.pt  
          - mini_seq2seq_encoder_balanced_target.pt  
    - seq2seq_attention_mini  
          - transformer_encoder_201025.pt  
          - evotune_transformerencoder_balanced.pt  
          - evotune_transformerencoder_balanced_target.pt  
          - mini_evotune_transformerencoder_balanced.pt  
          - mini_evotune_transformerencoder_balanced_target.pt  
        

- Outputs are generated for motor_toolkit, pfamA_random, and pfamA_motors.


In [2]:
import torch
import torch.nn as nn 
import torch.optim as optim 

import torchvision 
import torchvision.transforms as transforms 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, IterableDataset, DataLoader
# import tqdm
import numpy as np
import pandas as pd

import math
# Seed PyTorch and NumPy with the same value so that repeated runs draw the
# same random numbers (e.g. for weight initialisation).
seed = 7
for _seed_rng in (torch.manual_seed, np.random.seed):
    _seed_rng(seed)

In [16]:
# Load the three sequence tables that embeddings are generated for.
pfamA_motors = pd.read_csv("../../data/pfamA_motors.csv")
pfamA_random = pd.read_csv("../../data/pfamA_random_201027.csv")
motor_toolkit = pd.read_csv("../../data/motor_tookits.csv")

# Balance the motor families: draw 4500 rows from every clan with a fixed
# random_state, then replace the (clan, original row) index that
# groupby/apply produces with a plain 0..n-1 index.
pfamA_motors_balanced = (
    pfamA_motors
    .groupby('clan')
    .apply(lambda _df: _df.sample(4500, random_state=1))
    .reset_index(drop=True)
)

# Pfam accessions that define the "target" subset of pfamA_motors.
pfamA_target_name = [
    "PF00349", "PF00022", "PF03727", "PF06723",
    "PF14450", "PF03953", "PF12327", "PF00091", "PF10644",
    "PF13809", "PF14881", "PF00063", "PF00225", "PF03028",
]

# Keep only the rows whose accession is in the target list (not balanced).
pfamA_target = pfamA_motors.loc[pfamA_motors["pfamA_acc"].isin(pfamA_target_name), :]


# One-letter codes of the 20 amino acids, in the order that fixes their indices.
aminoacid_list = list("ACDEFGHIKLMNPQRSTVWY")
clan_list = ["actin_like", "tubulin_c", "tubulin_binding", "p_loop_gtpase"]

# Amino acids are numbered 1..20; index 0 is left free for symbols that are
# not in the list. Clans are numbered 0..3.
aa_to_ix = {aa: ix for aa, ix in zip(aminoacid_list, np.arange(1, 21))}
clan_to_ix = {clan: ix for clan, ix in zip(clan_list, np.arange(0, 4))}

def word_to_index(seq,to_ix):
    """Translate each element of `seq` into its integer index.

    Elements that are missing from the `to_ix` mapping are encoded as 0.
    Returns a list with one index per element of `seq`.
    """
    indices = []
    for word in seq:
        indices.append(to_ix.get(word, 0))
    return indices

# Inverse lookup tables (index -> symbol) for amino acids (1..20) and clans (0..3).
ix_to_aa = {ix: aa for ix, aa in zip(np.arange(1, 21), aminoacid_list)}
ix_to_clan = {ix: clan for ix, clan in zip(np.arange(0, 4), clan_list)}

def index_to_word(ixs,ix_to):
    """Translate each index in `ixs` back into its symbol.

    Indices that are missing from the `ix_to` mapping are decoded as 'X'.
    Returns a list with one symbol per index.
    """
    words = []
    for ix in ixs:
        words.append(ix_to.get(ix, 'X'))
    return words

def prepare_sequence(seq):
    """Encode every element of `seq` except the last one as a 1-D LongTensor.

    Elements are looked up in `aa_to_ix`; unknown symbols become 0.
    """
    input_indices = word_to_index(seq[:-1], aa_to_ix)
    return torch.tensor(input_indices, dtype=torch.long)

def prepare_labels(seq):
    """Encode every element of `seq` except the first one as a 1-D LongTensor.

    Elements are looked up in `aa_to_ix`; unknown symbols become 0.
    """
    label_indices = word_to_index(seq[1:], aa_to_ix)
    return torch.tensor(label_indices, dtype=torch.long)

def prepare_eval(seq):
    """Encode the complete `seq` as a 1-D LongTensor.

    Elements are looked up in `aa_to_ix`; unknown symbols become 0.
    """
    all_indices = word_to_index(seq[:], aa_to_ix)
    return torch.tensor(all_indices, dtype=torch.long)

# Sanity-check the encoding helpers: the first symbol is dropped
# ('C' -> 2, 'H' -> 7) and the unknown symbol 'X' falls back to index 0.
assert prepare_labels('YCHXXXXX').tolist() == [2, 7, 0, 0, 0, 0, 0]

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [17]:
# Hyperparameters
# Vocabulary size: the 20 amino acids plus index 0 for unknown symbols.
input_size = output_size = len(aminoacid_list) + 1
# Width of the learned per-symbol embedding fed into the LSTM.
embedding_size = 10
# LSTM size; presumably has to match the checkpoints that get loaded -- TODO confirm.
hidden_size = 64
num_layers = 1
# Not referenced by any cell visible in this notebook.
learning_rate = 0.001

class s2s_Encoder(nn.Module):
    """Embedding layer followed by an (optionally bidirectional) LSTM.

    Args:
        input_size: number of distinct symbol indices (rows of the embedding table).
        embedding_size: width of each symbol embedding.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        output_size: stored on the instance; not used by `forward`.
        batch_first: forwarded to `nn.LSTM` and stored on the instance.
        bidirectional: forwarded to `nn.LSTM`.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size, batch_first=False, bidirectional=True):
        super().__init__()
        # Plain bookkeeping attributes.
        self.embedding_size, self.hidden_size = embedding_size, hidden_size
        self.output_size, self.num_layers = output_size, num_layers
        self.batch_first = batch_first

        # Sub-modules; log_softmax is created but never applied in forward().
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.aa_embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=batch_first,
            bidirectional=bidirectional,
        )

    def forward(self, seq):
        """Encode one sequence of symbol indices.

        Returns the tuple produced by the LSTM: `(outputs, (h_n, c_n))`.
        """
        # Look up the embedding of every position in the sequence.
        embedded = self.aa_embedding(seq)
        # Reshape to (len(seq), 1, features): a single sequence with batch size 1.
        lstm_input = embedded.view(len(seq), 1, -1)
        # The LSTM yields the per-position outputs and the final (hidden, cell) state.
        return self.rnn(lstm_input)

In [18]:
def generate_embedding_lstm(dict_file,dat,dat_name,out_path,out_dir,seq_col):
    """Embed every sequence of `dat` with a trained s2s_Encoder and save the result.

    A fresh encoder is built from the module-level hyperparameters, its weights
    are loaded from `dict_file`, and each row's sequence is run through it one
    at a time. The final LSTM hidden state of each sequence is flattened into
    one row of the output matrix.

    Args:
        dict_file: path to a saved state_dict for s2s_Encoder.
        dat: table of sequences; read with `dat.shape[0]` and `dat.iloc[row, seq_col]`.
        dat_name: dataset label, used in log lines and in the output file name.
        out_path: file-name suffix of the output (e.g. "raw.npy").
        out_dir: prefix of the output path; the array is written to
            `out_dir + dat_name + "_" + out_path`.
        seq_col: positional index of the column that holds the sequence string.

    Returns:
        None. The array of shape (number of rows, flattened hidden state size)
        is written with `np.save`.
    """
    import os  # local import: only needed to prepare the output folder

    # initialize network
    model = s2s_Encoder(input_size = input_size, \
                               embedding_size = embedding_size, \
                               hidden_size = hidden_size, \
                               num_layers = num_layers, \
                               output_size = output_size).to(device)
    # map_location lets a checkpoint that was saved on a GPU load on a
    # CPU-only machine (the notebook falls back to CPU when CUDA is absent).
    model.load_state_dict(torch.load(dict_file, map_location=device))
    print("loaded dict file for weights " + dict_file)
    print("output embedding for " + dat_name)
    model.eval()

    # Make sure the destination folder exists *before* the long embedding
    # loop, so a missing folder cannot discard the finished computation.
    out_file = out_dir+dat_name+"_"+out_path
    out_folder = os.path.dirname(out_file)
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)

    hn_vector = []
    print_every = 1000
    # Inference only: no gradients are needed for any of the rows.
    with torch.no_grad():
        for row_ix in range(dat.shape[0]):
            seq = dat.iloc[row_ix, seq_col]
            sentence_in = prepare_eval(seq).to(device = device)
            _, (hn, _) = model(sentence_in)
            # Flatten the final hidden state (all layers/directions) into one row.
            hn_vector.append(hn.cpu().numpy().reshape(1,-1))
            if row_ix % print_every == 0:
                # The label says "Epoch" but the number is the row index.
                print("At Epoch: %.2f"% row_ix)
                print(seq)
    # Stack the (1, D) rows into a single (N, D) matrix.
    hn_vector = np.concatenate(hn_vector, axis=0)
    print(hn_vector.shape)
    print(out_file)
    np.save(out_file, hn_vector)
    return



In [26]:
# Checkpoints from the 201025 training run, all stored in the same folder.
checkpoint_dir = "../../data/201025/"
dict_files = [
    checkpoint_dir + checkpoint_name
    for checkpoint_name in (
        "evotune_seq2seq_encoder_balanced.pt",
        "evotune_seq2seq_encoder_balanced_target.pt",
        "mini_seq2seq_encoder_balanced.pt",
        "mini_seq2seq_encoder_balanced_target.pt",
    )
]
# One more checkpoint that lives in a different folder.
dict_files.append("../../data/first_try/seq2seq_encoder_df_dev_201012_230k.pt")
dict_files


['../../data/201025/evotune_seq2seq_encoder_balanced.pt',
 '../../data/201025/evotune_seq2seq_encoder_balanced_target.pt',
 '../../data/201025/mini_seq2seq_encoder_balanced.pt',
 '../../data/201025/mini_seq2seq_encoder_balanced_target.pt',
 '../../data/first_try/seq2seq_encoder_df_dev_201012_230k.pt']

In [20]:
# Output file-name suffixes, one per checkpoint in dict_files (same order).
out_paths = [
    "evotune_balanced.npy",
    "evotune_balanced_target.npy",
    "mini_balanced.npy",
    "mini_balanced_target.npy",
    "raw.npy",
]

In [21]:
# Prefix that every embedding file is written under; the trailing slash
# matters because output paths are built by plain string concatenation.
out_dir = "../../out/201027/embedding/seq2seq/"
# Show the output file-name suffixes for reference.
out_paths

['evotune_balanced.npy',
 'evotune_balanced_target.npy',
 'mini_balanced.npy',
 'mini_balanced_target.npy',
 'raw.npy']

In [22]:
# Checkpoints and output names are paired by position, so the two lists must
# have the same length. Fail fast here: a bare comparison would only display
# False and let a full run continue.
assert len(dict_files) == len(out_paths), (
    f"{len(dict_files)} checkpoints but {len(out_paths)} output names"
)
len(dict_files) == len(out_paths)

True

In [23]:
# Peek at one value (second row, fourth column by position) to confirm that
# positional column 3 holds the sequence string.
pfamA_target.iat[1, 3]

'NSGTGFSKLGFAGNDSPSFVFPTAIATKGPAAGGGGSGSGRPAVGNKPSFLTGGAGPASNHLSSKRGTEDLDFFIGDEATSAAAGPGKLARRRNAQATANHLGYGLHYPIRHGQIENWDHMERFWSNSIFKYLRVEPEDHYFLLTEPPLNPPENRENTAEIFFESFNCAGMYIAVQAVLALAASWTSSKVTDRSLTGTVIDSGDGVTHVIPVAEGYVIGSSIKSIPIAGRDITYFVQSLLRDRGEADSSLKTAQEIKESYCYVCPDIVKEFAKYDRDRSRFLKHTVSLPGGRQVGVDVGYERFLAPEIFFNPEIYSSDFLTPLPVVVDGVIQQSPIDVRRGLYKNIVLSGGSTLYKDFGRRLQRDIKLMVDDRIRASELRSGGARSGGLDVQVISHKRQRHGPWFGGSLLGQTPEFRSYCHTKAEYQEYGPSIVRRFQ'

In [24]:
# One entry per dataset: (table, label used in output file names,
# positional index of the column that holds the sequence).
dataset_specs = [
    (pfamA_motors_balanced, "pfamA_motors_balanced", 3),
    (pfamA_target, "pfamA_target", 3),
    (pfamA_random, "pfamA_random", 2),
    (motor_toolkit, "motor_toolkit", 7),
]
data = [table for table, _, _ in dataset_specs]
data_names = [label for _, label, _ in dataset_specs]
seq_cols = [column for _, _, column in dataset_specs]

In [25]:
# Embed every dataset with every checkpoint. The lists are paired by
# position, so check their lengths first (zip would silently truncate).
assert len(dict_files) == len(out_paths)
assert len(data) == len(data_names) == len(seq_cols)
for dict_file, out_path in zip(dict_files, out_paths):
    for dat, dat_name, seq_col in zip(data, data_names, seq_cols):
        generate_embedding_lstm(dict_file,dat,dat_name,out_path,out_dir,seq_col)

loaded dict file for weights ../../data/201025/evotune_seq2seq_encoder_balanced.pt
output embedding for pfamA_motors_balanced
At Epoch: 0.00
HQDNVHARSLMGLVRNVFEQAGLEKTALDAVAVSSGPGSYTGLRIGVSVAKGLAYALDKPVIGVGTLEALAFRAIPFSDSTDTIIPMLDARRMEVYALVMDGLGDTLISPQPFILEDNPFMEYLEKGKVFFLGDGVPKSKEILSHPNSRFVPLFNSSQSIGELAYKKFLKADFESLAYFEPNYIKEFRI
At Epoch: 1000.00
LAAEARGDRAEAARILGAGAANLVGLLDIDRVVLGGRTVAADEDAYVRGVRAVIADRAARGAGGAHVTVTVADGGDRPVAEGAAQLVLA
At Epoch: 2000.00
ARKIGIDLGTTNLLICVDNKGILVDEPSIITVDATTKKCIAAGLDARDMLGRTPKNMICIRPLKDGVVADFEATDMMLNYFLKKCDLKGMFKKNVILICHPTKITSVEKNAIRDCAYRAGAKKVYLEEEPKIAALGAGLDIGKASGNMVLDIGGGTSDIAVLSLGDIVCSTSIKTAGNKITQDILENVRIQKKMYIGEQTADEIKRRIANALVVKEPETITISGRDVETGLPHSIDINSNEVESYIRSSLQEIVHATKTILEVTPPELAADIVQHGLVLTGGGALLKNLDQLMRNELQIPVYVAENALKCVVDGCTIMLQNL
At Epoch: 3000.00
NSLPSGDQHKAQQLTADYLGALKRHLIDSLKNQLGEHHAKATPLQFILTVPAVWSDAAKEKTLQAAETAGLGQHAPILMISEPEAAATYVLFRKELGGLSTGDTFVVCDAGGGTVDLISYTIEQLEPALQVKEAAPGSGGLCGSTYLNRRFQEFLVTKLGQEEGFDNETVGDAMKKFDEEIKREYSPNVPNPNYWVPVPGLA

FileNotFoundError: [Errno 2] No such file or directory: '../../data/seq2seq_encoder_df_dev_201012_230k.pt'

In [27]:
# Embed every dataset with a single checkpoint: position 4 is the last entry
# of dict_files / out_paths (the one written as "raw.npy").
checkpoint_ix = 4
dict_file = dict_files[checkpoint_ix]
out_path = out_paths[checkpoint_ix]
# The dataset lists are paired by position (zip would silently truncate).
assert len(data) == len(data_names) == len(seq_cols)
for dat, dat_name, seq_col in zip(data, data_names, seq_cols):
    generate_embedding_lstm(dict_file,dat,dat_name,out_path,out_dir,seq_col)

loaded dict file for weights ../../data/first_try/seq2seq_encoder_df_dev_201012_230k.pt
output embedding for pfamA_motors_balanced
At Epoch: 0.00
HQDNVHARSLMGLVRNVFEQAGLEKTALDAVAVSSGPGSYTGLRIGVSVAKGLAYALDKPVIGVGTLEALAFRAIPFSDSTDTIIPMLDARRMEVYALVMDGLGDTLISPQPFILEDNPFMEYLEKGKVFFLGDGVPKSKEILSHPNSRFVPLFNSSQSIGELAYKKFLKADFESLAYFEPNYIKEFRI
At Epoch: 1000.00
LAAEARGDRAEAARILGAGAANLVGLLDIDRVVLGGRTVAADEDAYVRGVRAVIADRAARGAGGAHVTVTVADGGDRPVAEGAAQLVLA
At Epoch: 2000.00
ARKIGIDLGTTNLLICVDNKGILVDEPSIITVDATTKKCIAAGLDARDMLGRTPKNMICIRPLKDGVVADFEATDMMLNYFLKKCDLKGMFKKNVILICHPTKITSVEKNAIRDCAYRAGAKKVYLEEEPKIAALGAGLDIGKASGNMVLDIGGGTSDIAVLSLGDIVCSTSIKTAGNKITQDILENVRIQKKMYIGEQTADEIKRRIANALVVKEPETITISGRDVETGLPHSIDINSNEVESYIRSSLQEIVHATKTILEVTPPELAADIVQHGLVLTGGGALLKNLDQLMRNELQIPVYVAENALKCVVDGCTIMLQNL
At Epoch: 3000.00
NSLPSGDQHKAQQLTADYLGALKRHLIDSLKNQLGEHHAKATPLQFILTVPAVWSDAAKEKTLQAAETAGLGQHAPILMISEPEAAATYVLFRKELGGLSTGDTFVVCDAGGGTVDLISYTIEQLEPALQVKEAAPGSGGLCGSTYLNRRFQEFLVTKLGQEEGFDNETVGDAMKKFDEEIKREYSPNVPNPNYWVP