In [1]:
import torch
import torch.nn as nn 
import torch.optim as optim 

import torchvision 
import torchvision.transforms as transforms 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, IterableDataset, DataLoader
# import tqdm
import numpy as np
import pandas as pd

import math

seed = 7
torch.manual_seed(seed)
np.random.seed(seed)

In [2]:
pfamA_motors = pd.read_csv("../../data/pfamA_motors.csv")
df_dev = pd.read_csv("../../data/df_dev.csv")
motor_toolkit = pd.read_csv("../../data/motor_tookits.csv")
pfamA_motors_balanced = pfamA_motors.groupby('clan').apply(lambda _df: _df.sample(4500,random_state=1))
pfamA_motors_balanced = pfamA_motors_balanced.apply(lambda x: x.reset_index(drop = True))
pfamA_target_name = ["PF00349","PF00022","PF03727","PF06723",\
                       "PF14450","PF03953","PF12327","PF00091","PF10644",\
                      "PF13809","PF14881","PF00063","PF00225","PF03028"]

pfamA_target = pfamA_motors.loc[pfamA_motors["pfamA_acc"].isin(pfamA_target_name),:]

# shuffle pfamA_target and pfamA_motors_balanced
pfamA_target = pfamA_target.sample(frac = 1)
pfamA_target_ind = pfamA_target.iloc[:,0]
print(pfamA_target_ind[0:5])
print(pfamA_motors_balanced.shape)

pfamA_motors_balanced = pfamA_motors_balanced.sample(frac = 1) 
pfamA_motors_balanced_ind = pfamA_motors_balanced.iloc[:,0]
print(pfamA_motors_balanced_ind[0:5])
print(pfamA_target.shape)

179519      179519
1414859    1414859
12920        12920
1415258    1415258
13385        13385
Name: Unnamed: 0, dtype: int64
(18000, 6)
13493    180756
1539     166414
2688     131988
1691      37094
188      130155
Name: Unnamed: 0, dtype: int64
(59149, 6)


In [3]:
aminoacid_list = [
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
    'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'
]
clan_list = ["actin_like","tubulin_c","tubulin_binding","p_loop_gtpase"]
        
aa_to_ix = dict(zip(aminoacid_list, np.arange(1, 21)))
clan_to_ix = dict(zip(clan_list, np.arange(0, 4)))

def word_to_index(seq,to_ix):
    "Returns a list of indices (integers) from a list of words."
    return [to_ix.get(word, 0) for word in seq]

ix_to_aa = dict(zip(np.arange(1, 21), aminoacid_list))
ix_to_clan = dict(zip(np.arange(0, 4), clan_list))

def index_to_word(ixs,ix_to): 
    "Returns a list of words, given a list of their corresponding indices."
    return [ix_to.get(ix, 'X') for ix in ixs]

def prepare_sequence(seq):
    idxs = word_to_index(seq[0:-1],aa_to_ix)
    return torch.tensor(idxs, dtype=torch.long)

def prepare_labels(seq):
    idxs = word_to_index(seq[1:],aa_to_ix)
    return torch.tensor(idxs, dtype=torch.long)

def prepare_eval(seq):
    idxs = word_to_index(seq[:],aa_to_ix)
    return torch.tensor(idxs, dtype=torch.long)

prepare_labels('YCHXXXXX')

tensor([2, 7, 0, 0, 0, 0, 0])

In [4]:
# set device
device  = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [10]:
# Hyperparameters
input_size = len(aminoacid_list) + 1
num_layers = 1
hidden_size = 64
output_size = len(aminoacid_list) + 1
embedding_size= 10
learning_rate = 0.001

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size, batch_first=False, bidirectional=True):
        super(Encoder, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.log_softmax = nn.LogSoftmax(dim= 1)
        self.batch_first = batch_first
        self.aa_embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers,
                           batch_first=batch_first, bidirectional=bidirectional)

    def forward(self, seq):
        # embed each aa to the embedded space
        embedding_tensor = self.aa_embedding(seq)
        #output of shape (seq_len, batch, num_directions * hidden_size):
        outputs, hidden = self.rnn(embedding_tensor.view(len(seq), 1, -1))
        # Return output and final hidden state
        return outputs, hidden

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        # both forward and backward direction
        self.enc_units = hidden_size*2
        self.dec_units = hidden_size*2
        
        self.W1 = nn.Linear(self.enc_units, self.dec_units, bias=False)
        self.W2 = nn.Linear(self.enc_units, self.dec_units, bias=False)
        self.vt = nn.Linear(self.enc_units, 1, bias=False)

    def forward(self,encoder_outputs,decoder_state):
        
        
        # encoder_outputs: (seq_len, batch_size, hidden_size*2)
        encoder_transform = self.W1(encoder_outputs)
#         print("encoder_transform.shape: ", encoder_transform.shape)

        # (1 (unsqueezed),batch_size, hidden_size*2)
        decoder_transform = self.W2(decoder_state)
#         print("decoder_transform: ", decoder_transform.shape)
        
        combined_transform = encoder_transform + decoder_transform
#         print("combined_transform.shape ", combined_transform.shape)
        # 1st line of Eq.(3) in the paper
        # (seq_len, batch_size = 1 , 1) => squeeze to (seq_len, batch_size)
        u_i = self.vt(torch.tanh(combined_transform)).squeeze()
#         print("u_i.shape ", u_i.shape)

        # log-softmax for a better numerical stability
        attention_weights = F.log_softmax(u_i, dim=0).view(-1, 1, 1)
#         print("attention_weights.shape ", attention_weights.shape)
        
        #context_vector shape after sum == (batch,hidden*2)
        context_vector = attention_weights * encoder_outputs
        context_vector = torch.sum(context_vector, dim=0)
#         print("context_vector.shape ", context_vector.shape)
        return context_vector,attention_weights

    

class Decoder(nn.Module):
    # hidden size refers to input hidden size, decoder hidden size should be 2*encoder hidden size since the decoder is unidirectional
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, output_size, batch_first=False, bidirectional=False):
        super(Decoder, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.log_softmax = nn.LogSoftmax(dim= 1)
        self.batch_first = batch_first
        self.context_size = hidden_size*2
        self.aa_embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTMCell(input_size=embedding_size + self.context_size, hidden_size=hidden_size*2)
        self.attn = Attention(hidden_size)
        self.fc = nn.Linear(hidden_size*2, output_size)
        
    def forward(self, y, encoder_outputs,decoder_state,decoder_cell):
        
        context_vector,attention_weights = self.attn(encoder_outputs,decoder_state)
#         print("context_vector.shape: ", context_vector.shape)
#         print("attention_weights.shape: ", attention_weights.shape)
        y_embedded = self.aa_embedding(y)
#         print("y_embedded.shape: ", y_embedded.shape)
        y_cat = torch.cat((context_vector.unsqueeze(1), y_embedded), -1)
#         print("y_cat.shape: ", y_cat.shape)
        hidden = self.rnn(y_cat.squeeze(1),(decoder_state,decoder_cell))
        h_i,_ = hidden
#         print("h_i.shape: ", h_i.shape)
        decoded_space = self.fc(h_i)
#         print("decoded_space.shape: ", decoded_space.shape)
        decoded_scores = F.log_softmax(decoded_space, dim = 1)
#         print("decoded_scores.shape: ", decoded_scores.shape)
        decoded_aa = torch.argmax(decoded_scores)
#         print("decoded_aa.shape: ", decoded_aa.shape)
        return decoded_scores, decoded_aa, hidden, attention_weights


In [11]:
encoder = Encoder(input_size = input_size, \
                               embedding_size = embedding_size, \
                               hidden_size = hidden_size, \
                               num_layers = num_layers, \
                               output_size = output_size)
decoder = Decoder(input_size = input_size, \
                               embedding_size = embedding_size, \
                               hidden_size = hidden_size, \
                               num_layers = num_layers, \
                               output_size = output_size)

encoder.to(device)
decoder.to(device)

for m in encoder.modules():
    if isinstance(m, nn.Linear):
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

for m in decoder.modules():
    if isinstance(m, nn.Linear):
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)


criterion = nn.NLLLoss()                
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), 
                       lr= learning_rate)

In [12]:
encoder.load_state_dict(torch.load("../../data/seq2seq_encoder_df_dev_201012_230k.pt"))
encoder.train()

Encoder(
  (log_softmax): LogSoftmax()
  (aa_embedding): Embedding(21, 10)
  (rnn): LSTM(10, 64, bidirectional=True)
)

In [13]:
decoder.load_state_dict(torch.load("../../data/seq2seq_decoder_df_dev_201012_230k.pt"))
decoder.train()

Decoder(
  (log_softmax): LogSoftmax()
  (aa_embedding): Embedding(21, 10)
  (rnn): LSTMCell(138, 128)
  (attn): Attention(
    (W1): Linear(in_features=128, out_features=128, bias=False)
    (W2): Linear(in_features=128, out_features=128, bias=False)
    (vt): Linear(in_features=128, out_features=1, bias=False)
  )
  (fc): Linear(in_features=128, out_features=21, bias=True)
)

## Training on pfamA_motors_balanced

In [None]:
# loss_vector = []
# running_loss = 0
print_every = 1000

# for epoch in np.arange(0, len(clan_training_data)): 
#     seq, clan = clan_training_data[epoch]

for epoch in np.arange(0, pfamA_motors_balanced.shape[0]): 
    seq = pfamA_motors_balanced.iloc[epoch, 3]
    # Step 1. Remember that Pytorch accumulates gradients.
    # We need to clear them out before each instance
#     print(epoch)
#     print("len(seq): ", len(seq))
    # Step 2. Get our inputs ready for the network, that is, turn them into
    # Tensors of word indices.
    sentence_in = prepare_sequence(seq)
    targets = prepare_labels(seq)
    sentence_in = sentence_in.to(device = device)
    targets = targets.view(1,-1).to(device = device)
#     print("targets.shape: ", targets.shape)
    
#     print(targets.shape)
#     print(sentence_in.shape)
    
    encoder.train()
    decoder.train()
    optimizer.zero_grad()
    
    loss = 0

    seq_len = sentence_in.size(0)

    encoder_outputs, encoder_hidden = encoder(sentence_in)
#     print("encoder_outputs.shape: " , encoder_outputs.shape)
    
    encoder_h_n, encoder_c_n = encoder_hidden
#     print("encoder_h_n.shape: ", encoder_h_n.shape)
#     print("encoder_hidden_last: ", encoder_c_n.shape)
    
    # Lets use zeros as an intial input
    y_0 = 0
    # using zeros for initial decoder hidden and cell state 
    d_0 = Variable(torch.zeros(1, decoder.hidden_size*2))
    dcell_0 = Variable(torch.zeros(1,  decoder.hidden_size*2))
#     print("d_0.shape: ", d_0.shape)
#     print("dcell_0.shape: ", dcell_0.shape)
    
    y_last = y_0
    d_last, d_cell_last = d_0, dcell_0

    
#     print("seq_len: ",seq_len)
    for di in range(seq_len):
        decoded_scores, y_last, (d_last,d_cell_last), attention_weights = decoder(Variable(torch.LongTensor([[y_last]])).to(device), \
                                                                                 encoder_outputs.to(device), \
                                                                                 d_last.to(device),\
                                                                                 d_cell_last.to(device))
        
        loss += criterion(decoded_scores.to(device), targets[:,di])   
    
    assert not np.isnan(loss.item()), 'Model diverged with loss = NaN'

    loss.backward()
    optimizer.step()
    
    if epoch % print_every == 0:
        print(f"At Epoch: %.1f"% epoch)
        print(f"Loss %.4f"% (loss/seq_len))
        torch.save(encoder.state_dict(), "../../data/seq2seq_encoder_balanced.pt")
        torch.save(decoder.state_dict(), "../../data/seq2seq_decoder_balanced.pt")
#     loss_vector.append(loss/seq_len)


At Epoch: 0.0
Loss 2.9367
At Epoch: 1000.0
Loss 3.0015
At Epoch: 2000.0
Loss 2.8021
At Epoch: 3000.0
Loss 2.7523
At Epoch: 4000.0
Loss 2.6752
At Epoch: 5000.0
Loss 2.8581
At Epoch: 6000.0
Loss 2.8701


In [None]:
torch.save(encoder.state_dict(), "../../data/seq2seq_encoder_balanced.pt")
torch.save(decoder.state_dict(), "../../data/seq2seq_decoder_balanced.pt")

In [None]:
# loss_vector = []
# running_loss = 0
print_every = 1000

# for epoch in np.arange(0, len(clan_training_data)): 
#     seq, clan = clan_training_data[epoch]

for epoch in np.arange(0, pfamA_target.shape[0]): 
    seq = pfamA_target.iloc[epoch, 3]
    # Step 1. Remember that Pytorch accumulates gradients.
    # We need to clear them out before each instance
#     print(epoch)
#     print("len(seq): ", len(seq))
    # Step 2. Get our inputs ready for the network, that is, turn them into
    # Tensors of word indices.
    sentence_in = prepare_sequence(seq)
    targets = prepare_labels(seq)
    sentence_in = sentence_in.to(device = device)
    targets = targets.view(1,-1).to(device = device)
#     print("targets.shape: ", targets.shape)
    
#     print(targets.shape)
#     print(sentence_in.shape)
    
    encoder.train()
    decoder.train()
    optimizer.zero_grad()
    
    loss = 0

    seq_len = sentence_in.size(0)

    encoder_outputs, encoder_hidden = encoder(sentence_in)
#     print("encoder_outputs.shape: " , encoder_outputs.shape)
    
    encoder_h_n, encoder_c_n = encoder_hidden
#     print("encoder_h_n.shape: ", encoder_h_n.shape)
#     print("encoder_hidden_last: ", encoder_c_n.shape)
    
    # Lets use zeros as an intial input
    y_0 = 0
    # using zeros for initial decoder hidden and cell state 
    d_0 = Variable(torch.zeros(1, decoder.hidden_size*2))
    dcell_0 = Variable(torch.zeros(1,  decoder.hidden_size*2))
#     print("d_0.shape: ", d_0.shape)
#     print("dcell_0.shape: ", dcell_0.shape)
    
    y_last = y_0
    d_last, d_cell_last = d_0, dcell_0

    
#     print("seq_len: ",seq_len)
    for di in range(seq_len):
        decoded_scores, y_last, (d_last,d_cell_last), attention_weights = decoder(Variable(torch.LongTensor([[y_last]])).to(device), \
                                                                                 encoder_outputs.to(device), \
                                                                                 d_last.to(device),\
                                                                                 d_cell_last.to(device))
        
        loss += criterion(decoded_scores.to(device), targets[:,di])   
    
    assert not np.isnan(loss.item()), 'Model diverged with loss = NaN'

    loss.backward()
    optimizer.step()
    
    if epoch % print_every == 0:
        print(f"At Epoch: %.1f"% epoch)
        print(f"Loss %.4f"% (loss/seq_len))
        torch.save(encoder.state_dict(), "../../data/seq2seq_encoder_balanced_target.pt")
        torch.save(decoder.state_dict(), "../../data/seq2seq_decoder_balanced_target.pt")
#     loss_vector.append(loss/seq_len)


In [None]:
torch.save(encoder.state_dict(), "../../data/seq2seq_encoder_balanced_target.pt")
torch.save(decoder.state_dict(), "../../data/seq2seq_decoder_balanced_target.pt")

In [None]:
print("done")