# Bidirectional LSTM on Uniref50_01.tsv
> The goal of this notebook is to expand the LSTM training set from 8000 proteins in a specific Pfam set to one-tenth of the UniRef50 dataset. A bidirectional LSTM with 64 hidden units per direction (64 × 2) is trained to predict the next token. After training is done, we embed all or part of pfam_motors to see whether the families are grouped.

In [1]:
import torch
import torch.nn as nn 
import torch.optim as optim 

import torchvision 
import torchvision.transforms as transforms 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, IterableDataset, DataLoader
# import tqdm
import numpy as np
import pandas as pd
# seed = 7
# torch.manual_seed(seed)
# np.random.seed(seed)

 

## Get Training Sequence
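- Compile all Pfam sequences from the four clans that contain molecular motors.
- Sample 1000 sequences from each of the four clans for learning the hidden states. (Note: the cells below currently take up to 4000 sequences per clan for training and up to 400 per clan for testing.)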

In [2]:
print('guaa')

guaa


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# !ls ./drive/My\ Drive/data/

In [5]:
# NOTE: the pfamA_motors load is commented out, but later cells
# (pfamA_motors.shape, the clan train/test split) still reference
# `pfamA_motors`; re-enable this line before running them.
# pfamA_motors = pd.read_csv("../data/pfamA_motors.csv")
# Tab-separated file read without a header row (header=None), so the columns
# get integer labels; later cells take the sequence from column index 1.
uniref50_01 = pd.read_csv("../data/uniref50_01.tsv",sep = "\t",header=None)

In [None]:
uniref50_01.head()

In [6]:
uniref50_01.shape

(846396, 2)

In [None]:
pfamA_motors.shape

In [None]:
pfamA_motors = pfamA_motors.iloc[:,1:]
pfamA_motors.head()

In [None]:
# Training pool: at most the first 4000 rows of each clan. groupby(...).head(n)
# keeps the leading rows of every group in their existing order; it is not a
# random sample.
clan_train_dat = pfamA_motors.groupby("clan").head(4000)


In [None]:
clan_train_dat.shape

In [None]:
len(clan_train_dat.loc[:,"pfamA_acc"].unique())

In [None]:
# Shuffle the training rows (sample(frac=1) returns every row in random order)
# and renumber the index. No random_state is passed and the seeding lines at
# the top of the notebook are commented out, so the order differs between runs.
clan_train_dat = clan_train_dat.sample(frac=1).reset_index(drop=True)
clan_train_dat.head(10)

In [None]:
# Held-out set: rows whose "id" does not occur in the training frame, capped at
# the first 400 such rows per clan.
clan_test_dat = pfamA_motors.loc[~pfamA_motors["id"].isin(clan_train_dat["id"]),:].groupby("clan").head(400)

In [None]:
clan_test_dat.shape

In [None]:
def df_to_tup(dat):
    """Convert a DataFrame into a list of ``(seq, clan)`` tuples.

    Args:
        dat: DataFrame with a ``"seq"`` and a ``"clan"`` column.

    Returns:
        One ``(seq, clan)`` tuple per row, in row order; an empty list when
        the frame has no rows.

    Raises:
        KeyError: if either column is missing.
    """
    # Zip the two columns directly instead of materialising a Series for every
    # row with ``iloc``, which is very slow on large frames.
    return list(zip(dat["seq"], dat["clan"]))

In [None]:
clan_training_data = df_to_tup(clan_train_dat)
clan_test_data = df_to_tup(clan_test_dat)

In [None]:
for seq,clan in clan_training_data:
  print(seq)
  print(clan)
  break

In [None]:
for seq,clan in clan_test_data:
  print(seq)
  print(clan)
  break

In [7]:
# The 20 standard amino acids; list order fixes each residue's token index.
aminoacid_list = [
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
    'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'
]
# Clan labels; list order fixes each clan's class index.
clan_list = ["actin_like","tubulin_c","tubulin_binding","p_loop_gtpase"]

# Amino acids map to 1..len(aminoacid_list). Index 0 is left free because
# word_to_index encodes any symbol missing from the table as 0.
# The ranges are derived from the list lengths: zip() silently truncates, so a
# hard-coded upper bound would drop entries if a list were ever extended.
aa_to_ix = dict(zip(aminoacid_list, np.arange(1, len(aminoacid_list) + 1)))
# Clans map to 0..len(clan_list) - 1.
clan_to_ix = dict(zip(clan_list, np.arange(len(clan_list))))

def word_to_index(seq,to_ix):
    """Translate every item of ``seq`` into its index from ``to_ix``.

    Items that are not keys of ``to_ix`` are encoded as 0.
    """
    indices = []
    for token in seq:
        indices.append(to_ix.get(token, 0))
    return indices

# Inverse lookups, built by inverting the forward tables so the two directions
# cannot drift apart (they were previously rebuilt from separately hard-coded
# index ranges).
ix_to_aa = {ix: aa for aa, ix in aa_to_ix.items()}
ix_to_clan = {ix: clan for clan, ix in clan_to_ix.items()}

def index_to_word(ixs,ix_to):
    """Map each index in ``ixs`` to its word in ``ix_to``.

    Args:
        ixs: Iterable of indices. Elements may be plain ints, NumPy integers,
            or scalar array/tensor elements (anything exposing ``.item()``).
        ix_to: Mapping from index to word.

    Returns:
        List of words, one per index; indices that are not keys of ``ix_to``
        become ``'X'``.
    """
    words = []
    for ix in ixs:
        # Elements obtained by iterating a torch tensor hash by identity, so a
        # direct dict lookup always misses and yields 'X'. Unwrap such scalars
        # to Python numbers first.
        key = ix.item() if hasattr(ix, "item") else ix
        words.append(ix_to.get(key, 'X'))
    return words



In [None]:
clan_to_ix.get("actin_like")

In [8]:
def prepare_sequence(seq):
    """Encode every residue of ``seq`` except the last as a 1-D long tensor.

    This is the model-input half of a next-token pair; ``prepare_labels``
    produces the matching targets.
    """
    model_input = seq[:-1]
    return torch.tensor(word_to_index(model_input, aa_to_ix), dtype=torch.long)

def prepare_labels(seq):
    """Encode every residue of ``seq`` except the first as a 1-D long tensor.

    Element ``i`` of the result is the residue that follows element ``i`` of
    ``prepare_sequence(seq)``, i.e. the next-token target.
    """
    next_tokens = seq[1:]
    return torch.tensor(word_to_index(next_tokens, aa_to_ix), dtype=torch.long)

In [9]:
# set device
device  = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
device

device(type='cuda')

In [11]:
# Hyperparameters
# Vocabulary size: one index per amino acid plus index 0, which word_to_index
# uses for symbols outside the table. Input and output share this vocabulary.
input_size = output_size = len(aminoacid_list) + 1
embedding_size = 10      # width of the learned amino-acid embedding
hidden_size = 64         # LSTM hidden units per direction
num_layers = 1           # stacked LSTM layers
learning_rate = 0.001    # step size handed to the optimizer

In [12]:
input_size

21

In [13]:
# Create Bidirectional LSTM
class BRNN(nn.Module):
    """Bidirectional LSTM that scores a token class at every sequence position.

    Pipeline: token indices -> embedding -> bidirectional LSTM -> linear layer
    -> log-softmax over ``output_size`` classes.

    NOTE(review): because the LSTM is bidirectional, the reverse direction's
    output at position ``t`` has already read positions ``t+1`` onward. If the
    targets are the inputs shifted by one (next-token prediction), the target
    is visible to the model at almost every position, so the loss can collapse
    towards zero without the model learning to predict. Confirm that this
    objective is what is intended.

    Args:
        input_size: Number of distinct token indices (embedding rows).
        embedding_size: Dimension of each token embedding.
        hidden_size: LSTM hidden units per direction.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of classes scored at each position.
    """

    def __init__(self,input_size, embedding_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.log_softmax = nn.LogSoftmax(dim= 1)
        self.aa_embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(input_size = embedding_size,
                            hidden_size = hidden_size,
                            num_layers = num_layers,
                            bidirectional = True)
        # The LSTM output concatenates the forward and backward hidden states,
        # hence hidden_size * 2 input features.
        self.fc = nn.Linear(hidden_size*2, output_size)

    def aa_encoder(self, input):
        """Map amino-acid indices to their embedding vectors.

        Args:
            input: LongTensor of token indices.

        Returns:
            Tensor with one ``embedding_size``-dimensional vector per index.
        """
        # Fixed: this used the non-existent attribute ``self.embedding`` and
        # raised AttributeError on every call.
        return self.aa_embedding(input)

    def forward(self,seq):
        """Score every position of a single sequence.

        Args:
            seq: 1-D LongTensor of token indices for one sequence (the batch
                dimension is added internally and is always 1).

        Returns:
            Tuple ``(decoded_scores, hn)``: ``decoded_scores`` holds the
            log-probabilities with shape ``(len(seq), output_size)``; ``hn``
            is the LSTM's final hidden state, with one entry per layer and
            direction.
        """
        seq_len = len(seq)

        # Embed each token: (seq_len, embedding_size).
        embedding_tensor = self.aa_embedding(seq)

        # nn.LSTM expects (seq_len, batch, features); a single sequence is
        # processed per call, so batch = 1. The initial hidden and cell states
        # are omitted and therefore default to zeros.
        out, (hn, _) = self.lstm(embedding_tensor.view(seq_len, 1, -1))

        # Drop the batch dimension and project onto the output classes.
        decoded_space = self.fc(out.view(seq_len, -1))
        decoded_scores = self.log_softmax(decoded_space)
        return decoded_scores, hn

In [14]:
# Build the network and move it to the selected device.
model = BRNN(input_size, embedding_size, hidden_size, num_layers, output_size)
model = model.to(device)
# The model emits log-probabilities, which is the input NLLLoss expects.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(params = model.parameters(), lr = learning_rate)

In [15]:
# Sanity check: score the first sequence with the untrained model.
# Row i of the output holds the log-probabilities over the token classes for
# position i. No gradients are needed here, hence torch.no_grad().
with torch.no_grad():
    # torch.autograd.Variable is deprecated; a plain tensor is passed directly.
    inputs = prepare_sequence(uniref50_01.iloc[0,1])
    inputs = inputs.to(device = device)
    aa_scores, _ = model(inputs)
    print( aa_scores)

tensor([[-3.0509, -3.1128, -3.1330,  ..., -2.9522, -3.1162, -3.0967],
        [-3.0757, -3.1047, -3.1458,  ..., -2.9539, -3.1117, -3.0991],
        [-3.0765, -3.1058, -3.1476,  ..., -2.9678, -3.1096, -3.1037],
        ...,
        [-3.0390, -3.0004, -3.0610,  ..., -2.9404, -3.1320, -3.1425],
        [-3.0409, -3.0239, -3.0719,  ..., -2.9277, -3.1244, -3.0955],
        [-3.0469, -3.0448, -3.1066,  ..., -2.9487, -3.1077, -3.0804]],
       device='cuda:0')


In [16]:
aa_scores.shape

torch.Size([45353, 21])

In [18]:
#Train Network
# One optimisation step per sequence (batch size 1), in a single pass over the
# file.
#
# NOTE(review): the recorded output of this cell shows the loss reaching 0.00.
# BRNN is bidirectional, so at each position the reverse pass has already read
# the next input token, which is exactly that position's target. The model can
# therefore copy the answer instead of predicting it. Confirm the objective
# before relying on this loss.

print_every = 1000
max_seq_len = 4000  # longer sequences are skipped

# `epoch` is really the row index of the current sequence; the name is kept so
# the log lines stay unchanged. Iterating the column avoids a per-row iloc lookup.
for epoch, seq in enumerate(uniref50_01.iloc[:, 1]):
    # Skip very long sequences, and sequences with fewer than two residues:
    # those leave no (input, next-token) pair and would give the LSTM an empty
    # input.
    if len(seq) > max_seq_len or len(seq) < 2:
        continue

    # Step 1. Turn the sequence into tensors of token indices:
    # input = all residues but the last, target = all residues but the first.
    sentence_in = prepare_sequence(seq).to(device = device)
    targets = prepare_labels(seq).to(device = device)

    # Step 2. PyTorch accumulates gradients, so clear them before each step,
    # then run the forward pass.
    model.zero_grad()
    aa_scores, hn = model(sentence_in)

    # Step 3. Compute the loss, back-propagate, and update the parameters.
    loss = loss_function(aa_scores, targets)
    loss.backward()
    optimizer.step()

    if epoch % print_every == 0:
        # The loss shown is for this single sequence only, not a running average.
        print(f"At Epoch: {epoch:.2f}")
        print(f"Loss {loss.item():.6f}")

At Epoch: 9000.00
Loss 1.51
At Epoch: 29000.00
Loss 0.00
At Epoch: 30000.00
Loss 0.00
At Epoch: 31000.00
Loss 0.00
At Epoch: 32000.00
Loss 0.00
At Epoch: 33000.00
Loss 0.00
At Epoch: 34000.00
Loss 0.00
At Epoch: 35000.00
Loss 0.00
At Epoch: 36000.00
Loss 0.00
At Epoch: 37000.00
Loss 0.00
At Epoch: 38000.00
Loss 0.00
At Epoch: 39000.00
Loss 0.00
At Epoch: 40000.00
Loss 0.00
At Epoch: 41000.00
Loss 0.00
At Epoch: 42000.00
Loss 0.00
At Epoch: 43000.00
Loss 0.00
At Epoch: 44000.00
Loss 0.00
At Epoch: 45000.00
Loss 0.00
At Epoch: 46000.00
Loss 0.00
At Epoch: 47000.00
Loss 0.00


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), "../data/bidirectional_lstm_uniref_201009.pt")

In [None]:
print('done')