In [22]:
import torch
import torch.nn as nn 
import torch.optim as optim 

import torchvision 
import torchvision.transforms as transforms 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, IterableDataset, DataLoader
# import tqdm
import numpy as np
import pandas as pd

import math

# Fix the global PyTorch and NumPy random seeds so stochastic steps in this
# notebook start from a known RNG state.
seed = 7
torch.manual_seed(seed)
np.random.seed(seed)


In [23]:
# Load the input tables; paths are relative to the notebook's working directory.
# NOTE(review): df_dev and motor_toolkit are not referenced by the cells visible
# in this notebook section — confirm they are still needed.
# NOTE(review): the file name is spelled "motor_tookits.csv"; presumably this
# matches the file on disk — verify before "fixing" the spelling.
pfamA_motors = pd.read_csv("../../data/pfamA_motors.csv")
df_dev = pd.read_csv("../../data/df_dev.csv")
motor_toolkit = pd.read_csv("../../data/motor_tookits.csv")

In [24]:
# Build a clan-balanced training table: draw 4500 rows from every clan with a
# fixed per-clan seed, then discard the index produced by groupby/apply so the
# result is indexed 0..N-1.
pfamA_motors_balanced = (
    pfamA_motors
    .groupby('clan')
    .apply(lambda clan_rows: clan_rows.sample(4500, random_state=1))
    .reset_index(drop=True)
)

In [25]:
# Pfam accessions of the families to fine-tune on.
pfamA_target_name = [
    "PF00349", "PF00022", "PF03727", "PF06723",
    "PF14450", "PF03953", "PF12327", "PF00091", "PF10644",
    "PF13809", "PF14881", "PF00063", "PF00225", "PF03028",
]

# Keep only the rows whose family accession is one of the targets.
pfamA_target = pfamA_motors[pfamA_motors["pfamA_acc"].isin(pfamA_target_name)]

In [26]:
# shuffle pfamA_target and pfamA_motors_balanced
# sample(frac = 1) is called without random_state, so this cell does not pin
# the row order by itself; both names are rebound to the shuffled frames.
pfamA_target = pfamA_target.sample(frac = 1)
# First column (by position) of the shuffled frame, printed as a quick look at
# the new row order.
pfamA_target_ind = pfamA_target.iloc[:,0]
print(pfamA_target_ind[0:5])
# NOTE: this prints the shape of the *balanced* frame, not of pfamA_target.
print(pfamA_motors_balanced.shape)

pfamA_motors_balanced = pfamA_motors_balanced.sample(frac = 1) 
pfamA_motors_balanced_ind = pfamA_motors_balanced.iloc[:,0]
print(pfamA_motors_balanced_ind[0:5])
# NOTE: this prints the shape of pfamA_target, not of the balanced frame.
print(pfamA_target.shape)

179519      179519
1414859    1414859
12920        12920
1415258    1415258
13385        13385
Name: Unnamed: 0, dtype: int64
(18000, 6)
13493    180756
1539     166414
2688     131988
1691      37094
188      130155
Name: Unnamed: 0, dtype: int64
(59149, 6)


In [27]:
# Preview the first rows of the shuffled, clan-balanced table.
pfamA_motors_balanced.head()

Unnamed: 0.1,Unnamed: 0,id,description,seq,pfamA_acc,clan
13493,180756,F7HLH3_MACMU/18-154,F7HLH3_MACMU/18-154 F7HLH3.2 PF00091.26;Tubulin;,SWSASTCTTTRPAVAGTCPAGAGNNWARGHYTEGAELMESVMDVVR...,PF00091,tubulin_binding
1539,166414,A0A0X8UZH9_9EURY/22-293,A0A0X8UZH9_9EURY/22-293 A0A0X8UZH9.1 PF00814.2...,AHVLSNIIDLFRPPQGGLHPREAANHHADAVAKTIVEAVETAGISL...,PF00814,actin_like
2688,131988,A0A1S6U8J5_9PROT/24-293,A0A1S6U8J5_9PROT/24-293 A0A1S6U8J5.1 PF02541.1...,CFVEKSFEAIVGSARGLRENMLISDEAKERIFNALKLAKEEFDFSL...,PF02541,actin_like
1691,37094,A0A0Q5VID2_9CAUL/261-451,A0A0Q5VID2_9CAUL/261-451 A0A0Q5VID2.1 PF02782....,AKITYGTGAFLVANVGDQPVVSTRRLLGTLGYDVRGTAAYALEGSI...,PF02782,actin_like
188,130155,A0A2Z4UGU8_9RHIZ/111-409,A0A2Z4UGU8_9RHIZ/111-409 A0A2Z4UGU8.1 PF02541....,LVAKRSRDGFRVIDAYSRIVRLGEGLASTGQLSDDAMNRAAAALKI...,PF02541,actin_like


In [28]:
# Preview the first rows of the shuffled target-family table.
pfamA_target.head()

Unnamed: 0.1,Unnamed: 0,id,description,seq,pfamA_acc,clan
179519,179519,A0A098S4B7_9BACT/12-174,A0A098S4B7_9BACT/12-174 A0A098S4B7.1 PF00091.2...,IIKVLGVGGGGSNAVTHMFRQGIVGVDFAICNTDSQAMELSPVTTR...,PF00091,tubulin_binding
1414859,1414859,A0A0A1SN99_9HYPO/43-437,A0A0A1SN99_9HYPO/43-437 A0A0A1SN99.1 PF00225.2...,RASDEDSRTAVRVAIRIRPPLKPTDPGYELIPQRFQRSMVQTTSDT...,PF00225,p_loop_gtpase
12920,12920,M7C1E0_CHEMY/152-523,M7C1E0_CHEMY/152-523 M7C1E0.1 PF00022.20;Actin;,MGKVAVVIDNGSCFTRAGFAGEDKPKSVLKTTSMPPTCPAVMREIP...,PF00022,actin_like
1415258,1415258,H9J8N3_BOMMO/16-329,H9J8N3_BOMMO/16-329 H9J8N3.1 PF00225.24;Kinesin;,NQTFAMDKRKKQVSLCEATSAASAPEDRKVGVTAPKMFAFDAIFSQ...,PF00225,p_loop_gtpase
13385,13385,A0A075AWM4_ROZAC/39-659,A0A075AWM4_ROZAC/39-659 A0A075AWM4.1 PF00022.2...,IDTSKVIVLHPGSETLKFGMATEGLPRTIPNVIARLDPTKGDTMEA...,PF00022,actin_like


In [29]:
# One-letter codes of the 20 amino acids, in the order that defines their indices.
aminoacid_list = list("ACDEFGHIKLMNPQRSTVWY")
clan_list = ["actin_like", "tubulin_c", "tubulin_binding", "p_loop_gtpase"]

# Amino acids are numbered from 1 (index 0 is left free); clans are numbered from 0.
aa_to_ix = {aa: ix for aa, ix in zip(aminoacid_list, np.arange(1, 21))}
clan_to_ix = {clan: ix for clan, ix in zip(clan_list, np.arange(0, 4))}

def word_to_index(seq,to_ix):
    """Translate every item of ``seq`` to its index in ``to_ix``.

    Items missing from ``to_ix`` are encoded as 0. Returns a list with one
    index per item, in the original order.
    """
    indices = []
    for token in seq:
        indices.append(to_ix.get(token, 0))
    return indices

# Reverse lookups (index -> symbol), obtained by inverting the forward tables.
ix_to_aa = {ix: aa for aa, ix in aa_to_ix.items()}
ix_to_clan = {ix: clan for clan, ix in clan_to_ix.items()}

def index_to_word(ixs,ix_to): 
    """Translate every index in ``ixs`` back to its symbol via ``ix_to``.

    Indices missing from ``ix_to`` are decoded as 'X'. Returns a list with one
    symbol per index, in the original order.
    """
    symbols = []
    for index in ixs:
        symbols.append(ix_to.get(index, 'X'))
    return symbols

def prepare_sequence(seq):
    """Encode the model input for ``seq``: every residue except the last one.

    Returns a 1-D ``torch.long`` tensor of amino-acid indices (0 for symbols
    that are not in ``aa_to_ix``).
    """
    model_input = seq[:-1]
    encoded = word_to_index(model_input, aa_to_ix)
    return torch.tensor(encoded, dtype=torch.long)

def prepare_labels(seq):
    """Encode the training targets for ``seq``: every residue except the first one.

    Returns a 1-D ``torch.long`` tensor of amino-acid indices (0 for symbols
    that are not in ``aa_to_ix``).
    """
    shifted = seq[1:]
    encoded = word_to_index(shifted, aa_to_ix)
    return torch.tensor(encoded, dtype=torch.long)

def prepare_eval(seq):
    """Encode the complete ``seq`` (no residue dropped) for evaluation.

    Returns a 1-D ``torch.long`` tensor of amino-acid indices (0 for symbols
    that are not in ``aa_to_ix``).
    """
    encoded = word_to_index(seq, aa_to_ix)
    return torch.tensor(encoded, dtype=torch.long)

# Sanity check: labels drop the first character, and 'X' (not in
# aminoacid_list) is encoded as 0.
prepare_labels('YCHXXXXX')

tensor([2, 7, 0, 0, 0, 0, 0])

In [30]:
# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [31]:
# Hyperparameters
# Input and output share one vocabulary: the amino acids plus one extra index
# (0, which word_to_index uses for symbols it does not know).
input_size = output_size = len(aminoacid_list) + 1
embedding_size = 10
hidden_size = 128
num_layers = 1
learning_rate = 1e-3

# Create Bidirectional LSTM
class BRNN(nn.Module):
    """Bidirectional LSTM that scores a residue index at every sequence position.

    Pipeline: embedding -> bidirectional LSTM -> linear layer -> log-softmax.
    The model processes ONE sequence per call (batch size is fixed to 1).

    Args:
        input_size: number of distinct input indices (rows of the embedding).
        embedding_size: dimension of each residue embedding.
        hidden_size: LSTM hidden units per direction.
        num_layers: number of stacked LSTM layers.
        output_size: number of classes scored at each position.

    NOTE(review): when this model is trained on next-residue targets (input
    ``seq[:-1]``, target ``seq[1:]``), the backward direction has already read
    the residue it is asked to predict at every position but the last, so very
    low losses may reflect copying rather than modelling — confirm this is
    intended.
    """

    def __init__(self,input_size, embedding_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.log_softmax = nn.LogSoftmax(dim= 1)
        self.aa_embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(input_size = embedding_size, 
                            hidden_size = hidden_size,
                            num_layers = num_layers, 
                            bidirectional = True)
        # Forward and backward outputs are concatenated, hence hidden_size * 2.
        self.fc = nn.Linear(hidden_size*2, output_size)
    
    def aa_encoder(self, input): 
        """Map a tensor of residue indices to their embedding vectors."""
        # Fixed: the embedding layer is registered as `aa_embedding`; the old
        # `self.embedding` lookup raised AttributeError.
        return self.aa_embedding(input)

    def forward(self,seq):
        """Score every position of one encoded sequence.

        Args:
            seq: 1-D LongTensor of residue indices for a single sequence.

        Returns:
            (decoded_scores, hn): ``decoded_scores`` holds log-probabilities of
            shape (len(seq), output_size); ``hn`` is the LSTM's final hidden
            state, shape (num_layers * 2, 1, hidden_size).
        """
        seq_len = len(seq)

        # Embed each residue index.
        embedding_tensor = self.aa_embedding(seq)

        # h0/c0 are omitted on purpose: nn.LSTM defaults both to zeros.
        # The LSTM expects (seq_len, batch, features); batch is 1 here.
        out, (hn, cn) = self.lstm(embedding_tensor.view(seq_len, 1, -1))

        # Drop the batch axis, project to class scores, normalise per position.
        decoded_space = self.fc(out.view(seq_len, -1))
        decoded_scores = self.log_softmax(decoded_space)
        return decoded_scores, hn


In [32]:
# initialize network
# Build the BRNN from the hyperparameters above and move its parameters to
# the selected device.
model = BRNN(input_size, embedding_size, hidden_size, num_layers, output_size).to(device)

In [33]:
# Load the pretrained weights. map_location remaps tensors that were saved on
# another device (e.g. a GPU checkpoint opened on a CPU-only machine) onto the
# device selected for this run, so the load does not depend on where the
# checkpoint was written.
model.load_state_dict(
    torch.load("../../data/bidirectional_lstm_5_201008.pt", map_location=device)
)

<All keys matched successfully>

In [34]:
# NLLLoss consumes log-probabilities, which is what BRNN.forward returns
# (it applies log_softmax to the class scores).
loss_function = nn.NLLLoss()
# Adam over all model parameters with the learning rate set in the
# hyperparameter cell.
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [35]:
# Switch the model to training mode before the fine-tuning passes below.
model.train()

BRNN(
  (log_softmax): LogSoftmax()
  (aa_embedding): Embedding(21, 10)
  (lstm): LSTM(10, 128, bidirectional=True)
  (fc): Linear(in_features=256, out_features=21, bias=True)
)

## Continue weight updates (fine-tuning) on the clan-balanced sample (`pfamA_motors_balanced`)

In [36]:
# Train network: one pass over the clan-balanced table, with one optimizer
# step per sequence (batch size 1).

print_every = 1000

# Make sure the model is in training mode even if cells were run out of order.
model.train()

# NOTE: `epoch` is really a per-sequence step counter within a single pass;
# the name and the log text are kept so the output stays comparable with
# earlier runs.
# Sequences are read from the named "seq" column rather than by position, so
# extra or reordered columns cannot silently feed the wrong field to the model.
for epoch, seq in enumerate(pfamA_motors_balanced["seq"]):
    # At least two residues are needed to form one (input, next-residue) pair;
    # an empty input tensor would make the LSTM raise. Missing (non-string)
    # entries are skipped for the same reason.
    if not isinstance(seq, str) or len(seq) < 2:
        continue

    # Encode input (all but the last residue) and targets (all but the first).
    sentence_in = prepare_sequence(seq).to(device = device)
    targets = prepare_labels(seq).to(device = device)

    # PyTorch accumulates gradients, so clear them before every sample.
    optimizer.zero_grad()
    aa_scores = model(sentence_in)[0]

    # Loss, backward pass, parameter update.
    loss = loss_function(aa_scores, targets)
    loss.backward()
    optimizer.step()

    # Periodic progress log (loss of the current sequence only).
    if epoch % print_every == 0:
        print("At Epoch: %.2f" % epoch)
        print("Loss %.2f" % loss.item())

At Epoch: 0.00
Loss 0.02
At Epoch: 1000.00
Loss 0.01
At Epoch: 2000.00
Loss 0.01
At Epoch: 3000.00
Loss 0.04
At Epoch: 4000.00
Loss 0.01
At Epoch: 5000.00
Loss 0.01
At Epoch: 6000.00
Loss 0.01
At Epoch: 7000.00
Loss 0.01
At Epoch: 8000.00
Loss 0.02
At Epoch: 9000.00
Loss 0.02
At Epoch: 10000.00
Loss 0.03
At Epoch: 11000.00
Loss 0.01
At Epoch: 12000.00
Loss 0.01
At Epoch: 13000.00
Loss 0.01
At Epoch: 14000.00
Loss 0.01
At Epoch: 15000.00
Loss 0.03
At Epoch: 16000.00
Loss 0.02
At Epoch: 17000.00
Loss 0.02


In [37]:
# Checkpoint the weights after the pass over the clan-balanced table.
torch.save(model.state_dict(), "../../data/evotune_lstm_5_balanced.pt")

## Continue weight updates (fine-tuning) on the target-family subset (`pfamA_target`)

In [38]:
# Train network: one pass over the target-family table, with one optimizer
# step per sequence (batch size 1).

print_every = 1000

# Make sure the model is in training mode even if cells were run out of order.
model.train()

# NOTE: `epoch` is really a per-sequence step counter within a single pass;
# the name and the log text are kept so the output stays comparable with
# earlier runs.
# Sequences are read from the named "seq" column rather than by position, so
# extra or reordered columns cannot silently feed the wrong field to the model.
for epoch, seq in enumerate(pfamA_target["seq"]):
    # At least two residues are needed to form one (input, next-residue) pair;
    # an empty input tensor would make the LSTM raise. Missing (non-string)
    # entries are skipped for the same reason.
    if not isinstance(seq, str) or len(seq) < 2:
        continue

    # Encode input (all but the last residue) and targets (all but the first).
    sentence_in = prepare_sequence(seq).to(device = device)
    targets = prepare_labels(seq).to(device = device)

    # PyTorch accumulates gradients, so clear them before every sample.
    optimizer.zero_grad()
    aa_scores = model(sentence_in)[0]

    # Loss, backward pass, parameter update.
    loss = loss_function(aa_scores, targets)
    loss.backward()
    optimizer.step()

    # Periodic progress log (loss of the current sequence only).
    if epoch % print_every == 0:
        print("At Epoch: %.2f" % epoch)
        print("Loss %.2f" % loss.item())

At Epoch: 0.00
Loss 0.01
At Epoch: 1000.00
Loss 0.00
At Epoch: 2000.00
Loss 0.02
At Epoch: 3000.00
Loss 0.02
At Epoch: 4000.00
Loss 0.00
At Epoch: 5000.00
Loss 0.01
At Epoch: 6000.00
Loss 0.00
At Epoch: 7000.00
Loss 0.01
At Epoch: 8000.00
Loss 0.01
At Epoch: 9000.00
Loss 0.00
At Epoch: 10000.00
Loss 0.01
At Epoch: 11000.00
Loss 0.00
At Epoch: 12000.00
Loss 0.04
At Epoch: 13000.00
Loss 0.00
At Epoch: 14000.00
Loss 0.06
At Epoch: 15000.00
Loss 0.04
At Epoch: 16000.00
Loss 0.08
At Epoch: 17000.00
Loss 0.04
At Epoch: 18000.00
Loss 0.00
At Epoch: 19000.00
Loss 0.00
At Epoch: 20000.00
Loss 0.00
At Epoch: 21000.00
Loss 0.03
At Epoch: 22000.00
Loss 0.03
At Epoch: 23000.00
Loss 0.00
At Epoch: 24000.00
Loss 0.01
At Epoch: 25000.00
Loss 0.00
At Epoch: 26000.00
Loss 0.01
At Epoch: 27000.00
Loss 0.01
At Epoch: 28000.00
Loss 0.01
At Epoch: 29000.00
Loss 0.00
At Epoch: 30000.00
Loss 0.01
At Epoch: 31000.00
Loss 0.02
At Epoch: 32000.00
Loss 0.00
At Epoch: 33000.00
Loss 0.01
At Epoch: 34000.00
Loss 0.0

In [39]:
# Checkpoint the weights after the additional pass over the target-family table.
torch.save(model.state_dict(), "../../data/evotune_lstm_5_balanced_target.pt")

In [40]:
# Completion marker so a finished run is easy to spot in the output.
print("done")

done
