<a href="https://colab.research.google.com/github/souzajvp/deep-learning-experiences/blob/master/gpcr_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPCR class prediction using amino acid sequence data

Much of this code is adapted from bentrevett's PyTorch sentiment analysis tutorials: https://github.com/bentrevett/pytorch-sentiment-analysis

In [None]:
import pandas as pd
import torch
from torchtext.legacy import data
import torch.nn as nn
import torch.optim as optim

In [None]:
# Raw GitHub URL of the training CSV; it is read with pandas below.
url ='https://raw.githubusercontent.com/souzajvp/deep-learning-experiences/master/train_gpcr.csv'

In [None]:
# Load the training table directly from the URL and preview its first rows
# (DataFrame.head() shows five rows by default).
train = pd.read_csv(url)

train.head()

In [None]:
def tokenize(x):
    """Split a sequence string into a list of its individual characters."""
    return list(x)


# Sequence field: tokenized character by character with the function above.
TEXT = data.Field(tokenize=tokenize)
# Label field: stored as float, which is the dtype BCEWithLogitsLoss expects
# for its targets.
LABEL = data.LabelField(dtype=torch.float)

In [None]:
# Column-to-field mapping. Note: the order has to be the same as the order of
# columns in the CSV file.
fields = [('Sequence', TEXT), ('label', LABEL)]

# TabularDataset reads from a local path, but so far the data has only been
# fetched from GitHub into the `train` DataFrame. Write it to disk when the
# file is not already there so this cell also works on a fresh runtime.
import os

csv_path = "train_gpcr.csv"
if not os.path.exists(csv_path):
    train.to_csv(csv_path, index = False)

train_data = data.TabularDataset(
    path = csv_path,
    format = "CSV",
    fields = fields,
    skip_header = True  # the first CSV row holds the column names
)

In [None]:
import random

SEED = 42

# random.seed() returns None, so passing its return value as `random_state`
# never handed an explicit state to split(). Seed the global RNG first and
# pass the resulting state so the 90/10 split is explicitly reproducible.
random.seed(SEED)

train_data, test_data = train_data.split(split_ratio = 0.9, random_state = random.getstate())

In [None]:
# Number of examples in the training and test splits.
(len(train_data), len(test_data))

In [None]:
# Both vocabularies are built from the training split only; the two calls are
# independent of each other.
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data)

In [None]:
BATCH_SIZE = 10

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `sort_key` and `sort_within_batch` are passed explicitly because of
# https://github.com/pytorch/text/issues/474
# (alternatively, try simply `sort = False`).
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    device = device,
    batch_size = BATCH_SIZE,
    sort_within_batch = False,
    sort_key = lambda example: len(example.Sequence),
)

In [None]:
# Sanity check: take the first batch from the test iterator, print its labels
# and stop.
for batch in test_iterator:
    Sequence, label = batch.Sequence, batch.label
    print(label)
    break

# Simple RNN

In [None]:
class RNN(nn.Module):
    """Single-layer vanilla RNN classifier over sequences of token indices.

    Tokens are embedded, fed through ``nn.RNN``, and the final hidden state is
    projected to ``output_dim`` values by a linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Create the embedding, recurrent and output layers.

        Args:
            input_dim: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_dim: size of the RNN hidden state.
            output_dim: number of values produced per sequence.
        """
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Map token indices ``[sent len, batch size]`` to ``[batch size, output_dim]``."""
        # token_vectors = [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # final_state = [num layers * num directions, batch size, hid dim];
        # the per-step outputs are not needed.
        _, final_state = self.rnn(token_vectors)

        # One layer, one direction: drop the leading dim -> [batch size, hid dim]
        return self.fc(final_state.squeeze(0))

In [None]:
# Hyperparameters for the simple RNN.
INPUT_DIM = len(TEXT.vocab)  # embedding table size = vocabulary size
EMBEDDING_DIM, HIDDEN_DIM = 50, 25
OUTPUT_DIM = 1  # a single output value per sequence

model = RNN(
    input_dim = INPUT_DIM,
    embedding_dim = EMBEDDING_DIM,
    hidden_dim = HIDDEN_DIM,
    output_dim = OUTPUT_DIM,
)

In [None]:
# Move the model (and the loss module) to the target device *before* building
# the optimizer, as the torch.optim documentation recommends, so the optimizer
# is constructed over parameters that already live where training will run.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = 1e-2)

In [None]:
# Train for 20 epochs and report the mean loss of each epoch.
# This might take several minutes to run.
for epoch in range(20):
    epoch_loss = 0.0
    n_batches = 0

    for batch in train_iterator:

        optimizer.zero_grad()

        # squeeze(1) drops the trailing dim so predictions line up with batch.label
        predictions = model(batch.Sequence).squeeze(1)

        loss = criterion(predictions, batch.label)

        loss.backward()

        optimizer.step()

        # .item() yields a plain float, detached from the autograd graph
        epoch_loss += loss.item()
        n_batches += 1

    # Average over the epoch instead of printing only the last mini-batch's
    # loss tensor; max() guards against an empty iterator.
    print("Loss:", epoch_loss / max(n_batches, 1))

In [None]:
# Number of examples in the training and test splits.
(len(train_data), len(test_data))

In [None]:
# Accuracy (in percent) of the current model on the test split.
total = len(test_data)
correct = 0

with torch.no_grad():
    for batch in test_iterator:
        logits = model(batch.Sequence).squeeze(1)

        # sigmoid turns each logit into a value in (0, 1); rounding gives a hard 0/1
        rounded_preds = torch.round(torch.sigmoid(logits))
        correct += (rounded_preds == batch.label).sum().item()

correct / total * 100 # 34.4%

# Simple LSTM

In [None]:
class simple_LSTM(nn.Module):
    """Single-layer, unidirectional LSTM classifier over token-index sequences.

    Tokens are embedded, fed through ``nn.LSTM``, and the final hidden state
    is projected to ``output_dim`` values by a linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Create the embedding, LSTM and output layers.

        Args:
            input_dim: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            output_dim: number of values produced per sequence.
        """
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Map token indices ``[sent len, batch size]`` to ``[batch size, output_dim]``."""
        # token_vectors = [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # The LSTM returns (outputs, (hidden, cell)); only the final hidden
        # state is used: [num layers * num directions, batch size, hid dim]
        _, (final_hidden, _) = self.lstm(token_vectors)

        # One layer, one direction: drop the leading dim -> [batch size, hid dim]
        return self.fc(final_hidden.squeeze(0))

In [None]:
# Hyperparameters for the simple LSTM.
INPUT_DIM = len(TEXT.vocab)  # embedding table size = vocabulary size
EMBEDDING_DIM, HIDDEN_DIM = 50, 25
OUTPUT_DIM = 1  # a single output value per sequence

model = simple_LSTM(
    input_dim = INPUT_DIM,
    embedding_dim = EMBEDDING_DIM,
    hidden_dim = HIDDEN_DIM,
    output_dim = OUTPUT_DIM,
)

In [None]:
# Move the model (and the loss module) to the target device *before* building
# the optimizer, as the torch.optim documentation recommends, so the optimizer
# is constructed over parameters that already live where training will run.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = 1e-2)

In [None]:
# Train for 20 epochs and report the mean loss of each epoch.
# This might take several minutes to run.
for epoch in range(20):
    epoch_loss = 0.0
    n_batches = 0

    for batch in train_iterator:

        optimizer.zero_grad()

        # squeeze(1) drops the trailing dim so predictions line up with batch.label
        predictions = model(batch.Sequence).squeeze(1)

        loss = criterion(predictions, batch.label)

        loss.backward()

        optimizer.step()

        # .item() yields a plain float, detached from the autograd graph
        epoch_loss += loss.item()
        n_batches += 1

    # Average over the epoch instead of printing only the last mini-batch's
    # loss tensor; max() guards against an empty iterator.
    print("Loss:", epoch_loss / max(n_batches, 1))

In [None]:
# Accuracy (in percent) of the current model on the test split.
total = len(test_data)
correct = 0

with torch.no_grad():
    for batch in test_iterator:
        logits = model(batch.Sequence).squeeze(1)

        # sigmoid turns each logit into a value in (0, 1); rounding gives a hard 0/1
        rounded_preds = torch.round(torch.sigmoid(logits))
        correct += (rounded_preds == batch.label).sum().item()

correct / total * 100 # 59.24%

# Two-layer bi-directional LSTM

In [None]:
class LSTM(nn.Module):
    """Multi-layer, optionally bidirectional LSTM classifier.

    Tokens are embedded, fed through ``nn.LSTM``, and the top layer's final
    hidden state(s) are projected to ``output_dim`` values by a linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional):
        """Create the embedding, LSTM and output layers.

        Args:
            input_dim: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state (per direction).
            output_dim: number of values produced per sequence.
            n_layers: number of stacked LSTM layers.
            bidirectional: whether the LSTM also reads the sequence backwards.
        """
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional)

        # The classifier sees one hidden vector per direction, so its input
        # width must follow the `bidirectional` flag rather than assume 2.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, text):
        """Map token indices ``[sent len, batch size]`` to ``[batch size, output_dim]``."""

        #text = [sent len, batch size]

        embedded = self.embedding(text)

        #embedded = [sent len, batch size, emb dim]

        output, (hidden, cell) = self.lstm(embedded)

        #hidden = [num layers * num directions, batch size, hid dim]

        if self.lstm.bidirectional:
            # concat the final forward (hidden[-2,:,:]) and backward
            # (hidden[-1,:,:]) hidden states of the top layer
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            # single direction: the top layer's final hidden state is the last
            # entry; hidden[-2] would be a lower layer (or not exist at all)
            hidden = hidden[-1,:,:]

        return self.fc(hidden)

In [None]:
# Hyperparameters for the two-layer bi-directional LSTM.
INPUT_DIM = len(TEXT.vocab)  # embedding table size = vocabulary size
EMBEDDING_DIM, HIDDEN_DIM = 50, 25
OUTPUT_DIM = 1  # a single output value per sequence
N_LAYERS, BIDIRECTIONAL = 2, True

model = LSTM(
    input_dim = INPUT_DIM,
    embedding_dim = EMBEDDING_DIM,
    hidden_dim = HIDDEN_DIM,
    output_dim = OUTPUT_DIM,
    n_layers = N_LAYERS,
    bidirectional = BIDIRECTIONAL,
)

In [None]:
# Move the model (and the loss module) to the target device *before* building
# the optimizer, as the torch.optim documentation recommends, so the optimizer
# is constructed over parameters that already live where training will run.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = 1e-2)

In [None]:
# Train for 20 epochs and report the mean loss of each epoch.
# This might take several minutes to run.
for epoch in range(20):
    epoch_loss = 0.0
    n_batches = 0

    for batch in train_iterator:

        optimizer.zero_grad()

        # squeeze(1) drops the trailing dim so predictions line up with batch.label
        predictions = model(batch.Sequence).squeeze(1)

        loss = criterion(predictions, batch.label)

        loss.backward()

        optimizer.step()

        # .item() yields a plain float, detached from the autograd graph
        epoch_loss += loss.item()
        n_batches += 1

    # Average over the epoch instead of printing only the last mini-batch's
    # loss tensor; max() guards against an empty iterator.
    print("Loss:", epoch_loss / max(n_batches, 1))

In [None]:
# Accuracy (in percent) of the current model on the test split.
total = len(test_data)
correct = 0

with torch.no_grad():
    for batch in test_iterator:
        logits = model(batch.Sequence).squeeze(1)

        # sigmoid turns each logit into a value in (0, 1); rounding gives a hard 0/1
        rounded_preds = torch.round(torch.sigmoid(logits))
        correct += (rounded_preds == batch.label).sum().item()

correct / total * 100 # 97%

In [None]:
#torch.save(model.state_dict(), 'gpcr_model_03_29_2021.pt')

In [None]:
# model.load_state_dict(torch.load('gpcr_model_03_28_2021.pt'))
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime can also be restored on a CPU-only one.
model.load_state_dict(torch.load('gpcr_model_03_29_2021.pt', map_location = device))

# Test model on individual input sequences

In [None]:
def predict_gpcr(model, sequence):
    """Return the model's sigmoid output for a single amino acid sequence.

    The sequence is split into characters, mapped to indices with
    ``TEXT.vocab.stoi``, shaped as a one-sequence batch and passed through
    ``model``. The result is the sigmoid of the model's single output value.
    Which raw label a value near 1 corresponds to depends on
    ``LABEL.vocab.stoi`` -- check that mapping before interpreting the score.

    Args:
        model: a trained module taking a ``[sent len, batch size]`` index tensor.
        sequence: the amino acid sequence as a string.

    Returns:
        A Python float in (0, 1).

    Raises:
        ValueError: if ``sequence`` is empty.
    """
    if not sequence:
        # An empty tensor would otherwise fail deep inside the recurrent layer.
        raise ValueError("sequence must contain at least one residue")

    # Iterating a string yields its characters: one token per residue.
    indexed = [TEXT.vocab.stoi[token] for token in sequence]

    # [sent len] -> [sent len, 1]: a batch holding this one sequence
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
at1r = "MILNSSTEDGIKRIQDDCPKAGRHNYIFVMIPTLYSIIFVVGIFGNSLVVIVIYFYMKLKTVASVFLLNLALADLCFLLTLPLWAVYTAMEYRWPFGNYLCKIASASVSFNLYASVFLLTCLSIDRYLAIVHPMKSRLRRTMLVAKVTCIIIWLLAGLASLPAIIHRNVFFIENTNITVCAFHYESQNSTLPIGLGLTKNILGFLFPFLIILTSYTLIWKALKKAYEIQKNKPRNDDIFKIIMAIVLFFFFSWIPHQIFTFLDVLIQLGIIRDCRIADIVDTAMPITICIAYFNNCLNPLFYGFLGKKFKRYFLQLLKYIPPKAKSHSNLSTKMSTLSYRPSDNVSSSTKKPAPCFEVE"
nupr1 = "MATFPPATSAPQQPPGPEDEDSSLDESDLYSLAHSYLGGGGRKGRTKREAAANTNRPSPGGHERKLVTKLQNSERKKRGARR"

# A notebook cell only displays the value of its last expression, so the first
# prediction used to be computed and silently dropped. Print both explicitly.
print("at1r:", predict_gpcr(model, at1r))
print("nupr1:", predict_gpcr(model, nupr1))


# References

Deep Learning review paper - https://www.nature.com/articles/nature14539  
Deep Learning and Biological Sequence data review paper - https://pubmed.ncbi.nlm.nih.gov/28961695/  
Understanding LSTM Networks - https://colah.github.io/posts/2015-08-Understanding-LSTMs/  
The Unreasonable Effectiveness of Recurrent Neural Networks - http://karpathy.github.io/2015/05/21/rnn-effectiveness/  
Positional SHAP for Interpretation of Deep Learning Models Trained from Biological Sequences - https://www.biorxiv.org/content/10.1101/2021.03.04.433939v1  
