## Installation and Imports

In [1]:
!pip install tape_proteins

Collecting tape_proteins
  Downloading tape_proteins-0.5-py3-none-any.whl (68 kB)
[?25l[K     |████▊                           | 10 kB 23.4 MB/s eta 0:00:01[K     |█████████▌                      | 20 kB 28.4 MB/s eta 0:00:01[K     |██████████████▎                 | 30 kB 25.0 MB/s eta 0:00:01[K     |███████████████████             | 40 kB 12.7 MB/s eta 0:00:01[K     |███████████████████████▊        | 51 kB 11.2 MB/s eta 0:00:01[K     |████████████████████████████▌   | 61 kB 12.9 MB/s eta 0:00:01[K     |████████████████████████████████| 68 kB 3.6 MB/s 
Collecting boto3
  Downloading boto3-1.21.29-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 28.1 MB/s 
[?25hCollecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 55.9 MB/s 
Collecting tensorboardX
  Downloading tensorboardX-2.5-py2.py3-none-any.whl (125 kB)
[K     |████████████████████

In [26]:
from pathlib import Path
from torch.utils import data as data
from tape import datasets
from tape import TAPETokenizer
from tape import ProteinBertForMaskedLM
from Bio import SeqIO
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
import pickle
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import gc
import csv
import itertools

In [3]:
import torch
from tape import ProteinBertModel, TAPETokenizer
model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')

100%|██████████| 567/567 [00:00<00:00, 297717.87B/s]
100%|██████████| 370264230/370264230 [00:13<00:00, 27298157.31B/s]


## Demonstrating TAPE Inputs, Outputs 

In [4]:
sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
token_ids = torch.tensor([tokenizer.encode(sequence)])
display(token_ids)
output = model(token_ids)
sequence_output = output[0]
pooled_output = output[1]


  


tensor([[ 2, 11,  7, 23, 25,  9,  8, 21,  7, 15, 13, 11, 16, 11,  5, 13, 15, 15,
         17, 11,  7, 25, 13, 11, 22, 11, 22, 15, 25,  5,  5, 11,  5, 15, 13, 23,
         20,  3]])

In [5]:
display(sequence_output[0].shape)
pooled_sequence_emb = torch.sum(sequence_output, dim=2)
display(pooled_sequence_emb.shape)
# display(pooled_output)

torch.Size([38, 768])

torch.Size([1, 38])

In [6]:
display((pooled_output.shape), (sequence_output.shape))

torch.Size([1, 768])

torch.Size([1, 38, 768])

# Sequence Tokenization and Embedding Step

In [7]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [8]:
import csv
file_dir = '/content/drive/MyDrive/Colab Notebooks/Protein Sequence Analysis'



In [9]:
from pathlib import Path
from torch.utils import data as data
from tape import datasets
from tape import TAPETokenizer
from tape import ProteinBertForMaskedLM
from Bio import SeqIO
from tqdm.auto import tqdm
import pickle

data_folder = Path(file_dir)
input_sequence_fasta = 'raw.fasta'
batch_size = 100 

output_file_name_header = 'resultsoutputnew_'

Defining a loader 

In [10]:
class LoaderClass(data.Dataset):
  def __init__(self, input_ids, attention_mask):
    super().__init__()
    self.input_ids = input_ids
    self.attention_mask = attention_mask
  
  def __len__(self):
    return self.input_ids.shape[0]
  
  def __getitem__(self, idx):
    return self.input_ids[idx], self.attention_mask[idx]

class Identity(nn.Module):
  def __init__(self):
    super(Identity, self).__init__()
  def forward(self, x, target=None):
    return (x,)

'embedding' function takes an input pretrained model (in this case, it is TAPE), data folder, the input sequence file, and the output file name for naming. This function takes the average of the values across the sequence length.

In [19]:
def embedding(model, data_folder, input_sequence_file,output_file_name_header, batch_size=100):
  input_file = data_folder / input_sequence_file #data path (fasta)
  output_file = data_folder / (output_file_name_header + model + '.p') #output path (pickle)
  if model == 'TAPE':
    model = ProteinBertForMaskedLM.from_pretrained('bert-base')
  
    model.mlm = Identity()
    model.eval()
    embed = datasets.EmbedDataset(data_file=input_file, tokenizer='iupac')
    loader = data.DataLoader(embed, batch_size, False, collate_fn=embed.collate_fn)

    count_x = 0 
    model.cuda()

    seq_encodings = []
    seq_all_hiddens = []
    seq_ids = [] 

    for seq_batch in loader: ## Add .no_grad() here to see performance on GPU mem
      count_x += 1 
      print(count_x)
      ids, input_ids, input_mask = seq_batch["ids"], seq_batch["input_ids"], seq_batch["input_mask"]
      input_ids, input_mask = input_ids.cuda(), input_mask.cuda()
      with torch.no_grad():
        output = model(input_ids, input_mask)
      output = output[0].cpu().detach().numpy()
      features = []
      for seq_num in range(len(output)):
        seq_len = (input_mask[seq_num]==1).sum()
        seq_emd = output[seq_num][1:seq_len-1]
        seq_all_hiddens.append(seq_emd)
        features.append(np.mean(seq_emd, axis=0))
      features = np.stack(features)
      print('features.shape: ', features.shape)
      seq_encodings.append(features)
      seq_ids += ids
    seq_embeddings = np.concatenate(seq_encodings)
    print('seq_embeddings.shape: ', seq_embeddings.shape)
    # seq_embedding_output = {"seq_embeddings":seq_embeddings, "seq_ids":seq_ids, "seq_all_hiddens":seq_all_hiddens}
    seq_embedding_output = {"seq_embeddings":seq_embeddings, "seq_ids":seq_ids}
    print('done')
  return seq_embedding_output

Defining input seqeuence data as fasta file and performing embedding. The embedding data is stored as a pickle file in the directory

In [20]:
data_folder = Path(file_dir)
input_sequence_fasta = 'postprocessedECnumtoSeq.fasta'
batch_size = 5
output_file_name_header = 'resultsoutputnew_'

seq_embedding_output = embedding('TAPE', data_folder, input_sequence_fasta, output_file_name_header, batch_size)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
901
features.shape:  (5, 768)
902
features.shape:  (5, 768)
903
features.shape:  (5, 768)
904
features.shape:  (5, 768)
905
features.shape:  (5, 768)
906
features.shape:  (5, 768)
907
features.shape:  (5, 768)
908
features.shape:  (5, 768)
909
features.shape:  (5, 768)
910
features.shape:  (5, 768)
911
features.shape:  (5, 768)
912
features.shape:  (5, 768)
913
features.shape:  (5, 768)
914
features.shape:  (5, 768)
915
features.shape:  (5, 768)
916
features.shape:  (5, 768)
917
features.shape:  (5, 768)
918
features.shape:  (5, 768)
919
features.shape:  (5, 768)
920
features.shape:  (5, 768)
921
features.shape:  (5, 768)
922
features.shape:  (5, 768)
923
features.shape:  (5, 768)
924
features.shape:  (5, 768)
925
features.shape:  (5, 768)
926
features.shape:  (5, 768)
927
features.shape:  (5, 768)
928
features.shape:  (5, 768)
929
features.shape:  (5, 768)
930
features.shape:  (5, 768)
931
features.shape:  (5, 768)
932
f

In [25]:
len(seq_embedding_output['seq_embeddings'])

16993

Extracting the sequence embedding files from the directory

Extracting ec number labels 

In [22]:
import pandas as pd
EC_path = (data_folder / 'postprocessedECnumToSeq-edited.csv')
EC_sequence_data = pd.read_csv(EC_path)
EC_seq_data = (EC_sequence_data.iloc[:-1,2]).to_numpy()
display(EC_seq_data)

array([1, 1, 1, ..., 7, 7, 7])

Defining labels and saving seqeuence, ec number, embedding data into pickle file

In [34]:
seq_data=EC_sequence_data.iloc[1:, :]
cols = ['EC Number', 'sequence']
seq_data = seq_data[cols]
seq_data.columns = ['ECNumber', 'sequence']
# seq_data['seg_length'] = seq_data['sequence'].apply(len)
ec_encoder = LabelEncoder()
seq_data.loc[:,'ECLabel'] = ec_encoder.fit_transform(seq_data['ECNumber'])
seq_data.head()

Unnamed: 0,ECNumber,sequence,ECLabel
1,1,MHHHHHHSSGVDLGTENLYFQSNAMNISRKTALVTGASRGIGRAIA...,0
2,1,QGMIMTSDIKLLDYLRVRRSTPALQLSEPGPSKGEIEEILRLAVRV...,0
3,1,AMVSSSCSSIPKMPVTPLSLVTRHVAIIGAGAAGLVTARELRREGH...,0
4,1,MADLNQRRQRSEFQSKIKILLSTTIKAKPELVPSLLKLALNDAMTY...,0
5,1,MAELLLVETPIPQQKHYESKPFPAVISPPSASIPIPALSLPLFTQT...,0


In [38]:
len(seq_data)
embeddings = (seq_embedding_output['seq_embeddings'])
labels = seq_data.loc[:,'ECLabel'].to_numpy()
sequences = seq_data.loc[:,'sequence'].to_numpy()

seq_embedding_output = {"seq_embeddings": embeddings, "seq_labels": labels, "seq": sequences}

In [40]:
output_file = '/content/drive/MyDrive/Colab Notebooks/Protein Sequence Analysis/TAPE_embeddings.p'
with open(output_file, 'wb') as f:
  pickle.dump(seq_embedding_output, f)