In [2]:
import os
import pickle
import numpy as np
import random
import time

In [3]:
# from transformers import T5EncoderModel, T5Tokenizer 
from transformers import T5EncoderModel, T5Tokenizer ,AlbertModel, AlbertTokenizer,BertModel, BertTokenizer
from transformers  import XLNetModel, XLNetTokenizer

import torch
import h5py
device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')
print("Using {}".format(device))


  from .autonotebook import tqdm as notebook_tqdm


Using cuda:7


In [4]:
def find_alphabet(string, alphabet):
    return alphabet in string


def one_hot_encode(sequence):
    # Define dictionary mapping amino acids to their indices
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_to_index = {aa: i for i, aa in enumerate(amino_acids)}
    
    # Initialize one-hot encoded sequence
    one_hot_sequence = []
    
    # Iterate over each amino acid in the sequence
    for aa in sequence:
        # Initialize one-hot encoding vector for current amino acid
        encoding = [0] * len(amino_acids)
        # Set the index corresponding to the amino acid to 1
        if find_alphabet(amino_acids, aa):
            encoding[aa_to_index[aa]] = 1
        # Append the one-hot encoding vector to the sequence
        one_hot_sequence.append(encoding)
    one_hot_sequence = np.array(one_hot_sequence)   
    return one_hot_sequence

def get_T5_model(device):
    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc")
    model = model.to(device) # move model to GPU
    model = model.eval() # set model to evaluation model
    tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)

    return model, tokenizer

def get_ProtBert(device):
    model = BertModel.from_pretrained("Rostlab/prot_bert")
    model = model.to(device) # move model to GPU
    model = model.eval() # set model to evaluation model
    tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)

    return model, tokenizer

def get_ProtT5_XL_BFD(device):
    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_bfd")
    model = model.to(device) # move model to GPU
    model = model.eval() # set model to evaluation model
    tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_bfd", do_lower_case=False )

    return model, tokenizer

def get_ProtXLNet(device):    
    xlnet_men_len = 512
    model = XLNetModel.from_pretrained("Rostlab/prot_xlnet",mem_len=xlnet_men_len)
    model = model.to(device) # move model to GPU
    model = model.eval() # set model to evaluation model
    tokenizer = XLNetTokenizer.from_pretrained("Rostlab/prot_xlnet", do_lower_case=False)

    return model, tokenizer

def get_ProtBert_BFD(device):
    model = BertModel.from_pretrained("Rostlab/prot_bert_bfd")
    model = model.to(device) # move model to GPU
    model = model.eval() # set model to evaluation model
    tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False )

    return model, tokenizer


def get_embeddings( model, tokenizer, seqs, per_residue, per_protein, sec_struct, 
                   max_residues=4000, max_seq_len=1000, max_batch=100 ):

    if sec_struct:
      sec_struct_model = load_sec_struct_model()

    results = {"residue_embs" : dict(), 
               "protein_embs" : dict(),
               "sec_structs" : dict() 
               }

    # sort sequences according to length (reduces unnecessary padding --> speeds up embedding)
    seq_dict   = sorted( seqs.items(), key=lambda kv: len( seqs[kv[0]] ), reverse=True )
    start = time.time()
    batch = list()
    for seq_idx, (pdb_id, seq) in enumerate(seq_dict,1):
        seq = seq
        seq_len = len(seq)
        seq = ' '.join(list(seq))
        batch.append((pdb_id,seq,seq_len))

        n_res_batch = sum([ s_len for  _, _, s_len in batch ]) + seq_len 
        if len(batch) >= max_batch or n_res_batch>=max_residues or seq_idx==len(seq_dict) or seq_len>max_seq_len:
            pdb_ids, seqs, seq_lens = zip(*batch)
            batch = list()

            token_encoding = tokenizer.batch_encode_plus(seqs, add_special_tokens=True, padding="longest")
            input_ids      = torch.tensor(token_encoding['input_ids']).to(device)
            attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)
            embedding_repr = model(input_ids, attention_mask=attention_mask)

            for batch_idx, identifier in enumerate(pdb_ids): # for each protein in the current mini-batch
                s_len = seq_lens[batch_idx]
                emb = embedding_repr.last_hidden_state[batch_idx,:s_len]
                if per_residue: # store per-residue embeddings (Lx1024)
                    results["residue_embs"][ identifier ] = emb.detach().cpu().numpy().squeeze()


    passed_time=time.time()-start

    
    return results


def get_features(seq_all):



    !mkdir protein_seqences
    data4 = seq_all




    temp= dict()
    for keys in data4.keys():
        if  len(data4[keys][0]) <=500:
            temp[keys] = data4[keys][0]



    i = 0
    j = 0
    test2 = dict()
    handle = 0
    for keys in temp.keys():
        # print(keys)
        handle = 0
        test2[keys] = temp[keys]

        if i >= 0:
            i = 0
            #/media/4TB_hardisk/sharzil/Downloads/alphapdb/dictionary_train/  this is output folder name
            #create this folder 
            with open('./protein_seqences/ppisequence'+str(j)+'.pkl', 'wb') as file:
                pickle.dump(test2, file)
            j = j+1
            handle = 1
            test2 = dict()
        i = i+1
    if handle == 0:
        with open('./protein_seqences/ppisequence'+str(j)+'.pkl', 'wb') as file:
            pickle.dump(test2, file)

    !mkdir protT5 # root directory for storing checkpoints, results etc
    !mkdir protT5/protT5_checkpoint # directory holding the ProtT5 checkpoint
    !mkdir protT5/sec_struct_checkpoint # directory storing the supervised classifier's checkpoint
    !mkdir protT5/output # directory for storing your embeddings & predictions
    !wget -nc -P protT5/ https://rostlab.org/~deepppi/example_seqs.fasta
    # Huge kudos to the bio_embeddings team here! We will integrate the new encoder, half-prec ProtT5 checkpoint soon
    !wget -nc -P protT5/sec_struct_checkpoint http://data.bioembeddings.com/public/embeddings/feature_models/t5/secstruct_checkpoint.pt


    seq_path = "./protT5/example_seqs.fasta"

    per_residue = True 
    per_residue_path = "./protT5/output/per_residue_embeddings.h5" # where to store the embeddings

    per_protein = True
    per_protein_path = "./protT5/output/per_protein_embeddings.h5" # where to store the embeddings

    sec_struct = False
    sec_struct_path = "./protT5/output/ss3_preds.fasta" # file for storing predictions

    assert per_protein is True or per_residue is True or sec_struct is True, print(
        "Minimally, you need to active per_residue, per_protein or sec_struct. (or any combination)")

    model, tokenizer = get_T5_model(device)
    # model, tokenizer = get_ProtBert(device)
    # model, tokenizer = get_ProtT5_XL_BFD(device)
    # model, tokenizer = get_ProtXLNet(device)
    # model, tokenizer =get_ProtBert_BFD(device)

    protein_feat_r=dict()
    i=0
    for index in seq_all.keys():
        temp=dict()
        temp[index]=seq_all[index]
        results=get_embeddings( model, tokenizer, temp,
                             per_residue, per_protein, sec_struct)
        i=i+1
        temp1=results['residue_embs']
        for index2 in temp1.keys():
            protein_feat_r[index2]=temp1[index2]
        
    protein_onehot=dict()

    for index in seq_all.keys():
        temp=seq_all[index]
        encoded_sequence = one_hot_encode(temp)
        protein_onehot[index]=encoded_sequence

    return protein_feat_r,protein_onehot


In [6]:
#feature loading and features extractions

In [7]:
pathdata='/media/4TB_hardisk/sharzil/Downloads/Dataset/'

# pathdata='./dataset/'
with open(pathdata+'Train_335.pkl', 'rb') as file:
    train = pickle.load(file) 
with open(pathdata+'Test_60.pkl', 'rb') as file:
    test = pickle.load(file)
with open(pathdata+'Test_20_new.pkl', 'rb') as file:
    test_20 = pickle.load(file)  
with open(pathdata+'UBtest_31.pkl', 'rb') as file:
    test_UB_31 = pickle.load(file)
with open(pathdata+'Btest_31.pkl', 'rb') as file:
    test_B_31 = pickle.load(file)
    
    
all_data=dict()
for index in train.keys():
    all_data[index]=train[index]
for index in test.keys():
    all_data[index]=test[index]
for index in test_20.keys():
    all_data[index]=test_20[index]
for index in test_UB_31.keys():
    all_data[index]=test_UB_31[index]
for index in test_B_31.keys():
    all_data[index]=test_B_31[index]

seq_all=dict()
for index in all_data.keys():
    temp=all_data[index]
    seq_all[index]=temp[0]
label_all=dict()
for index in all_data.keys():
    temp=all_data[index]
    label_all[index]=temp[1]


In [8]:
result1, result2=get_features(seq_all)

mkdir: cannot create directory ‘protein_seqences’: File exists
mkdir: cannot create directory ‘protT5’: File exists
mkdir: cannot create directory ‘protT5/protT5_checkpoint’: File exists
mkdir: cannot create directory ‘protT5/sec_struct_checkpoint’: File exists
mkdir: cannot create directory ‘protT5/output’: File exists
File ‘protT5/example_seqs.fasta’ already there; not retrieving.

File ‘protT5/sec_struct_checkpoint/secstruct_checkpoint.pt’ already there; not retrieving.



Downloading: 100%|██████████| 361/361 [00:00<00:00, 114kB/s]
Downloading: 100%|██████████| 1.57G/1.57G [01:16<00:00, 21.9MB/s] 
Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification

In [9]:

out_folder=('./feature_save')
with open(file=os.path.join(out_folder,'protein_feat_r.pkl'), mode='wb') as f:
    pickle.dump(result1, f)
with open(file=os.path.join(out_folder,'protein_feat_one.pkl'), mode='wb') as f:
    pickle.dump(result2, f)

In [10]:
pathdata='./pisite_dataset/'

with open(pathdata+'Test_70.pkl', 'rb') as file:
    test_70 = pickle.load(file) 
        
all_data=dict()
for index in test_70.keys():
    all_data[index]=test_70[index]
seq_all=dict()
for index in all_data.keys():
    temp=all_data[index]
    seq_all[index]=temp[0]
label_all=dict()
for index in all_data.keys():
    temp=all_data[index]
    label_all[index]=temp[1]

In [12]:
result1, result2=get_features(seq_all)




mkdir: cannot create directory ‘protein_seqences’: File exists
mkdir: cannot create directory ‘protT5’: File exists
mkdir: cannot create directory ‘protT5/protT5_checkpoint’: File exists
mkdir: cannot create directory ‘protT5/sec_struct_checkpoint’: File exists
mkdir: cannot create directory ‘protT5/output’: File exists
File ‘protT5/example_seqs.fasta’ already there; not retrieving.

File ‘protT5/sec_struct_checkpoint/secstruct_checkpoint.pt’ already there; not retrieving.



Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:

out_folder=('./feature_save')
with open(file=os.path.join(out_folder,'protein_feat_pisite_r.pkl'), mode='wb') as f:
    pickle.dump(result1, f)
with open(file=os.path.join(out_folder,'protein_feat_one.pkl'), mode='wb') as f:
    pickle.dump(result2, f)