In [1]:
import pandas as pd
import numpy as np
import h5py
import numpy as np
from Bio import SeqIO
from keras import backend as K
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_auc_score
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow as tf
from tqdm import tqdm
from tabulate import tabulate  # Make sure to install tabulate package
import re
import torch
from transformers import T5EncoderModel, T5Tokenizer
import gc

# Everything below runs on the CPU; the encoder is moved there explicitly.
device = torch.device('cpu')

# ProtT5 tokenizer; do_lower_case=False is passed so input case is left as given.
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)

# Encoder-only ProtT5 model, placed on `device` and switched to eval mode
# because it is only used for feature extraction in this notebook.
pretrained_model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50").to(device).eval()

# Train / validation / test tables, read from the working directory.
my_train = pd.read_csv('my_train.csv')
my_valid = pd.read_csv('my_valid.csv')
my_test = pd.read_csv('my_test.csv')

2024-02-02 23:40:02.859301: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-02 23:40:03.373760: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-02 23:40:04.559316: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-02-02 23:40:04.559584: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
def get_protT5_features(sequence):
    """Return ProtT5 last-hidden-state embeddings for one protein sequence.

    The sequence is normalised (whitespace removed, rare residues U/Z/O/B
    mapped to X), re-joined with one space between residues as the ProtT5
    tokenizer expects, and run through the module-level ``pretrained_model``.

    Args:
        sequence: Amino-acid sequence; anything accepted by ``str()``.
            Residues may or may not already be separated by whitespace.

    Returns:
        A 2-D NumPy array holding the last hidden state for every attended
        token except the final one (presumably the end-of-sequence token
        appended by ``add_special_tokens=True``), i.e. one row per residue.
    """
    # Ensure the sequence is a string
    sequence = str(sequence)

    # Drop any existing whitespace so already-spaced input is not double-spaced
    sequence = "".join(sequence.split())

    # Replace rare amino acids with X
    sequence = re.sub(r"[UZOB]", "X", sequence)

    # ProtT5 expects residues separated by single spaces; without them the
    # tokenizer does not emit one token per residue and row i of the output
    # would no longer correspond to residue i.
    spaced_sequence = " ".join(sequence)

    # Tokenize the sequence (batch of one)
    ids = tokenizer.batch_encode_plus([spaced_sequence], add_special_tokens=True, padding=True)
    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)

    # Extract features from the pretrained model without tracking gradients
    with torch.no_grad():
        embedding = pretrained_model(input_ids=input_ids, attention_mask=attention_mask)

    # Extract the last hidden state as a NumPy array
    embedding = embedding.last_hidden_state.cpu().numpy()

    # Number of attended tokens, as a plain int so it can slice a NumPy array
    seq_len = int((attention_mask[0] == 1).sum().item())

    # Keep every attended token except the last one
    seq_emd = embedding[0][:seq_len - 1]

    return seq_emd

def get_input_for_embedding(fasta_file):
    """Embed every record of a FASTA file with ProtT5.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse`` in "fasta" mode.

    Returns:
        A list with one ``get_protT5_features`` result per FASTA record,
        in file order.
    """
    # Read the whole file first so parsing errors surface before any
    # (expensive) embedding work starts.
    sequences = [str(record.seq) for record in SeqIO.parse(fasta_file, "fasta")]

    # One ProtT5 embedding per sequence
    return [get_protT5_features(residues) for residues in sequences]


In [9]:
def aggregate_emb(hf_path, test_df, extract_pos=16):
    """Embed each sequence, keep one position's vector, and save to HDF5.

    For every entry of ``test_df['sequence']`` the ProtT5 embedding is
    computed and the row at index ``extract_pos`` is kept. The stacked rows
    are written as dataset ``'embedding'`` to ``hf_path`` with ".h5"
    replaced by "_trial.h5".

    Args:
        hf_path: Base output path; every ".h5" occurrence is rewritten to
            "_trial.h5" to form the actual file name.
        test_df: Table with a 'sequence' column.
        extract_pos: Row index of the per-sequence embedding to keep.

    Returns:
        None. The shape of the stacked array is printed.
    """
    # One vector per sequence: the embedding row at `extract_pos`
    picked_rows = [
        get_protT5_features(seq)[extract_pos, :]
        for seq in test_df['sequence']
    ]

    # Stack into a single (n_sequences, embedding_dim) array
    test_X = np.array(picked_rows)

    print("Size of X_test:", test_X.shape)

    # Persist the stacked embeddings
    out_path = hf_path.replace(".h5", "_trial.h5")
    with h5py.File(out_path, 'w') as hf:
        hf.create_dataset('embedding', data=test_X)

    

In [10]:
# Full per-token embeddings for every record in the FASTA file.
test_Y_embedding = get_input_for_embedding('./test_Pos_Neg_Y.fasta')

# Row-16 embeddings for the test table; aggregate_emb rewrites ".h5" to
# "_trial.h5", so the file written is ./embeddings/trial_trial.h5.
aggregate_emb(hf_path="./embeddings/trial.h5", test_df=my_test, extract_pos=16)


Size of X_test: (50, 1024)
