In [1]:
import gc
import os
import re

import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from Bio import SeqIO
from keras import backend as K
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_auc_score
from tabulate import tabulate  # Make sure to install tabulate package
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm
from transformers import T5EncoderModel, T5Tokenizer

# All ProtT5 inference in this notebook runs on the CPU.
device = torch.device('cpu')

# Tokenizer and encoder-only model come from the same pretrained checkpoint.
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)
pretrained_model = (
    T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")
    .to(device)  # move the weights to the selected device
    .eval()      # switch to inference mode
)

# Training / validation tables; later cells read their 'sequence' column.
my_train = pd.read_csv('my_train2.csv')
my_valid = pd.read_csv('my_valid2.csv')

2024-02-03 22:38:18.046523: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-03 22:38:18.404578: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-03 22:38:19.768862: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-02-03 22:38:19.769071: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
def get_protT5_features(sequence):
    """Embed a single protein sequence with the pretrained ProtT5 encoder.

    The sequence is normalised before tokenisation: it is cast to ``str``,
    the rare amino acids U/Z/O/B are replaced by ``X``, all whitespace is
    removed, and the residues are then joined with single spaces. The ProtT5
    tokenizer expects space-separated residues; passing an unspaced string
    does not yield one token per residue. Input that is already
    space-separated is left effectively unchanged by this normalisation.

    Uses the module-level ``tokenizer``, ``pretrained_model`` and ``device``.

    Args:
        sequence: Amino-acid sequence (any object; converted with ``str``).

    Returns:
        2-D ``numpy.ndarray`` holding the encoder's last hidden state for every
        attended token except the final one (the end-of-sequence token that
        the tokenizer appends when ``add_special_tokens=True``).
        An empty sequence yields an array with zero rows.
    """
    # Ensure the sequence is a string
    sequence = str(sequence)

    # Replace rare amino acids with X
    sequence = re.sub(r"[UZOB]", "X", sequence)

    # Drop any existing whitespace, then put exactly one space between
    # residues so the tokenizer emits one token per amino acid.
    sequence = " ".join("".join(sequence.split()))

    # Tokenize the (single-item) batch
    encoded = tokenizer.batch_encode_plus([sequence], add_special_tokens=True, padding=True)
    input_ids = torch.tensor(encoded['input_ids']).to(device)
    attention_mask = torch.tensor(encoded['attention_mask']).to(device)

    # Forward pass without building an autograd graph
    with torch.no_grad():
        output = pretrained_model(input_ids=input_ids, attention_mask=attention_mask)

    # Last hidden state of the batch as a numpy array
    hidden = output.last_hidden_state.cpu().numpy()

    # Number of attended tokens, as a plain int so it can be used in a slice
    n_tokens = int((attention_mask[0] == 1).sum())

    # Keep everything except the trailing special token
    return hidden[0][:n_tokens - 1]

def get_input_for_embedding(fasta_file):
    """Embed every record of a FASTA file with ``get_protT5_features``.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.

    Returns:
        list with one ``get_protT5_features`` result per FASTA record, in file order.
    """
    # Read all records first, so parsing completes before any embedding is computed.
    sequences = [str(record.seq) for record in SeqIO.parse(fasta_file, "fasta")]

    # One embedding call per sequence, preserving order.
    return list(map(get_protT5_features, sequences))


In [7]:
def _embed_first_rows(sequences):
    """Return an array with one vector per sequence: row 0 of its ``get_protT5_features`` output."""
    return np.array([get_protT5_features(seq)[0] for seq in sequences])


def _write_embedding(path, data):
    """Write ``data`` to ``path`` as an HDF5 dataset named 'embedding', creating the parent directory if needed."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with h5py.File(path, 'w') as hf:
        hf.create_dataset('embedding', data=data)


def aggregate_emb(hf_path, train_df, valid_df, extract_pos=16):
    """Embed the train and validation sequences and save them to two HDF5 files.

    For every entry of ``train_df['sequence']`` and ``valid_df['sequence']`` the
    first row of the matrix returned by ``get_protT5_features`` is kept. The
    stacked vectors are written to ``<root>_train<ext>`` and ``<root>_valid<ext>``,
    where ``root``/``ext`` come from splitting ``hf_path`` at its extension, so
    the two files are always distinct even if ``hf_path`` does not end in ".h5".
    The output directory is created when it does not exist.

    Args:
        hf_path: Base output path, e.g. "./embeddings/data2.h5".
        train_df: Mapping/DataFrame with a 'sequence' column of training sequences.
        valid_df: Mapping/DataFrame with a 'sequence' column of validation sequences.
        extract_pos: Accepted for interface compatibility; currently NOT used —
            row 0 of each feature matrix is always taken.
            NOTE(review): confirm whether the row at ``extract_pos`` was intended.

    Raises:
        IndexError: if a sequence produces an empty feature matrix.
    """
    # Split once so "_train"/"_valid" are inserted before the real extension only.
    root, ext = os.path.splitext(hf_path)

    # Compute both splits before touching the filesystem.
    train_X = _embed_first_rows(train_df['sequence'])
    valid_X = _embed_first_rows(valid_df['sequence'])

    # Save the training embeddings to an h5 file
    print("Size of X_train:", train_X.shape)
    _write_embedding(root + "_train" + ext, train_X)

    # Save the validation embeddings to an h5 file
    print("Size of X_valid:", valid_X.shape)
    _write_embedding(root + "_valid" + ext, valid_X)

In [8]:
def aggregate_emb1(hf_path, seq_list):
    """Embed every sequence in ``seq_list`` and save the stacked result to one HDF5 file.

    Each sequence is passed to ``get_protT5_features``; the results are stacked
    with ``np.array``. When every result has exactly one row, that singleton
    axis is removed so the saved array is 2-D; otherwise the stacked array is
    saved as-is instead of failing in ``np.squeeze``. The array shape is printed
    and the data is stored under the dataset name 'embedding'. The parent
    directory of ``hf_path`` is created when missing.

    Args:
        hf_path: Output HDF5 file path (opened in write mode, overwriting).
        seq_list: Iterable of sequences to embed.
    """
    X = np.array([get_protT5_features(seq) for seq in seq_list])

    # Only drop axis 1 when it really is a singleton; an unconditional squeeze
    # raises ValueError as soon as a sequence yields more than one row.
    if X.ndim == 3 and X.shape[1] == 1:
        X = np.squeeze(X, axis=1)
    print(X.shape)

    parent = os.path.dirname(hf_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Context manager guarantees the file is closed even if the write fails.
    with h5py.File(hf_path, 'w') as hf:
        hf.create_dataset('embedding', data=X)


In [9]:
# Embed every record of the FASTA file; the resulting list is kept in memory.
test_Y_embedding = get_input_for_embedding(fasta_file='./D+P_Equal.fasta')

# Embed the train / validation sequences and write them to HDF5 files derived
# from the given path ("_train" / "_valid" inserted before ".h5").
aggregate_emb(
    hf_path="./embeddings/data2.h5",
    train_df=my_train,
    valid_df=my_valid,
    extract_pos=16,
)


Size of X_train: (323, 1024)
Size of X_valid: (81, 1024)
