In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install fair-esm
!pip install biopython

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0
Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import os
import torch
import h5py
import pandas as pd
from transformers import T5EncoderModel, T5Tokenizer
from transformers import BertForMaskedLM, BertTokenizer, pipeline
from Bio import SeqIO
from transformers import BertTokenizer



os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


def read_fasta(fastain):
    records = list(SeqIO.parse(fastain, "fasta"))
    id_seqs = []
    for i in range(len(records)):
        id = str(records[i].id)
        ss = str(records[i].seq)
        id_seqs.append([id, ss])

    id_seqs = pd.DataFrame(id_seqs, columns=["ID", "Seq"])

    print("Read fasta file DONE!")
    return id_seqs

def get_t5_embeddings(input_fasta, output_embeddings_h5, output_embeddings_csv, model_dir=None):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Using device: {}".format(device))

    # Load T5 model and tokenizer(prot_t5_xl_bfd,  prot_t5_xl_uniref50, prot_albert,prot_xlnet,Prot_bert_bfd)
    transformer_link = "Rostlab/prot_t5_xl_bfd"

    model = T5EncoderModel.from_pretrained(transformer_link, cache_dir=model_dir)
    model = model.to(device)
    model = model.eval()
    vocab = T5Tokenizer.from_pretrained(transformer_link, do_lower_case=False)
    # Read protein sequences from FASTA file
    sequences = read_fasta(input_fasta)

    # Process sequences and obtain embeddings
    embeddings = {}
    for identifier, sequence in zip(sequences["ID"], sequences["Seq"]):
        sequence = sequence.replace('U', 'X').replace('Z', 'X').replace('O', 'X')
        sequence = ' '.join(list(sequence))

        token_encoding = vocab.encode_plus(sequence, add_special_tokens=True, padding="longest")
        # print(token_encoding)

        input_ids = torch.tensor(token_encoding['input_ids']).to(device)
        attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)


        with torch.no_grad():
            embedding_repr = model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))

        # Slice-off padded/special tokens
        emb = embedding_repr.last_hidden_state.squeeze(0)

        embeddings[identifier] = emb.mean(dim=0).detach().cpu().numpy().squeeze()

    # Save embeddings to an HDF5 file
    # with h5py.File(output_embeddings_h5, "w") as hf:
    #     for sequence_id, embedding in embeddings.items():
    #         hf.create_dataset(sequence_id, data=embedding)

    # Save embeddings to a CSV file
    df = pd.DataFrame.from_dict(embeddings, orient='index')
    df.to_csv(output_embeddings_csv)

    print('Total number of embeddings: {}'.format(len(embeddings)))
    return embeddings

# Example usage:
path = '/content/drive/MyDrive/Watashara_Projects/neurotoxic/'
print("Current Working Directory:", os.getcwd())
input_fasta_path = path+'Features_extraction/datasets/peptides/independent_dataset_peptides.fasta'
output_embeddings_h5_path = path+'features/peptides/TRN_prot_t5_xl_bfd.h5'
output_embeddings_csv_path = path+'features/peptides/IND_prot_t5_xl_bfd.csv'

# Rest of your script...
get_t5_embeddings(input_fasta_path, output_embeddings_h5_path, output_embeddings_csv_path)

Current Working Directory: /content
Using device: cuda:0
Read fasta file DONE!
Total number of embeddings: 351


{'1_0': array([ 0.08148295,  0.14007026,  0.0665552 , ..., -0.01752453,
         0.00343996, -0.05963172], dtype=float32),
 '1_1': array([0.09998729, 0.01186462, 0.09566975, ..., 0.18958114, 0.00662936,
        0.06741942], dtype=float32),
 '1_2': array([0.20771281, 0.09274427, 0.07202883, ..., 0.04140832, 0.03336206,
        0.006126  ], dtype=float32),
 '1_3': array([0.09825114, 0.10553127, 0.12641118, ..., 0.01542141, 0.09533481,
        0.05179782], dtype=float32),
 '1_4': array([ 0.08544825, -0.01453502,  0.07990216, ...,  0.10882458,
         0.0308098 ,  0.07823423], dtype=float32),
 '1_5': array([ 0.08199111, -0.01741029,  0.01661752, ...,  0.02036709,
         0.04697378,  0.00401336], dtype=float32),
 '1_6': array([ 0.13576975,  0.12807658, -0.086144  , ...,  0.08011601,
         0.02760992,  0.02345183], dtype=float32),
 '1_7': array([0.12457325, 0.10524795, 0.1298785 , ..., 0.01487848, 0.10381052,
        0.01023285], dtype=float32),
 '1_8': array([0.07895229, 0.0224076 , 0

**PortBet_Bd**

In [None]:
#independent_dataset_combined  cross_val_dataset_combined
#cross_val_dataset_peptides  independent_dataset_peptides
#cross_val_dataset_protein   independent_dataset_protein


In [None]:
import os
import torch
import h5py
import pandas as pd
from transformers import BertForMaskedLM, BertTokenizer
from Bio import SeqIO

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

def read_fasta(fastain):
    records = list(SeqIO.parse(fastain, "fasta"))
    id_seqs = []
    for record in records:
        id_seqs.append([str(record.id), str(record.seq)])

    id_seqs = pd.DataFrame(id_seqs, columns=["ID", "Seq"])
    print("Read fasta file DONE!")
    return id_seqs

def get_t5_embeddings(input_fasta, output_embeddings_h5, output_embeddings_csv, model_dir=None):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Using device:", device)

    # Correct transformer link
    transformer_link = "Rostlab/Prot_bert_bfd"

    # Load BERT tokenizer and model (since this is a BERT-based model)
    tokenizer = BertTokenizer.from_pretrained(transformer_link, do_lower_case=False)
    model = BertForMaskedLM.from_pretrained(transformer_link, cache_dir=model_dir)
    model.to(device)
    model.eval()

    # Read protein sequences from FASTA file
    sequences = read_fasta(input_fasta)

    # Process sequences and obtain embeddings
    embeddings = {}
    for identifier, sequence in zip(sequences["ID"], sequences["Seq"]):
        sequence = sequence.replace('U', 'X').replace('Z', 'X').replace('O', 'X')
        sequence = ' '.join(list(sequence))

        token_encoding = tokenizer.encode_plus(sequence, add_special_tokens=True, padding="longest")

        # Debugging print
        print(f"Tokenized sequence for {identifier}: {token_encoding}")

        # Convert input tensors explicitly to torch.long
        input_ids = torch.tensor(token_encoding['input_ids'], dtype=torch.long).to(device)
        attention_mask = torch.tensor(token_encoding['attention_mask'], dtype=torch.long).to(device)

        # Ensure valid input
        if input_ids.shape[0] == 0:
            print(f"Skipping sequence {identifier} due to empty input.")
            continue

        # Debugging: Print tensor shapes
        print(f"Processing {identifier} -> Input IDs shape: {input_ids.shape}, Attention mask shape: {attention_mask.shape}")

        with torch.no_grad():
            embedding_repr = model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))

        # Extract embeddings and remove special tokens
        emb = embedding_repr.logits.squeeze(0)
        embeddings[identifier] = emb.mean(dim=0).detach().cpu().numpy().squeeze()

    # Save embeddings to CSV file
    df = pd.DataFrame.from_dict(embeddings, orient='index')
    df.to_csv(output_embeddings_csv)

    print("Total number of embeddings:", len(embeddings))
    return embeddings

# Example usage:
path = '/content/drive/MyDrive/Watashara_Projects/9-AVP/'
print("Current Working Directory:", os.getcwd())

input_fasta_path = path + 'Features_extraction/TR_pos_neg.fasta'
output_embeddings_csv_path = path + 'features/Train_Prot_bert_bf.csv'

# Run the function
get_t5_embeddings(input_fasta_path, None, output_embeddings_csv_path)


Current Working Directory: /content
Using device: cuda:0


OSError: Can't load tokenizer for 'Rostlab/prot_xlnet'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'Rostlab/prot_xlnet' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [None]:
import os
import torch
import h5py
import pandas as pd
from transformers import BertForMaskedLM, BertTokenizer
from Bio import SeqIO

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

def read_fasta(fastain):
    records = list(SeqIO.parse(fastain, "fasta"))
    id_seqs = []
    for record in records:
        id_seqs.append([str(record.id), str(record.seq)])

    id_seqs = pd.DataFrame(id_seqs, columns=["ID", "Seq"])
    print("Read fasta file DONE!")
    return id_seqs

def get_t5_embeddings(input_fasta, output_embeddings_h5, output_embeddings_csv, model_dir=None):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Using device:", device)

    # Correct transformer link
    transformer_link = "Rostlab/prot_t5_xl_uniref50"

    # Load BERT tokenizer and model (since this is a BERT-based model)
    tokenizer = BertTokenizer.from_pretrained(transformer_link, do_lower_case=False)
    model = BertForMaskedLM.from_pretrained(transformer_link, cache_dir=model_dir)
    model.to(device)
    model.eval()

    # Read protein sequences from FASTA file
    sequences = read_fasta(input_fasta)

    # Process sequences and obtain embeddings
    embeddings = {}
    for identifier, sequence in zip(sequences["ID"], sequences["Seq"]):
        sequence = sequence.replace('U', 'X').replace('Z', 'X').replace('O', 'X')
        sequence = ' '.join(list(sequence))

        token_encoding = tokenizer.encode_plus(sequence, add_special_tokens=True, padding="longest")

        # Debugging print
        print(f"Tokenized sequence for {identifier}: {token_encoding}")

        # Convert input tensors explicitly to torch.long
        input_ids = torch.tensor(token_encoding['input_ids'], dtype=torch.long).to(device)
        attention_mask = torch.tensor(token_encoding['attention_mask'], dtype=torch.long).to(device)

        # Ensure valid input
        if input_ids.shape[0] == 0:
            print(f"Skipping sequence {identifier} due to empty input.")
            continue

        # Debugging: Print tensor shapes
        print(f"Processing {identifier} -> Input IDs shape: {input_ids.shape}, Attention mask shape: {attention_mask.shape}")

        with torch.no_grad():
            embedding_repr = model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))

        # Extract embeddings and remove special tokens
        emb = embedding_repr.logits.squeeze(0)
        embeddings[identifier] = emb.mean(dim=0).detach().cpu().numpy().squeeze()

    # Save embeddings to CSV file
    df = pd.DataFrame.from_dict(embeddings, orient='index')
    df.to_csv(output_embeddings_csv)

    print("Total number of embeddings:", len(embeddings))
    return embeddings

# Example usage:
path = '/content/drive/MyDrive/Watashara_Projects/9-AVP/'
print("Current Working Directory:", os.getcwd())

input_fasta_path = path + 'Features_extraction/TR_pos_neg.fasta'
output_embeddings_csv_path = path + 'features/Train_Prot_t5_xl_uniref50.csv'

# Run the function
get_t5_embeddings(input_fasta_path, None, output_embeddings_csv_path)


In [None]:
#independent_dataset_combined  cross_val_dataset_combined
#cross_val_dataset_peptides  independent_dataset_peptides
#cross_val_dataset_protein   independent_dataset_protein