In [2]:
# Colab-only: mount Google Drive at /content/drive so the dataset and output
# paths used later in this notebook (under /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install fair-esm
!pip install biopython

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0
Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import os
import torch
import pandas as pd
from transformers import BertModel, BertTokenizer
from Bio import SeqIO

# Debugging aid: CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous,
# so a device-side error is reported at the call that caused it. It slows GPU
# execution, so remove it for normal runs. It is set here, before any CUDA work
# is issued by the code below, because the variable is read when CUDA initialises.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

def read_fasta(fastain):
    """Parse a FASTA file into a two-column DataFrame.

    Args:
        fastain: Path (or handle) accepted by ``SeqIO.parse`` for a FASTA file.

    Returns:
        pandas.DataFrame with one row per record and the string columns
        ``"ID"`` (record id) and ``"Seq"`` (record sequence), in file order.
    """
    rows = []
    for rec in SeqIO.parse(fastain, "fasta"):
        # Force plain strings so the frame holds no Biopython objects.
        rows.append([str(rec.id), str(rec.seq)])

    print(f"Read {len(rows)} sequences from FASTA file.")
    frame = pd.DataFrame(rows, columns=["ID", "Seq"])
    return frame

def get_biobert_embeddings(input_fasta, output_embeddings_csv, model_dir=None):
    """Extract one mean-pooled BioBERT vector per FASTA record and save them as CSV.

    Each sequence is tokenized on its own (truncated to 512 tokens), passed
    through ``dmis-lab/biobert-v1.1``, and the last hidden layer is averaged
    over the sequence's real tokens. Special tokens ([CLS]/[SEP]) and padding
    are excluded from the average.

    Args:
        input_fasta: Path to the input FASTA file.
        output_embeddings_csv: Destination CSV path. Missing parent directories
            are created. Rows are indexed by record ID, one column per hidden
            dimension.
        model_dir: Optional cache directory handed to ``from_pretrained``.

    Returns:
        dict mapping record ID -> 1-D ``numpy.ndarray`` embedding. If the FASTA
        file contains duplicate IDs, the last occurrence wins (a warning is
        printed).
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Read the input first so a wrong path fails before the (slow) model load.
    sequences = read_fasta(input_fasta)
    n_duplicates = int(sequences["ID"].duplicated().sum())
    if n_duplicates:
        print(f"Warning: {n_duplicates} duplicate ID(s) in FASTA file; later records overwrite earlier ones.")

    # Make sure the destination exists up front, so the run cannot fail at the
    # very end after every embedding has already been computed.
    output_dir = os.path.dirname(output_embeddings_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load BioBERT tokenizer and model
    biobert_model_name = "dmis-lab/biobert-v1.1"
    tokenizer = BertTokenizer.from_pretrained(biobert_model_name, cache_dir=model_dir)
    model = BertModel.from_pretrained(biobert_model_name, cache_dir=model_dir)
    model = model.to(device)
    model.eval()

    embeddings = {}
    for identifier, sequence in zip(sequences["ID"], sequences["Seq"]):
        # Collapse the residue codes U, Z and O to X before tokenizing
        # (presumably the usual normalisation of rare/ambiguous amino acids).
        sequence = sequence.replace('U', 'X').replace('Z', 'X').replace('O', 'X')
        tokenized_input = tokenizer(
            sequence,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            return_special_tokens_mask=True,  # 1 marks [CLS]/[SEP]/[PAD] positions
        )

        input_ids = tokenized_input['input_ids'].to(device)
        attention_mask = tokenized_input['attention_mask'].to(device)
        special_tokens_mask = tokenized_input['special_tokens_mask'].to(device)

        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)

        # Batch size is 1, so index 0 gives the (tokens, hidden) matrix.
        emb = output.last_hidden_state[0]

        # Keep only attended, non-special positions for the average.
        keep = attention_mask[0].bool() & ~special_tokens_mask[0].bool()
        if keep.any():
            emb_avg = emb[keep].mean(dim=0)
        else:
            # Empty sequence: only special tokens exist. Fall back to averaging
            # them rather than producing a NaN vector.
            emb_avg = emb.mean(dim=0)

        embeddings[identifier] = emb_avg.cpu().numpy()

    # Save embeddings to CSV file
    df = pd.DataFrame.from_dict(embeddings, orient='index')
    df.to_csv(output_embeddings_csv)

    print(f'Total embeddings extracted: {len(embeddings)}')
    return embeddings

# Example usage
# Project root on the mounted Google Drive; all inputs/outputs live beneath it.
path = '/content/drive/MyDrive/Watashara_Projects/neurotoxic/'
print("Current Working Directory:", os.getcwd())
input_fasta_path = os.path.join(path, 'Features_extraction/datasets/peptides/cross_val_dataset_peptides.fasta')
output_embeddings_csv_path = os.path.join(path, 'features/peptides/TRN_BioBERT.csv')

# Bind the result to a name: a bare call as the cell's last expression would make
# the notebook echo the entire {id: vector} dictionary as cell output.
biobert_embeddings = get_biobert_embeddings(input_fasta_path, output_embeddings_csv_path)
