In [None]:
!pip install transformers pandas biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
# Mount Google Drive at /content/drive; the FASTA input path used later in
# this notebook is located under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch  # Import PyTorch
from transformers import GPT2Tokenizer, GPT2Model
from Bio import SeqIO

# Hugging Face Hub identifier shared by the tokenizer and the model, kept in
# one place so the two can never point at different checkpoints.
PROTGPT2_CHECKPOINT = "nferruz/ProtGPT2"

# Load the ProtGPT2 tokenizer and the bare GPT-2 backbone (hidden states only).
tokenizer = GPT2Tokenizer.from_pretrained(PROTGPT2_CHECKPOINT)
model = GPT2Model.from_pretrained(PROTGPT2_CHECKPOINT)

def extract_features(sequence):
    """Embed one protein sequence as a fixed-size ProtGPT2 feature vector.

    The sequence is tokenized (truncated to at most 1024 tokens), run through
    the module-level ``model`` with gradient tracking disabled, and the last
    hidden state is averaged over the token axis.

    Args:
        sequence: Protein sequence as a string.

    Returns:
        A 1-D NumPy array holding the mean of the last hidden state over all
        tokens of the sequence.
    """
    # Tokenize and place the tensors on the model's device, so the function
    # keeps working if the model has been moved to a GPU.
    inputs = tokenizer(
        sequence, return_tensors="pt", max_length=1024, truncation=True
    ).to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Expected to be (1, tokens, hidden) for a single input string.
    last_hidden_state = outputs.last_hidden_state

    # Average over the token axis, drop only the batch axis (squeeze(0) cannot
    # accidentally remove another size-1 axis), and copy to host memory before
    # converting, since .numpy() is only valid for CPU tensors.
    features = last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()

    return features

def process_fasta(fasta_file):
    """Build a feature table for every record in a FASTA file.

    Each record's sequence is converted to a string and passed to
    ``extract_features``; the resulting vectors become the rows of a
    DataFrame.

    Args:
        fasta_file: Path (or handle) accepted by ``SeqIO.parse``.

    Returns:
        A pandas DataFrame with one row per FASTA record: a leading
        "Sequence" column followed by one column per feature dimension.
    """
    residue_strings = []
    embedding_rows = []

    # Walk the records once, embedding each sequence as it is read.
    for entry in SeqIO.parse(fasta_file, "fasta"):
        residues = str(entry.seq)
        residue_strings.append(residues)
        embedding_rows.append(extract_features(residues))

    # One row per record; the raw sequence goes in front of the features.
    table = pd.DataFrame(embedding_rows)
    table.insert(0, "Sequence", residue_strings)
    return table

# Path to the input FASTA file (on the mounted Google Drive)
fasta_file = "/content/drive/MyDrive/Colab Notebooks/AntiOxident Peptides 2025/AVP-test2-set.fasta"

# Destination CSV; kept in one variable so the status message below always
# names the file that was actually written.
output_csv = "ProtGPT2-AVP-Test2.csv"

# Process the FASTA file and extract features
df = process_fasta(fasta_file)

# Save the features to a CSV file
df.to_csv(output_csv, index=False)

# Report what was really produced (the previous message named an unrelated file).
print(f"Saved {len(df)} feature rows to '{output_csv}'.")

CBiLSuccSite positive.csv'.
