In [None]:
!pip install torch esm pandas
!pip install fair-esm --upgrade
!pip install biopython

In [None]:
import torch
import esm
# Load the pretrained ESM-2 model (esm2_t33_650M_UR50D) together with its alphabet.
# NOTE(review): the embedding-extraction cell further down performs this same load
# again, so running both cells loads the model twice.
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
# The converter obtained from the alphabet is what later turns (id, sequence) pairs
# into labels, strings and a token batch.
batch_converter = alphabet.get_batch_converter()
model.eval()  # Set the model to evaluation mode

In [None]:
# Mount Google Drive inside the Colab runtime. The FASTA input read later in this
# notebook lives under /content/drive/MyDrive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import esm
import pandas as pd
from Bio import SeqIO

# Extract one fixed-length ESM-2 embedding per FASTA record and save them as CSV.
#
# Steps: load the model, read (id, sequence) pairs, tokenize them, run the model in
# small chunks, mean-pool the layer-33 token embeddings over the residue positions
# of each sequence, and write one CSV row per record (index = record id).

# === Load ESM-2 (33-layer, 650M parameter model) ===
print("📥 Loading ESM-2 650M (33 layers)...")
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()  # evaluation mode

# === Load sequences from FASTA ===
fasta_file = "/content/drive/MyDrive/Colab Notebooks/PIP-Training.fasta"
sequences = [(record.id, str(record.seq)) for record in SeqIO.parse(fasta_file, "fasta")]
if not sequences:
    # Fail early with a clear message instead of an opaque error from the tokenizer.
    raise ValueError(f"No sequences found in FASTA file: {fasta_file}")
print(f"✅ Loaded {len(sequences)} sequences from FASTA")

# === Prepare input batches ===
# Tokenization is cheap, so the whole file is converted at once; only the model
# forward pass below is chunked.
batch_labels, batch_strs, batch_tokens = batch_converter(sequences)

# Number of non-padding tokens in each row. Following the fair-esm README for
# ESM-2, each row is <cls> + residue tokens + <eos> followed by padding, so the
# residue span of row i is [1, batch_lens[i] - 1). Counting tokens (rather than
# characters of the raw string) keeps the span right even when a record contains
# characters the alphabet does not map one-to-one to tokens.
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

# A record with no residue tokens would average an empty slice and produce NaNs.
records_without_residues = [
    label for label, tokens_len in zip(batch_labels, batch_lens.tolist()) if tokens_len <= 2
]
if records_without_residues:
    raise ValueError(f"FASTA records with no residues: {records_without_residues}")

# === Compute embeddings (Layer 33) ===
print("🧠 Extracting embeddings from layer 33...")
repr_layer = 33  # representation layer to extract
batch_size = 8   # sequences per forward pass; lower this if memory runs out

# === Average token embeddings (excluding special tokens) ===
sequence_representations = []
with torch.no_grad():
    for start in range(0, len(sequences), batch_size):
        chunk_lens = batch_lens[start:start + batch_size]
        # Drop trailing columns that are padding for every row of this chunk, so
        # short sequences are not padded out to the longest sequence in the file.
        chunk_tokens = batch_tokens[start:start + batch_size, :int(chunk_lens.max())]
        results = model(chunk_tokens, repr_layers=[repr_layer])
        token_representations = results["representations"][repr_layer]
        for row, tokens_len in enumerate(chunk_lens.tolist()):
            rep = token_representations[row, 1:tokens_len - 1].mean(0)
            sequence_representations.append(rep)

# === Convert to DataFrame and save ===
sequence_representations = torch.stack(sequence_representations).numpy()
df = pd.DataFrame(sequence_representations, index=[s[0] for s in sequences])
output_file = "ESM2_Layer33_PIPs.csv"
df.to_csv(output_file)
print(f"✅ Features extracted from Layer 33 and saved as: {output_file}")