Imports

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, EsmModel

In [None]:
# Embed every protein sequence listed in a CSV file with an ESM model and
# store one mean-pooled embedding per protein in a Parquet file.
#
# Input : CSV with an "id" column and a "sequence" column.
# Output: "protein_embeddings.parquet" with columns id, dim_0, dim_1, ...

# Load ESM model and tokenizer
# NOTE(review): this checkpoint name looks like the ESMFold folding model;
# confirm that EsmModel loads the intended encoder weights from it, or
# point model_name at a plain ESM checkpoint instead.
model_name = "facebook/esmfold_v1"  # Adjust if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EsmModel.from_pretrained(model_name)
model.eval()  # Set model to evaluation mode

# Load the CSV file
input_csv = "protein_sequences.csv"  # Change to your filename
df = pd.read_csv(input_csv)

# Initialize list to store embeddings (one [id, v0, v1, ...] row per protein)
embeddings_list = []

# Process each protein sequence. Only the two needed columns are iterated,
# which avoids building a Series object per row as iterrows() does.
for protein_id, sequence in zip(df["id"], df["sequence"]):
    # Tokenize the sequence (a single sequence per call, so a batch of one)
    inputs = tokenizer(sequence, return_tensors="pt", add_special_tokens=True)

    # Compute embeddings (disable gradient calculations)
    with torch.no_grad():
        outputs = model(**inputs)
        # Mean pooling over the token axis. The average covers every token
        # position, including the special tokens the tokenizer added.
        embeddings = outputs.last_hidden_state.mean(dim=1)

    # Drop only the batch axis before converting; a bare squeeze() would
    # also collapse any other axis that happens to have size 1.
    embedding_vector = embeddings.squeeze(0).tolist()

    # Store in list
    embeddings_list.append([protein_id] + embedding_vector)

# Determine the embedding width without relying on a variable leaked from the
# loop: with an empty CSV the loop body never runs, so fall back to the
# model's configured hidden size and still emit a well-formed (empty) table.
if embeddings_list:
    embedding_dim = len(embeddings_list[0]) - 1  # minus the leading id
else:
    embedding_dim = model.config.hidden_size

# Convert to DataFrame
columns = ["id"] + [f"dim_{i}" for i in range(embedding_dim)]
embedding_df = pd.DataFrame(embeddings_list, columns=columns)

# Save as Parquet file
embedding_df.to_parquet("protein_embeddings.parquet", index=False)

print("Embeddings saved to protein_embeddings.parquet")