Imports

In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, EsmModel

In [2]:
from proteinclip.fasta_utils import read_fasta

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
# NOTE(review): this cell has no execution count and repeats the `!pwd` cell that
# follows it; it looks like a leftover and is a candidate for deletion.
!pwd

In [5]:
# Show the working directory: the "./data/..." and "data/..." paths used by the
# other cells are relative, so they only resolve when this is the project root.
!pwd

/storage/ice1/2/4/rmehta98/cse7850-compbio-project


In [3]:
# Read the FASTA archive into a mapping of header line -> sequence string.
fasta_data = read_fasta("./data/uniprot/uniprot_sprot.fasta.gz")

# Sanity check: display the sequence stored under the first header.
first_header = list(fasta_data.keys())[0]
fasta_data[first_header]

INFO:root:Read 572970 sequences from ./data/uniprot/uniprot_sprot.fasta.gz


'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL'

In [4]:
import pandas as pd

# Build one row per FASTA record.
# The id is the second "|"-separated field of the header's first
# space-separated token (e.g. "sp|P12345|NAME ..." -> "P12345").
df_dict = {
    "id": [header.split(" ")[0].split("|")[1] for header in fasta_data.keys()],
    "sequence": list(fasta_data.values()),
}
df = pd.DataFrame(df_dict)

In [6]:
# Character count of every sequence, then the largest of them.
sequence_lengths = df["sequence"].str.len()
max_length = sequence_lengths.max()
print("Maximum sequence length:", max_length)


Maximum sequence length: 35213


In [13]:
from tqdm.auto import tqdm
import torch
import pandas as pd
from transformers import AutoTokenizer
from transformers.models.esm.modeling_esm import EsmModel

from pathlib import Path

# ---------------------------------------------------------------------------
# Configuration (everything a reader is likely to tune lives here)
# ---------------------------------------------------------------------------
model_name = "facebook/esmfold_v1"  # Adjust if needed
batch_size = 32  # Sequences per forward pass; lower this if the GPU runs out of memory.
# Upper bound on tokens per sequence (special tokens included). Longer sequences
# are TRUNCATED. The dataset-wide `max_length` computed earlier is in the tens of
# thousands, which is far too long to push through a transformer in batches, so
# it is only used as a ceiling below. Raise this if your hardware allows.
max_tokens = 1024
output_path = "data/structural/esmfold_embeddings.parquet"

# Load ESM model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EsmModel.from_pretrained(model_name)

# Move the model to GPU if available and switch off dropout etc.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Never pad/truncate beyond what the data needs, and never beyond the cap.
# `max_length` comes from the earlier cell (df["sequence"].str.len().max()).
token_limit = min(int(max_length), max_tokens)

# Per-batch results are kept as CPU tensors and concatenated once at the end;
# converting every vector to a Python list would multiply the memory footprint.
ids_in_order = []
pooled_batches = []

# tqdm infers the number of steps from the range itself, so no manual `total`.
for start in tqdm(range(0, len(df), batch_size)):
    batch_df = df.iloc[start : start + batch_size]
    protein_ids = batch_df["id"].tolist()
    # The tokenizer expects plain Python strings, not a tensor.
    sequences = batch_df["sequence"].tolist()

    # padding=True pads to the longest sequence in this batch;
    # truncation=True cuts anything longer than `token_limit`.
    inputs = tokenizer(
        sequences,
        return_tensors="pt",
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=token_limit,
    )
    # Inputs must live on the same device as the model.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Compute embeddings (disable gradient computations for efficiency)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Mean-pool over real tokens only: padding positions are zeroed out via
        # the attention mask so a sequence's embedding does not depend on how
        # much padding its batch happened to need. Special tokens added by the
        # tokenizer are still included, as they are not masked.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        embeddings = (hidden * mask).sum(dim=1) / token_counts

    ids_in_order.extend(protein_ids)
    pooled_batches.append(embeddings.float().cpu())

# Assemble the result: first column is the protein id, the remaining columns are
# the embedding dimensions (stored as float32).
if pooled_batches:
    embedding_matrix = torch.cat(pooled_batches, dim=0).numpy()
else:
    # Empty input frame: still produce a correctly shaped, empty table.
    embedding_matrix = torch.empty((0, model.config.hidden_size)).numpy()

num_dims = embedding_matrix.shape[1]
columns = ["id"] + [f"dim_{i}" for i in range(num_dims)]
embedding_df = pd.DataFrame(embedding_matrix, columns=columns[1:])
embedding_df.insert(0, "id", ids_in_order)

# Save the DataFrame as a Parquet file (create the target folder first so a
# long run cannot fail at the very last step because of a missing directory).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
embedding_df.to_parquet(output_path, index=False)
print(f"Embeddings saved to {output_path}")


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight', 'esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 26.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 14.19 MiB is free. Including non-PyTorch memory, this process has 31.71 GiB memory in use. Of the allocated memory 31.40 GiB is allocated by PyTorch, and 12.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [12]:
# Free the memory held by the model before loading another one.
# `del` only removes the name; the garbage collector has to reclaim the object,
# and PyTorch keeps freed CUDA blocks in its cache until explicitly told to
# hand them back to the driver.
import gc

del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()