Imports

In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, EsmModel

In [2]:
from proteinclip.fasta_utils import read_fasta

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [5]:
!pwd

/storage/ice1/2/4/rmehta98/cse7850-compbio-project


In [3]:
fasta_data = read_fasta("./data/uniprot/uniprot_sprot.fasta.gz")
fasta_data[list(fasta_data.keys())[0]]

INFO:root:Read 572970 sequences from ./data/uniprot/uniprot_sprot.fasta.gz


'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL'

In [32]:
import pandas as pd
df_dict = {"id": [], "sequence": []}
for k, v in fasta_data.items():
    # print(k.split(" "))
    _id = k.split(" ")[0].split("|")[1]
    df_dict["id"].append(_id)
    df_dict["sequence"].append(str(v))
df = pd.DataFrame(df_dict)
df["id"] = df["id"].astype(dtype="string")
df["sequence"] = df["sequence"].astype(dtype="string")
df.dtypes

id          string[python]
sequence    string[python]
dtype: object

In [33]:
max_length = df["sequence"].str.len().max()
print("Maximum sequence length:", max_length)


Maximum sequence length: 35213


In [6]:
from tqdm.auto import tqdm
import torch
import pandas as pd
from transformers import AutoTokenizer
from transformers.models.esm.modeling_esm import EsmModel

# Load ESM model and tokenizer
model_name = "facebook/esmfold_v1"  # Adjust if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EsmModel.from_pretrained(model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight', 'esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 2560, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(1026, 2560, padding_idx=1)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-35): 36 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=2560, out_features=2560, bias=True)
            (key): Linear(in_features=2560, out_features=2560, bias=True)
            (value): Linear(in_features=2560, out_features=2560, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=2560, out_features=2560, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmInter

In [34]:
sample = df.iloc[:32]
sample_sequences = sample.sequence.to_list()
inputs = tokenizer(
        sample_sequences,
        return_tensors="pt",
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=max_length  # Ensure max_length is defined appropriately
    ).to(device)
# Compute embeddings (disable gradient computations for efficiency)
with torch.no_grad():
    outputs = model(**inputs)
    # Mean pooling over the sequence length dimension for each sequence in the batch
    embeddings = outputs.last_hidden_state.mean(dim=1)
del embeddings
del inputs

In [35]:
import torch
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()


In [37]:


# Define a batch size (adjust based on your available resources)
batch_size = 32

# Initialize list to store embeddings
embeddings_dict = {"protein_id": [], "embedding": []}

# Loop over the DataFrame in batches
for i in tqdm(range(0, len(df), batch_size), total=(len(df) // batch_size) + 1):
    # Select the batch from the DataFrame
    batch_df = df.iloc[i : i + batch_size]
    protein_ids = batch_df["id"].tolist()
    sequences = batch_df["sequence"].tolist()

    # Tokenize the batch of sequences. Here we use:
    # - padding=True: Pads to the longest sequence in the batch.
    # - truncation=True: Ensures that sequences longer than max_length are truncated.
    # - max_length: Specifies the maximum length to pad/truncate to.
    for n in sequences:
        if n is None:
            print(f"{n} -> {type(n)}")
            raise ValueError("Found a None value in sequences!")
    print(sequences, max_length)
    inputs = tokenizer(
        sequences,
        return_tensors="pt",
        add_special_tokens=True,
        padding=True,
        truncation=True,
    ).to(device)

    # Optionally, if you have a GPU, move your inputs and model to CUDA:
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # model.to(device)
    # inputs = {k: v.to(device) for k, v in inputs.items()}

    # Compute embeddings (disable gradient computations for efficiency)
    with torch.no_grad():
        outputs = model(**inputs)
        # Mean pooling over the sequence length dimension for each sequence in the batch
        embeddings = outputs.last_hidden_state.mean(dim=1)

    # Convert the batch embeddings to list and pair with corresponding protein IDs
    embedding_vectors = embeddings.tolist()
    embeddings_dict["protein_id"].extend(protein_ids)
    embeddings_dict["embedding"].extend(embedding_vectors)

embedding_df = pd.DataFrame(embeddings)

# Save the DataFrame as a Parquet file
embedding_df.to_parquet("data/structural/esmfold_embeddings.parquet", index=False)
print("Embeddings saved to data/structural/esmfold_embeddings.parquet")


  0%|          | 0/17906 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL', 'MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCARIKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSLAERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADCKCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNMLDDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRKVMFFVAGAVLVAILISTVRW', 'MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPELQTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAITFEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDDLEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFETYGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMYSTILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSSGEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFK

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).