# Version 2 Workspace

In [None]:
!pip install --upgrade py3Dmol accelerate

In [1]:
from transformers import AutoTokenizer, EsmForProteinFolding

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

model = model.cuda()

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import torch

torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
test_protein = "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF"

tokenized_input = tokenizer([test_protein], return_tensors="pt", add_special_tokens=False)['input_ids']
tokenized_input = tokenized_input.cuda()

with torch.no_grad():
    output = model(tokenized_input)


# Real Structual Embedding Generations

## Load Dataset

In [1]:
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()


In [2]:
from proteinclip.fasta_utils import read_fasta
fasta_data_raw = read_fasta("./data/uniprot/uniprot_sprot.fasta.gz")
fasta_data_raw[list(fasta_data_raw.keys())[0]]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
INFO:root:Read 572970 sequences from ./data/uniprot/uniprot_sprot.fasta.gz


'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL'

In [3]:
# filter out excessive length
# officially doesn't support sequences with > 1024 residuals

fasta_data = {k: v for k, v in fasta_data_raw.items() if len(v) <= 1024}
print(f"Eliminating: {100*(1 - len(fasta_data) / len(fasta_data_raw)) : .2f}% of dataset for excessive length")

Eliminating:  3.17% of dataset for excessive length


In [4]:
protein_ids = list(fasta_data.keys())
sequences = list(fasta_data.values())

In [5]:
from transformers import AutoTokenizer, EsmForProteinFolding

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

model = model.cuda()

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
print(torch.cuda.memory_summary())


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16690 MiB |  16690 MiB |  16690 MiB |    512 B   |
|       from large pool |  16686 MiB |  16686 MiB |  16686 MiB |      0 B   |
|       from small pool |      4 MiB |      4 MiB |      4 MiB |    512 B   |
|---------------------------------------------------------------------------|
| Active memory         |  16690 MiB |  16690 MiB |  16690 MiB |    512 B   |
|       from large pool |  16686 MiB |  16686 MiB |  16686 MiB |      0 B   |
|       from small pool |      4 MiB |      4 MiB |      4 MiB |    512 B   |
|---------------------------------------------------------------

In [6]:

from tqdm.auto import tqdm 
# Define a batch size (adjust based on your available resources)
batch_size = 10

# Initialize list to store embeddings
embeddings_dict = {"protein_id": [], "embedding": []}

for i in tqdm(range(0, len(sequences), batch_size), total=(len(sequences) // batch_size) + 1):
    batch_protein_ids = protein_ids[i:i+batch_size]
    batch_sequences = sequences[i:i+batch_size]

    tokenized_input = tokenizer(batch_sequences, return_tensors="pt", add_special_tokens=False, truncation=True, max_length=1024, padding=True)['input_ids']
    tokenized_input = tokenized_input.cuda()

    with torch.no_grad():
        output = model(tokenized_input)


    # Convert the batch embeddings to list and pair with corresponding protein IDs
    embedding_vectors = output.tolist()
    embeddings_dict["protein_id"].extend(protein_ids)
    embeddings_dict["embedding"].extend(embedding_vectors)

embedding_df = pd.DataFrame(embeddings)

# Save the DataFrame as a Parquet file
embedding_df.to_parquet("data/structural/esmfold_embeddings.parquet", index=False)
print("Embeddings saved to data/structural/esmfold_embeddings.parquet")


  0%|          | 0/55484 [00:00<?, ?it/s]

AttributeError: 'EsmForProteinFoldingOutput' object has no attribute 'tolist'

In [6]:
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

def convert_outputs_to_pdb(outputs):
    final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    outputs = {k: v.to("cpu").numpy() for k, v in outputs.items()}
    final_atom_positions = final_atom_positions.cpu().numpy()
    final_atom_mask = outputs["atom37_atom_exists"]
    pdbs = []
    for i in range(outputs["aatype"].shape[0]):
        aa = outputs["aatype"][i]
        pred_pos = final_atom_positions[i]
        mask = final_atom_mask[i]
        resid = outputs["residue_index"][i] + 1
        pred = OFProtein(
            aatype=aa,
            atom_positions=pred_pos,
            atom_mask=mask,
            residue_index=resid,
            b_factors=outputs["plddt"][i],
            chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None,
        )
        pdbs.append(to_pdb(pred))
    return pdbs

In [6]:
output

NameError: name 'output' is not defined

In [36]:
import torch
import pandas as pd

test_protein = "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF"
protein_id = "1"
output_dict = {"protein_id": [], "sequence": [], "pdb": []}


tokenized_input = tokenizer([test_protein], return_tensors="pt", add_special_tokens=False)['input_ids']
tokenized_input = tokenized_input.cuda()

with torch.no_grad():
    output = model(tokenized_input)
    if len(output_dict.keys()) < 4:
        for k in output.keys():
            output_dict[k] = []

    for k in output.keys():
        output_dict[k].append(output[k].cpu().numpy())

    output_dict["protein_id"].append(protein_id)
    output_dict["sequence"].append(test_protein)
    output_dict["pdb"].append(model.output_to_pdb(output))

output_df = pd.DataFrame(output_dict)

In [48]:
import numpy as np
np.save("structural_test.npy", output_df.to_numpy())

## Loading

In [None]:
output_df_loaded = np.load("structural_test.npy", allow_pickle=True)
output_df_loaded

In [52]:
output_dict_loaded = {k : [] for k in ['protein_id', 'sequence', 'pdb', 'frames', 'sidechain_frames',
       'unnormalized_angles', 'angles', 'positions', 'states', 's_s', 's_z',
       'distogram_logits', 'lm_logits', 'aatype', 'atom14_atom_exists',
       'residx_atom14_to_atom37', 'residx_atom37_to_atom14',
       'atom37_atom_exists', 'residue_index', 'lddt_head', 'plddt',
       'ptm_logits', 'ptm', 'aligned_confidence_probs',
       'predicted_aligned_error', 'max_predicted_aligned_error']}

In [43]:
output_df.to_json("structural_test.json")

TypeError: array(0.94865006, dtype=float32) (numpy-scalar) is not JSON serializable at the moment

In [39]:
print(model)

EsmForProteinFolding(
  (esm): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(33, 2560, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1026, 2560, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-35): 36 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=2560, out_features=2560, bias=True)
              (key): Linear(in_features=2560, out_features=2560, bias=True)
              (value): Linear(in_features=2560, out_features=2560, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
              (rotary_embeddings): RotaryEmbedding()
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=2560, out_features=2560, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((2560,), eps=1