# Version 2 Workspace

In [None]:
!pip install --upgrade py3Dmol accelerate

In [1]:
from transformers import AutoTokenizer, EsmForProteinFolding

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

model = model.cuda()

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import torch

torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
test_protein = "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF"

tokenized_input = tokenizer([test_protein], return_tensors="pt", add_special_tokens=False)['input_ids']
tokenized_input = tokenized_input.cuda()

with torch.no_grad():
    output = model(tokenized_input)


# Real Structual Embedding Generations

## Load Dataset

In [1]:
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()


In [2]:
import pandas as pd
df = pd.read_parquet("./data/dataset/sequence_and_embeddings.parquet")
df.head()

Unnamed: 0,organism,organism_id,name,evidence,function,id,embedding,sequence
0,Homo sapiens (Human),9606,EPHA7,1,Receptor tyrosine kinase which binds promiscuo...,Q15375,"[-0.015253728, 0.016237658, -0.016555615, 0.02...",MVFQTRYPSWIILCYIWLLRFAHTGEAQAAKEVLLLDSKAQQTELE...
1,Homo sapiens (Human),9606,ANXA8,1,This protein is an anticoagulant protein that ...,P13928,"[-0.008352073, 0.00474287, 0.006541474, -0.002...",MAWWKSWIEQEGVTVKSSSHFNPDPDAETLYKAMKGIGTNEQAIID...
2,Homo sapiens (Human),9606,DPY19L2P1,2,Probable C-mannosyltransferase that mediates C...,Q6NXN4,"[-0.00039709447, -0.02393247, -0.014100584, 0....",MKKQGVNPKPLQSSRPSPSKRPYGASPARELEVEKSALGGGKLPGG...
3,Homo sapiens (Human),9606,NR1D1,1,Transcriptional repressor which coordinates ci...,P20393,"[0.008172105, -0.0116752, -0.016805198, -0.005...",MTTLDSNNNTGGVITYIGSSGSSPSRTSPESLYSDNSNGSFQSLTQ...
4,Homo sapiens (Human),9606,SLC15A2,1,Proton-coupled amino-acid transporter that tra...,Q16348,"[0.0031013805, -0.0019497981, -0.003831747, 0....",MNPFQKNESKETLFSPVSIEEVPPRPPSPPKKPSPTICGSNYPLSI...


In [3]:
from transformers import AutoTokenizer, EsmForProteinFolding

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)

model = model.cuda()

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
print(torch.cuda.memory_summary())


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  13601 MiB |  13601 MiB |  13601 MiB |      0 B   |
|       from large pool |  13469 MiB |  13469 MiB |  13469 MiB |      0 B   |
|       from small pool |    132 MiB |    132 MiB |    132 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |  13601 MiB |  13601 MiB |  13601 MiB |      0 B   |
|       from large pool |  13469 MiB |  13469 MiB |  13469 MiB |      0 B   |
|       from small pool |    132 MiB |    132 MiB |    132 MiB |      0 B   |
|---------------------------------------------------------------

In [5]:
df.head()

Unnamed: 0,organism,organism_id,name,evidence,function,id,embedding,sequence
0,Homo sapiens (Human),9606,EPHA7,1,Receptor tyrosine kinase which binds promiscuo...,Q15375,"[-0.015253728, 0.016237658, -0.016555615, 0.02...",MVFQTRYPSWIILCYIWLLRFAHTGEAQAAKEVLLLDSKAQQTELE...
1,Homo sapiens (Human),9606,ANXA8,1,This protein is an anticoagulant protein that ...,P13928,"[-0.008352073, 0.00474287, 0.006541474, -0.002...",MAWWKSWIEQEGVTVKSSSHFNPDPDAETLYKAMKGIGTNEQAIID...
2,Homo sapiens (Human),9606,DPY19L2P1,2,Probable C-mannosyltransferase that mediates C...,Q6NXN4,"[-0.00039709447, -0.02393247, -0.014100584, 0....",MKKQGVNPKPLQSSRPSPSKRPYGASPARELEVEKSALGGGKLPGG...
3,Homo sapiens (Human),9606,NR1D1,1,Transcriptional repressor which coordinates ci...,P20393,"[0.008172105, -0.0116752, -0.016805198, -0.005...",MTTLDSNNNTGGVITYIGSSGSSPSRTSPESLYSDNSNGSFQSLTQ...
4,Homo sapiens (Human),9606,SLC15A2,1,Proton-coupled amino-acid transporter that tra...,Q16348,"[0.0031013805, -0.0019497981, -0.003831747, 0....",MNPFQKNESKETLFSPVSIEEVPPRPPSPPKKPSPTICGSNYPLSI...


In [6]:
import torch
import pandas as pd
from tqdm.auto import tqdm

# Define a desired batch size (modify as needed)
batch_size = 4

# Initialize your output dictionary with metadata keys;
# keys from model output will be added on first iteration.
output_dict = {"protein_id": [], "sequence": [], "pdb": []}

with torch.no_grad():
    # Process the DataFrame in batches
    for i in tqdm(range(0, len(df), batch_size)):
        batch_df = df.iloc[i:i+batch_size]
        
        # Prepare the list of sequences in the current batch
        sequences = batch_df["sequence"].tolist()
        
        # Tokenize the batch of sequences and move the tensor to GPU
        tokenized_input = tokenizer(
            sequences, 
            return_tensors="pt", 
            add_special_tokens=False,
            padding=True,
            truncation=True,
            max_length=1024
        )['input_ids'].cuda()

        # Run the model on the batched input
        output = model(tokenized_input)

        # Initialize any additional keys from the model output during the first batch
        if i == 0:
            for k in output.keys():
                output_dict[k] = []

        # Loop over each key in the model output and process each sample in the batch
        for k, value in output.items():
            # Convert the output tensor to a NumPy array
            output_np = value.cpu().numpy()
            # Append each sample separately to keep alignment with metadata
            for sample_idx in range(output_np.shape[0]):
                output_dict[k].append(output_np[sample_idx])

        # Process protein ids and sequences from the batch
        output_dict["protein_id"].extend(batch_df["id"].tolist())
        output_dict["sequence"].extend(sequences)

        # For each sample in the batch, generate the corresponding pdb string.
        # Here we process them one by one to ensure compatibility with output_to_pdb.
        for sample_idx in range(tokenized_input.size(0)):
            # Prepare a dictionary with the output slice for this specific sample.
            single_sample_output = {k: value[sample_idx:sample_idx+1] for k, value in output.items()}
            pdb_str = model.output_to_pdb(single_sample_output)
            output_dict["pdb"].append(pdb_str)

# Create a DataFrame from the completed dictionary.
output_df = pd.DataFrame(output_dict)


  0%|          | 0/2500 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 59.25 GiB. GPU 0 has a total capacity of 79.22 GiB of which 44.74 GiB is free. Including non-PyTorch memory, this process has 34.46 GiB memory in use. Of the allocated memory 33.53 GiB is allocated by PyTorch, and 276.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
output_df.head()

Unnamed: 0,protein_id,sequence,pdb,frames,sidechain_frames,unnormalized_angles,angles,positions,states,s_s,...,residx_atom37_to_atom14,atom37_atom_exists,residue_index,lddt_head,plddt,ptm_logits,ptm,aligned_confidence_probs,predicted_aligned_error,max_predicted_aligned_error
0,1,MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIV...,[PARENT N/A\nATOM 1 N MET A 1 ...,[[[[ 0.53268844 0.42589465 0.6979253 -0.218...,[[[[[[-6.9713593e-02 8.2730621e-01 5.5740875...,[[[[[-0.72785556 -0.68563384]\n [-0.7659888 -...,[[[[[-0.7279038 -0.68567926]\n [-0.766075 -...,[[[[[ 5.1296043 19.622414 69.8106 ]\n [ 3.9...,[[[[ 2.22066760e-01 -2.29268894e-01 -4.8433545...,"[[[-36.013134, -5.151828, 50.13264, -20.710196...",...,"[[[0, 1, 2, 4, 3, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",[[[[[ -3.5443087 -1.6201594 12.064354 .....,"[[[0.8376053, 0.86606765, 0.85308105, 0.789050...",[[[[31.894642 26.905233 21.444742 18.208603...,0.94865006,[[[[9.9319035e-01 6.7633195e-03 2.8754030e-05 ...,"[[[0.25346506, 0.92572325, 2.2531812, 2.304195...",31.75


In [48]:
import numpy as np
np.save("structural_test.npy", output_df.to_numpy())

## Loading

In [None]:
output_df_loaded = np.load("structural_test.npy", allow_pickle=True)
output_df_loaded

In [52]:
output_dict_loaded = {k : [] for k in ['protein_id', 'sequence', 'pdb', 'frames', 'sidechain_frames',
       'unnormalized_angles', 'angles', 'positions', 'states', 's_s', 's_z',
       'distogram_logits', 'lm_logits', 'aatype', 'atom14_atom_exists',
       'residx_atom14_to_atom37', 'residx_atom37_to_atom14',
       'atom37_atom_exists', 'residue_index', 'lddt_head', 'plddt',
       'ptm_logits', 'ptm', 'aligned_confidence_probs',
       'predicted_aligned_error', 'max_predicted_aligned_error']}

In [43]:
output_df.to_json("structural_test.json")

TypeError: array(0.94865006, dtype=float32) (numpy-scalar) is not JSON serializable at the moment

In [3]:
print(model)

EsmForProteinFolding(
  (esm): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(33, 2560, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1026, 2560, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-35): 36 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=2560, out_features=2560, bias=True)
              (key): Linear(in_features=2560, out_features=2560, bias=True)
              (value): Linear(in_features=2560, out_features=2560, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
              (rotary_embeddings): RotaryEmbedding()
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=2560, out_features=2560, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((2560,), eps=1