# Version 2 Workspace

In [None]:
!pip install --upgrade py3Dmol accelerate

In [None]:
from transformers import AutoTokenizer, EsmForProteinFolding

# Tokenizer that matches the ESMFold checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")

# Load the folding model and move it onto the GPU in a single expression.
# low_cpu_mem_usage=True is forwarded to from_pretrained (see the transformers docs for its effect on host RAM).
model = EsmForProteinFolding.from_pretrained(
    "facebook/esmfold_v1",
    low_cpu_mem_usage=True,
).cuda()

In [None]:
import torch

# Allow TF32 for CUDA matrix multiplications.
# NOTE(review): presumably this trades a little float32 precision for speed on GPUs that support TF32 —
# confirm the precision loss is acceptable for structure prediction.
torch.backends.cuda.matmul.allow_tf32 = True

In [None]:
test_protein = "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF"

# Tokenize the single test sequence (no special tokens added) and put the ids on the GPU.
tokenized_input = tokenizer(
    [test_protein],
    return_tensors="pt",
    add_special_tokens=False,
)['input_ids'].cuda()

# Inference only: run the forward pass with autograd switched off.
with torch.no_grad():
    output = model(tokenized_input)


# Real Structural Embedding Generation

## Load Dataset

In [1]:
import torch
# Hand cached, unused GPU memory back to the driver before the model is loaded below.
torch.cuda.empty_cache()
# NOTE(review): ipc_collect() presumably only matters when tensors were shared via CUDA IPC — confirm it is needed here.
torch.cuda.ipc_collect()


In [2]:
import pandas as pd
# Load the protein table. The preview below shows the columns:
# organism, organism_id, name, evidence, function, id, embedding, sequence.
df = pd.read_parquet("./data/dataset/sequence_and_embeddings.parquet")
df.head()

Unnamed: 0,organism,organism_id,name,evidence,function,id,embedding,sequence
0,Homo sapiens (Human),9606,EPHA7,1,Receptor tyrosine kinase which binds promiscuo...,Q15375,"[-0.015253728, 0.016237658, -0.016555615, 0.02...",MVFQTRYPSWIILCYIWLLRFAHTGEAQAAKEVLLLDSKAQQTELE...
1,Homo sapiens (Human),9606,ANXA8,1,This protein is an anticoagulant protein that ...,P13928,"[-0.008352073, 0.00474287, 0.006541474, -0.002...",MAWWKSWIEQEGVTVKSSSHFNPDPDAETLYKAMKGIGTNEQAIID...
2,Homo sapiens (Human),9606,DPY19L2P1,2,Probable C-mannosyltransferase that mediates C...,Q6NXN4,"[-0.00039709447, -0.02393247, -0.014100584, 0....",MKKQGVNPKPLQSSRPSPSKRPYGASPARELEVEKSALGGGKLPGG...
3,Homo sapiens (Human),9606,NR1D1,1,Transcriptional repressor which coordinates ci...,P20393,"[0.008172105, -0.0116752, -0.016805198, -0.005...",MTTLDSNNNTGGVITYIGSSGSSPSRTSPESLYSDNSNGSFQSLTQ...
4,Homo sapiens (Human),9606,SLC15A2,1,Proton-coupled amino-acid transporter that tra...,Q16348,"[0.0031013805, -0.0019497981, -0.003831747, 0....",MNPFQKNESKETLFSPVSIEEVPPRPPSPPKKPSPTICGSNYPLSI...


In [3]:
from transformers import AutoTokenizer, EsmForProteinFolding

# Fetch the tokenizer and the folding model from the "facebook/esmfold_v1" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
# NOTE(review): the load log below reports esm.contact_head.regression.* as newly initialized.
# Presumably harmless, since no cell in this notebook reads contact predictions — confirm.
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")

# Move the model to the GPU; the later cells send their inputs there with .cuda().
model = model.cuda()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print PyTorch's CUDA memory report; in notebook order this runs right after the model is moved to the GPU.
print(torch.cuda.memory_summary())


In [None]:
# (rows, columns) of the protein table; the row count bounds the END_IDX values used below.
df.shape

## Single Batch

In [None]:
import torch
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

# Run ESMFold on rows [START_IDX, END_IDX) of `df`, one sequence per forward pass.
# Every model output is kept as a NumPy array next to the protein id, sequence and PDB export.
# NOTE(review): all output tensors of every protein stay in host memory, so RAM use grows with
# each iteration — confirm this is feasible for the chosen range.
output_dict = {"protein_id": [], "sequence": [], "pdb": []}

START_IDX = 0
END_IDX = 1000 # len(df)

# Never index past the end of the table, even when END_IDX is larger than len(df).
stop_idx = min(END_IDX, len(df))
checkpoint_path = f"structural_test_{START_IDX}_{END_IDX}.npy"

with torch.no_grad():
    for i in tqdm(range(START_IDX, stop_idx)):
        row = df.iloc[i]

        tokenized_input = tokenizer([row["sequence"]], return_tensors="pt", add_special_tokens=False)['input_ids']
        tokenized_input = tokenized_input.cuda()

        output = model(tokenized_input)

        # One list per model output key, created the first time the key is seen.
        for k in output.keys():
            output_dict.setdefault(k, []).append(output[k].cpu().numpy())

        output_dict["protein_id"].append(row["id"])
        output_dict["sequence"].append(row["sequence"])
        output_dict["pdb"].append(model.output_to_pdb(output))

        # Checkpoint every 50 rows and once more after the final row. The previous
        # `i % 50 == 0` test never fired at the end, so the tail of the run was never written.
        if (i + 1) % 50 == 0 or i == stop_idx - 1:
            # Rebuild the frame from the accumulated lists and overwrite the checkpoint file.
            output_df = pd.DataFrame(output_dict)
            np.save(checkpoint_path, output_df.to_numpy())

# S_S Embeddings

In [8]:
import torch
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

# Accumulators for the pooled embeddings; one entry is appended to each list per embedded protein.
output_dict = dict(protein_id=[], sequence=[], embedding=[])

# Half-open range of `df` rows to process.
START_IDX = 0
END_IDX = 2000  # len(df)

# Residue letters the validator accepts, and the longest sequence it lets through.
VALID_AAS = set("ACDEFGHIKLMNPQRSTVWY")
MAX_LEN = 1024

def is_valid_sequence(seq):
    """Return True when `seq` is a non-empty str of at most MAX_LEN characters drawn only from VALID_AAS.

    Anything that is not a str (None, NaN, bytes, ...) is rejected, as are empty
    strings, over-long strings and strings containing any other character.
    """
    if not isinstance(seq, str):
        return False
    if not seq or len(seq) > MAX_LEN:
        return False
    return VALID_AAS.issuperset(seq)

# Embed every valid row in [START_IDX, END_IDX): run ESMFold, take its `s_s` output and
# mean-pool it over the residue axis into one vector per protein.
checkpoint_path = f"structural_embeddings_{START_IDX}_{END_IDX}.pkl"

with torch.no_grad():
    for i in tqdm(range(START_IDX, END_IDX)):
        seq = df.iloc[i]["sequence"]

        # Invalid sequences are skipped, but the checkpoint below must still run for this index.
        if is_valid_sequence(seq):
            tokenized_input = tokenizer([seq], return_tensors="pt", add_special_tokens=False)['input_ids']
            tokenized_input = tokenized_input.cuda()

            output = model(tokenized_input)

            hidden = output.s_s  # Shape: (1, L, D)

            # Mean pooling over residues
            mean_pooled = hidden.mean(dim=1).squeeze(0).cpu().numpy()

            # Store result
            output_dict["protein_id"].append(df.iloc[i]["id"])
            output_dict["sequence"].append(seq)
            output_dict["embedding"].append(mean_pooled)

        # Periodically save, keyed on the row index whether or not the row was valid.
        # Previously the `continue` for invalid rows jumped over this block, so an invalid
        # row at a checkpoint index (or at the very last index) silently skipped the save.
        if (i + 1) % 50 == 0 or i == END_IDX - 1:
            df_out = pd.DataFrame(output_dict)
            df_out.to_pickle(checkpoint_path)


 19%|█▉        | 375/2000 [52:34<4:08:59,  9.19s/it] 

In [4]:
# Reload a saved embedding checkpoint for inspection.
# NOTE(review): the embedding cell above writes structural_embeddings_{START_IDX}_{END_IDX}.pkl with
# END_IDX = 2000, so this 0_1000 file presumably comes from an earlier run — confirm it exists.
# read_pickle executes pickle data; only open files you produced yourself.
sample = pd.read_pickle('structural_embeddings_0_1000.pkl')

In [7]:
# Shape of the first stored embedding; the recorded output below is (1024,).
sample['embedding'][0].shape

(1024,)

In [None]:
# Every sequence in the table as a plain Python list.
sequences = df['sequence'].tolist()

In [None]:
# Length of the shortest sequence in the table.
x = min(len(seq) for seq in sequences)
x

In [None]:
# Sanity-check the validator on the first sequence of the table.
is_valid_sequence(sequences[0])

In [None]:
# NOTE(review): get_structural_embedding is not defined in any cell visible in this notebook —
# presumably it lived in a since-deleted cell; on a fresh kernel this would raise NameError. Confirm.
get_structural_embedding(sequences[0])

In [None]:
!export CUDA_LAUNCH_BLOCKING=1
embeddings = get_structural_embeddings(sequences, batch_size=4)

## BATCHED IMPL

In [None]:
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

START_IDX = 0
END_IDX = 1000 # len(df)

model.eval()
# Define a desired batch size (modify as needed)
batch_size = 1

# Checkpoint file; with the values above this is "structural_test_0_1000.npy".
checkpoint_path = f"structural_test_{START_IDX}_{END_IDX}.npy"

# Output keys whose first axis is NOT the batch axis.
# NOTE(review): in the transformers ESMFold implementation these appear to be stacked over
# structure-module layers with the batch on axis 1 — confirm against the installed version.
# _split_per_sample raises if a shape does not match this assumption.
LAYER_STACKED_KEYS = frozenset({
    "frames",
    "sidechain_frames",
    "unnormalized_angles",
    "angles",
    "positions",
    "states",
    "lddt_head",
})

# Initialize your output dictionary with metadata keys;
# keys from model output are added the first time they are seen.
output_dict = {"protein_id": [], "sequence": [], "pdb": []}


def _split_per_sample(key, array, n_samples):
    """Split one batched model output into a list with exactly `n_samples` entries.

    0-d arrays (batch-level scalars) are repeated for each sample so that every
    column of `output_dict` keeps the same length. Other arrays are sliced along
    their batch axis: axis 1 for LAYER_STACKED_KEYS, axis 0 otherwise.

    Raises:
        ValueError: if the expected batch axis is missing or its size is not `n_samples`.
    """
    if array.ndim == 0:
        return [array] * n_samples

    batch_axis = 1 if key in LAYER_STACKED_KEYS else 0
    if array.ndim <= batch_axis or array.shape[batch_axis] != n_samples:
        raise ValueError(
            f"output {key!r} has shape {array.shape}; expected {n_samples} samples on axis {batch_axis}"
        )
    if batch_axis == 1:
        return [array[:, idx] for idx in range(n_samples)]
    return [array[idx] for idx in range(n_samples)]


def _save_checkpoint():
    """Build a DataFrame from `output_dict`, write it to `checkpoint_path`, and return it."""
    frame = pd.DataFrame(output_dict)
    np.save(checkpoint_path, frame.to_numpy())
    return frame


with torch.no_grad():
    # Process the DataFrame in batches
    for i in tqdm(range(START_IDX, END_IDX, batch_size)):
        # Clamp the slice so the last batch never reaches past END_IDX.
        batch_df = df.iloc[i:min(i + batch_size, END_IDX)]
        if batch_df.empty:
            break  # END_IDX lies beyond the end of df

        # Prepare the list of sequences in the current batch
        sequences = batch_df["sequence"].tolist()

        # Tokenize the batch of sequences.
        # NOTE(review): truncation=True cuts sequences to max_length tokens, while the untruncated
        # sequence is stored below — confirm this mismatch is acceptable for long proteins.
        encoded = tokenizer(
            sequences,
            return_tensors="pt",
            add_special_tokens=False,
            padding=True,
            truncation=True,
            max_length=1024,
        )
        tokenized_input = encoded["input_ids"].cuda()
        # With padding=True the mask is what marks the padded positions, so it is passed
        # to the model as well. For batch_size == 1 it is all ones.
        attention_mask = encoded["attention_mask"].cuda()

        # Run the model on the batched input
        output = model(tokenized_input, attention_mask=attention_mask)
        n_samples = tokenized_input.size(0)

        # Store every model output per sample to keep alignment with the metadata columns.
        # NOTE(review): per-sample arrays still contain the padded positions when batch_size > 1.
        for k, value in output.items():
            output_dict.setdefault(k, []).extend(
                _split_per_sample(k, value.cpu().numpy(), n_samples)
            )

        # Process protein ids and sequences from the batch
        output_dict["protein_id"].extend(batch_df["id"].tolist())
        output_dict["sequence"].extend(sequences)

        # NOTE(review): output_to_pdb is expected to return one PDB string per sequence in the
        # batch (see the transformers EsmForProteinFolding docs); the check below enforces that.
        pdb_strings = model.output_to_pdb(output)
        if len(pdb_strings) != n_samples:
            raise ValueError(
                f"output_to_pdb returned {len(pdb_strings)} entries for a batch of {n_samples}"
            )
        output_dict["pdb"].extend(pdb_strings)

        if i % 50 == 0:
            # Create a DataFrame from the rows collected so far and checkpoint it.
            output_df = _save_checkpoint()

# Final save so that rows processed after the last periodic checkpoint are not lost.
output_df = _save_checkpoint()


In [None]:
# Debug leftover: shows the last value of the loop variable `k` (a model output key) left by the extraction loops above.
k

In [None]:
import numpy as np
# Write the results frame to disk as a NumPy array file.
# NOTE(review): output_df is only created by the extraction cells above, so on a fresh kernel
# this fails unless one of them has run first.
np.save("structural_test.npy", output_df.to_numpy())

## Loading

In [None]:
# allow_pickle=True lets np.load unpickle Python objects stored in the file — only load files you trust.
# NOTE(review): the file was written from output_df.to_numpy(), so what comes back is a bare array
# without the DataFrame column names.
output_df_loaded = np.load("structural_test.npy", allow_pickle=True)
output_df_loaded

In [None]:
# Empty column store keyed by the saved column names, in order.
# NOTE(review): presumably the three metadata fields followed by the model output keys — confirm against output_df.columns.
# Each key gets its own fresh list (never a shared one).
output_dict_loaded = dict(
    (column_name, [])
    for column_name in (
        'protein_id', 'sequence', 'pdb',
        'frames', 'sidechain_frames', 'unnormalized_angles', 'angles',
        'positions', 'states', 's_s', 's_z',
        'distogram_logits', 'lm_logits', 'aatype',
        'atom14_atom_exists', 'residx_atom14_to_atom37', 'residx_atom37_to_atom14',
        'atom37_atom_exists', 'residue_index',
        'lddt_head', 'plddt', 'ptm_logits', 'ptm',
        'aligned_confidence_probs', 'predicted_aligned_error',
        'max_predicted_aligned_error',
    )
)

In [None]:
# Also export the results frame as JSON.
# NOTE(review): depends on output_df from the extraction cells above — confirm it exists before running.
output_df.to_json("structural_test.json")

In [None]:
# Print the module tree of the loaded model.
print(model)