In [None]:
!rm -rf tmp # cleaning up is necessary because the decoder will skip already existing files
!mkdir -p tmp
%cd tmp

In [None]:
# Get the sequence for structure prediction
from modelgenerator.structure_tokenizer.datasets.protein import Protein
from modelgenerator.structure_tokenizer.utils.constants import residue_constants as RC

# 6vxx A chain
pdb_id, chain_id = '6vxx', 'A'
!wget -qnc https://files.rcsb.org/download/{pdb_id}.pdb

aatype_tensor = Protein.from_pdb_file_path(f'{pdb_id}.pdb', chain_id).aatype
seq = "".join(list(RC.restype_1to3)[i] for i in aatype_tensor)
seq

In [None]:
# Write the query sequence as a one-row CSV (columns: idx, aa_seq, seq_len).
# The file name tmp.csv is the one passed to `mgen predict` further down.
import pandas as pd

record = {"idx": 0, "aa_seq": seq, "seq_len": len(seq)}
df = pd.DataFrame([record])
df.to_csv("tmp.csv", index=False)

In [None]:
# language model: amino acid sequence -> structure tokens
!WANDB_MODE=dryrun mgen predict --config ../protein2structoken_16b.yaml \
            --data.init_args.path "csv" \
            --data.init_args.test_split_files ["tmp.csv"]

In [None]:
# post process
!python ../struct_token_format_conversion.py logs/protein2structoken_16b/predict_predictions.tsv logs/protein2structoken_16b/predict_predictions.pt
!python ../extract_structure_tokenizer_codebook.py --output_path logs/protein2structoken_16b/codebook.pt

In [None]:
# Decode: structure tokens -> 3D coordinates
!WANDB_MODE=dryrun CUDA_VISIBLE_DEVICES=0 mgen predict --config ../decode.yaml \
 --data.init_args.config.struct_tokens_datasets_configs.name=protein2structoken_16b \
 --data.init_args.config.struct_tokens_datasets_configs.struct_tokens_path=./logs/protein2structoken_16b/predict_predictions.pt \
 --data.init_args.config.struct_tokens_datasets_configs.codebook_path=./logs/protein2structoken_16b/codebook.pt

In [None]:
# install visualization tool
!pip install py3Dmol -q
import py3Dmol

In [None]:
# visualize the prediction and the ground truth
def visualize(file):
    """Show a local PDB file as a spectrum-coloured cartoon in a py3Dmol viewer.

    Args:
        file: Path of the PDB file to read and display.
    """
    # No `query` argument: the structure is loaded from the local file below,
    # so the viewer should not start a download query.
    view = py3Dmol.view()
    with open(file, 'r') as f:
        view.addModel(f.read(), 'pdb')
    view.setStyle({'cartoon': {'color': 'spectrum'}})
    view.zoomTo()
    view.show()

# Paths of the two structures to compare (relative to the current working directory).
prediction = "logs/protstruct_decode/protein2structoken_16b_pdb_files/0__output.pdb"
ground_truth = f"{pdb_id}_{chain_id}.pdb"
# drop the additional chain in the ground truth before visualization:
# re-read the downloaded file for the selected chain only and write it out.
Protein.from_pdb_file_path(f'{pdb_id}.pdb', chain_id).to_pdb(ground_truth)

print("Ground truth:")
visualize(ground_truth)
print("Prediction:")
visualize(prediction)