In [1]:
!python -m vsx.bench.cli version
!python -m vsx.bench.cli paths
!python -m vsx.bench.cli show-config




proteosync 0.1.0
REPO_ROOT=/Users/hike/code/proteosync1
RAW_DIR=/Users/hike/code/proteosync1/artifacts/structures/raw
STD_DIR=/Users/hike/code/proteosync1/artifacts/structures/std
POCKETS_DIR=/Users/hike/code/proteosync1/artifacts/pockets
{
  "project": {
    "name": "proteosync",
    "version": "0.1.0"
  },
  "paths": {
    "artifacts": "artifacts",
    "raw": "artifacts/structures/raw",
    "std": "artifacts/structures/std",
    "pockets": "artifacts/pockets",
    "data": "/Users/hike/code/proteosync1/artifacts/data"
  },
  "run": {
    "log_level": "INFO"
  }
}


In [None]:
# Ensure targets.yaml and proceed to make the AF3 request
from pathlib import Path
import json, yaml
from vsx.utils.paths import REPO_ROOT, DATA_DIR
from vsx.data.targets import load_targets, get_target, fetch_uniprot_fasta

reg_path = REPO_ROOT / "config" / "targets.yaml"
reg_path.parent.mkdir(parents=True, exist_ok=True)

# 1) Ensure the registry exists and has GLP1R with a UniProt ID
data = load_targets()
if "targets" not in data:
    data = {"targets": {}}
glp1r = data["targets"].get("GLP1R", {})
glp1r.setdefault("uniprot", "P43220")   # default UniProt for GLP1R
glp1r.setdefault("sequence", "")        # leave empty -> auto-fetch from UniProt
glp1r.setdefault("ligand_smiles", "")
data["targets"]["GLP1R"] = glp1r
reg_path.write_text(yaml.safe_dump(data, sort_keys=False))
print("registry:", reg_path)

# 2) Load the target and get the sequence (fallback to UniProt if blank)
meta = get_target("GLP1R")
seq = (meta.get("sequence") or fetch_uniprot_fasta(meta["uniprot"]))
print("sequence length:", len(seq))

# 3) Write request.json and tell you where to put AF output
out_dir = DATA_DIR / "GLP1R"
out_dir.mkdir(parents=True, exist_ok=True)
req = {"job_name": "GLP1R_seed", "sequences": [seq], "ligand_smiles": meta.get("ligand_smiles") or None}
(out_dir / "request.json").write_text(json.dumps(req, indent=2))

print("request.json:", out_dir / "request.json")
print("Place AF3 output at:", out_dir / "seed_structure.pdb (or .cif)")


registry: /Users/hike/code/proteosync1/config/targets.yaml
sequence length: 463
request.json: /Users/hike/code/proteosync1/artifacts/data/GLP1R/request.json
Place AF3 output at: /Users/hike/code/proteosync1/artifacts/data/GLP1R/seed_structure.pdb (or .cif)


In [4]:
#Verify the structure & detect chains
from pathlib import Path
from Bio.PDB import MMCIFParser, is_aa
from vsx.utils.paths import DATA_DIR

p = Path(DATA_DIR/"GLP1R"/"seed_structure.cif")
print("exists:", p.exists(), "size:", p.stat().st_size if p.exists() else 0)
assert p.exists(), "seed_structure.cif not found where we expect it"

# parse & summarize chains
s = MMCIFParser(QUIET=True).get_structure("GLP1R", str(p))
chain_lengths = {}
for model in s:
    for ch in model:
        n = sum(1 for r in ch if is_aa(r, standard=False))
        chain_lengths[ch.id] = chain_lengths.get(ch.id, 0) + n

print("chains (aa counts):", chain_lengths)

# crude guess: longest = receptor, any chain <120 aa = peptide candidates
if chain_lengths:
    receptor_chain = max(chain_lengths, key=chain_lengths.get)
    peptide_candidates = [cid for cid, n in chain_lengths.items() if n < 120]
    print("receptor_chain:", receptor_chain)
    print("peptide_candidates:", peptide_candidates)
else:
    print("No chains found?!")


exists: True size: 309936
chains (aa counts): {'A': 300, 'B': 0, 'C': 0, 'D': 0, 'E': 0, 'F': 0, 'G': 0}
receptor_chain: A
peptide_candidates: ['B', 'C', 'D', 'E', 'F', 'G']


In [5]:
from pathlib import Path
from math import inf
from Bio.PDB import MMCIFParser, is_aa
from vsx.utils.paths import DATA_DIR, POCKETS_DIR

cif = Path(DATA_DIR/"GLP1R"/"seed_structure.cif")
s = MMCIFParser(QUIET=True).get_structure("GLP1R", str(cif))

# count residues per chain
chain_res_counts = {}
for m in s:
    for ch in m:
        chain_res_counts[ch.id] = sum(1 for r in ch if is_aa(r, standard=False))

# choose chains
receptor = max(chain_res_counts, key=chain_res_counts.get)
pep = min((cid for cid,n in chain_res_counts.items() if n>0), key=lambda c: chain_res_counts[c])

print("receptor:", receptor, "len", chain_res_counts[receptor], "| peptide:", pep, "len", chain_res_counts[pep])

# collect atoms
def aa_atoms(chain):
    for res in chain:
        if is_aa(res, standard=False):
            for atom in res.get_atoms():
                yield res, atom

model = next(iter(s))  # first model
receptor_chain = model[receptor]
peptide_chain  = model[pep]

# interface by distance threshold
cutoff = 6.0
contact_res = set()
pep_atoms = [a for _,a in aa_atoms(peptide_chain)]
for res, atom in aa_atoms(receptor_chain):
    ax,ay,az = atom.coord
    for a2 in pep_atoms:
        bx,by,bz = a2.coord
        d2 = (ax-bx)*(ax-bx) + (ay-by)*(ay-by) + (az-bz)*(az-bz)
        if d2 <= cutoff*cutoff:
            # use (chain, resseq, icode, resname)
            idt = (receptor, int(res.id[1]), res.id[2].strip() or "", res.resname.strip())
            contact_res.add(idt)
            break

contact_res = sorted(contact_res, key=lambda x: (x[1], x[2]))
print(f"interface residues: {len(contact_res)}")

# write pocket TSV
out_dir = POCKETS_DIR / "GLP1R"
out_dir.mkdir(parents=True, exist_ok=True)
out_tsv = out_dir / "peptide_interface.tsv"
with out_tsv.open("w") as f:
    f.write("chain\tresid\ticode\tresname\n")
    for ch, resid, icode, rname in contact_res:
        f.write(f"{ch}\t{resid}\t{icode}\t{rname}\n")

print("wrote:", out_tsv)


receptor: A len 300 | peptide: A len 300
interface residues: 300
wrote: /Users/hike/code/proteosync1/artifacts/pockets/GLP1R/peptide_interface.tsv
