In [15]:
import requests
from hydra import initialize, compose
import polars as pl
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from tqdm import tqdm
from src.ergochemics.standardize import standardize_rxn, hash_reaction
import json
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
import torch
from tqdm import tqdm

In [16]:
with initialize(version_base=None, config_path="../configs/filepaths"):
    cfg = compose(config_name="base")

In [17]:
uni_ids = [
    'P0DX75',
    'P9WQG7',
    'P0DXF9',
    'A0A8B6XWW9',
    'P0DXX5',
    'P0DXB6',
    'P0DXD5',
    'P0DXE2',
    'C0HMB0',
    'P0DXH6',
    'P0DXC4',
    'P0DXG0',
    'P0DO78',
    'P0DXD6',
    'P0DXB5',
    'P9WEM0',
    'P0DO76',
    'P0DXE7',
    'P0DXE4',
    'P0DXH2',
    'P0DXD7',
    'A0A9J9HGY6',
    'A0A913XCT1',
    'P0DO83',
    'P0DO84',
    'P0DXB2',
    'P0DXE0',
    'P13288',
    'P0DXB4',
    'P0DXB8',
    'A0A8D5M3Y5',
    'P0DXE1',
    'P0DO77',
    'P0DO86',
    'P0DXE3',
    'O41136',
    'P0DXD3',
    'P0DW70',
    'P0DXD2',
    'P0DXB7',
    'P0DX71'
]

save_to = "/home/stef/quest_data/hiec/data/time_split/af2_add"

In [18]:
for uni_id in uni_ids:
    url = f'https://alphafold.ebi.ac.uk/api/prediction/{uni_id}'
    response = requests.get(url)
    print(f'{uni_id}: {response.status_code}')

    if response.status_code == 200:
        print(response.json()[0]['cifUrl'])
        cif_url = response.json()[0]['cifUrl']
        fn = cif_url.split('/')[-1]
        cif_response = requests.get(cif_url)
        filename = f"{save_to}/{fn}.cif"
        with open(filename, 'wb') as f:
            f.write(cif_response.content)
        print(f'Saved {filename}')
    else:
        print(f'No prediction found for {uni_id}')

P0DX75: 200
https://alphafold.ebi.ac.uk/files/AF-P0DX75-F1-model_v6.cif
Saved /home/stef/quest_data/hiec/data/time_split/af2_add/AF-P0DX75-F1-model_v6.cif.cif
P9WQG7: 200
https://alphafold.ebi.ac.uk/files/AF-P9WQG7-F1-model_v6.cif
Saved /home/stef/quest_data/hiec/data/time_split/af2_add/AF-P9WQG7-F1-model_v6.cif.cif
P0DXF9: 200
https://alphafold.ebi.ac.uk/files/AF-P0DXF9-F1-model_v6.cif
Saved /home/stef/quest_data/hiec/data/time_split/af2_add/AF-P0DXF9-F1-model_v6.cif.cif
A0A8B6XWW9: 200
https://alphafold.ebi.ac.uk/files/AF-A0A8B6XWW9-F1-model_v6.cif
Saved /home/stef/quest_data/hiec/data/time_split/af2_add/AF-A0A8B6XWW9-F1-model_v6.cif.cif
P0DXX5: 200
https://alphafold.ebi.ac.uk/files/AF-P0DXX5-F1-model_v6.cif
Saved /home/stef/quest_data/hiec/data/time_split/af2_add/AF-P0DXX5-F1-model_v6.cif.cif
P0DXB6: 200
https://alphafold.ebi.ac.uk/files/AF-P0DXB6-F1-model_v6.cif
Saved /home/stef/quest_data/hiec/data/time_split/af2_add/AF-P0DXB6-F1-model_v6.cif.cif
P0DXD5: 200
https://alphafold.ebi.

In [19]:
clip_dir = "/home/stef/clipzyme/files"
af2_whitelist = {}
for file in (Path(cfg.data) / "time_split" / "af2_add").iterdir():
    af2_whitelist[file.stem.split("-")[1]] = str(file).replace("/home/stef/quest_data", "/projects/p30041/spn1560")

In [20]:
rand_negs = pl.read_parquet(Path(cfg.scratch) / "time_split_250915_time_split/None_random/Nonefold/test.parquet")
rand_negs.head()

protein_idx,reaction_idx,pid,rid,protein_embedding,smarts,am_smarts,reaction_center,am_chiral_smarts,chiral_smarts,y
i32,i32,str,str,list[f32],str,str,list[list[list[i64]]],str,str,i64
0,0,"""P9WEM0""","""772cec55ef0385f46f9c59a741eff3…","[-0.045221, 0.127457, … -0.064685]","""CC1=C(COP(=O)(O)O)C2(C)CCCC(C)…","""[CH3:6][C:4]1=[C:3]([CH2:1][O:…","[[[3, 4], [0]], [[3, 4], [2]]]","""[CH3:1][C:2]1=[C:3]([CH2:4][OH…","""CC1=C(CO)[C@@]2(C)CCCC(C)(C)[C…",1
1,1,"""A0A085GHR3""","""6b5456b8e1371df3f1326343255cbc…","[0.08258, 0.175821, … 0.161705]","""*NC(CO)C(*)=O.Nc1ncnc2c1ncn2C1…","""[*:6][NH:4][CH:3]([CH2:1][OH:1…","[[[3, 4], [22, 19]], [[3, 4], [19, 21]]]","""[*:1][NH:2][C@@H:3]([CH2:4][O:…","""*N[C@@H](COP(=O)(O)O)C(*)=O.Nc…",1
2,2,"""O24769""","""181b940baaf957ae014788e3f2b289…","[0.059184, 0.253729, … 0.162025]","""Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O…","""[NH2:18][c:16]1[n:17][cH:15][n…","[[[13, 14], [0]], [[13, 14], [2]]]","""[NH2:1][c:2]1[n:3][cH:4][n:5][…","""Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO…",1
3,3,"""A0A397HK53""","""e083443eeced3a35861687bd0dbcf4…","[0.072891, 0.278549, … 0.093182]","""Nc1ncnc2c1ncn2C1OC(CSCCC(N)C(=…","""[NH2:27][c:25]1[n:26][cH:24][n…","[[[14], [0, 1]], [[0, 1], [17]]]","""[CH3:1][O:45][c:44]1[cH:43][cH…","""COc1ccc(/C=C2\NC(=O)[C@H](CCCN…",1
4,4,"""Q4WBI5""","""d1e49e481f532c530e85f82580657a…","[0.083018, 0.236445, … 0.102194]","""Cc1cc(O)cc(=O)o1.CC(C)=CCCC(C)…","""[CH3:10][c:9]1[cH:5][c:3]([OH:…","[[[5], [19, 20]], [[19, 20], [2]]]","""[CH3:1][C:2]([CH3:3])=[CH:4][C…","""CC(C)=CCC/C(C)=C/CC/C(C)=C/CC/…",1


In [21]:
toc = pl.read_csv(Path(cfg.data) / "time_split" / "250915_time_split.csv", separator='\t')
toc.head()

Entry,Label,Sequence
str,str,str
"""P9WEM0""","""772cec55ef0385f46f9c59a741eff3…","""MCTTFKAAIFDMGGVLFTWNPIVDTQVSLK…"
"""A0A085GHR3""","""6b5456b8e1371df3f1326343255cbc…","""MFNVSNNVAPSRYQGPSSTSVTPNAFHDVP…"
"""O24769""","""181b940baaf957ae014788e3f2b289…","""MSSLLDIIYQLRQVPRWDGSFQFEKEDVSQ…"
"""A0A397HK53""","""e083443eeced3a35861687bd0dbcf4…","""MYESIIPFPDETASTVDKYCCSHSTSLPPL…"
"""Q4WBI5""","""d1e49e481f532c530e85f82580657a…","""MCKMYKVVETDASPGQRSVLQLVKDLLILS…"


In [22]:
random_clip = rand_negs.filter(
    pl.col("pid").is_in(list(af2_whitelist.keys()))
).join(
    toc,
    how="left",
    left_on="pid",
    right_on="Entry",
).select(
    ["am_chiral_smarts", "Sequence", "pid", "y"]
).with_columns(
    pl.col("pid").replace(af2_whitelist).alias("cif")
).rename(
    {
        "am_chiral_smarts": "reaction",
        "Sequence": "sequence",
        "pid": "protein_id",
    }
)
print(f"# pairs post af2 filtering: {random_clip.height} vs pre-filtering: {rand_negs.height}")
random_clip.head()

# pairs post af2 filtering: 640 vs pre-filtering: 6072


reaction,sequence,protein_id,y,cif
str,str,str,i64,str
"""[CH3:1][C:2]1=[C:3]([CH2:4][OH…","""MCTTFKAAIFDMGGVLFTWNPIVDTQVSLK…","""P9WEM0""",1,"""/projects/p30041/spn1560/hiec/…"
"""[NH2:1][C:2](=[O:3])[C:4]1=[CH…","""MSRVSDRVAEKVALISGAARGMGASHAQVL…","""P0DXE0""",1,"""/projects/p30041/spn1560/hiec/…"
"""[NH2:1][c:2]1[n:3][c:4]2[c:5](…","""MEYSWLENKLQEINRKCIQINNKKSNNDSL…","""P0DXB7""",1,"""/projects/p30041/spn1560/hiec/…"
"""[NH2:1][c:2]1[n:3][c:4]2[c:5](…","""MEYSWLENKLQEINRKCIQINNKKSNNDSL…","""P0DXB7""",1,"""/projects/p30041/spn1560/hiec/…"
"""[SH:4][CH2:5][CH2:6][NH:7][C:8…","""MSFSPTYSIVMASPLLTSSQMIPTTGSTVG…","""P0DO77""",1,"""/projects/p30041/spn1560/hiec/…"


In [26]:
random_clip.write_csv(Path(clip_dir) / "time_split_random_af2_add.csv")