In [14]:
from hydra import initialize, compose
import polars as pl
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from tqdm import tqdm
from src.ergochemics.standardize import standardize_rxn, hash_reaction
import json
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
import pickle
import torch

In [15]:
with initialize(version_base=None, config_path="../configs/filepaths"):
    cfg = compose(config_name="base")

In [16]:
arc_negs = pl.read_parquet(Path(cfg.scratch) / "time_split_250915_time_split/None_alternate_reaction_center/Nonefold/test.parquet")
arc_negs.head()

protein_idx,reaction_idx,pid,rid,protein_embedding,smarts,am_smarts,reaction_center,y
i64,i64,str,str,list[f32],str,str,list[list[list[i64]]],i64
0,0,"""Q9H3Z7""","""b974ffacf1d79e2635848eb4228153…","[0.03637, 0.16163, … 0.000506]","""*C(=O)OCC(O)CO.O>>*C(=O)O.OCC(…","""[*:3][C:1](=[O:4])[O:5][CH2:6]…","[[[1, 3], [0]], [[1, 3], [0]]]",1
1,1,"""Q0RWC9""","""1a55f3c111dc32bc80f27e9d4874ea…","[-0.023623, 0.137708, … 0.061511]","""O=C(O)c1ccc(O)c(O)c1.O=C=O>>O=…","""[O:12]=[C:8]([OH:13])[c:5]1[cH…","[[[10], [1, 0]], [[10, 11, 13]]]",1
2,2,"""Q80XH4""","""46b8f933837da28efe31dccc0dc939…","[-0.046594, 0.372996, … -0.210981]","""[1*]C(O)C(COC1OC(CO)C(OC2OC(CO…","""[1*:73][CH:71]([OH:74])[CH:70]…","[[[53, 56], [17, 15]], [[53, 54], [15, 18]]]",1
3,3,"""H2LNR5""","""9851de4d3521c90a80bc8cf68a948f…","[-0.062493, 0.197662, … 0.024808]","""NC(CCC(=O)NC(CS)C(=O)NCC(=O)O)…","""[NH2:17][CH:16]([CH2:15][CH2:1…","[[[14, 16], [21], [2, 1]], [[14, 16], [0], [22, 23]]]",1
4,4,"""Q8NKB0""","""17bb99c7a7a73802ae1dc5760af1cb…","[0.072061, 0.220584, … 0.06905]","""CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c…","""[CH3:4][CH:2]1[CH2:5][CH2:6][C…","[[[1, 22], [0]], [[1, 2, 23]]]",1


In [17]:
rand_negs = pl.read_parquet(Path(cfg.scratch) / "time_split_250915_time_split/None_random/Nonefold/test.parquet")
arc_negs.head()

protein_idx,reaction_idx,pid,rid,protein_embedding,smarts,am_smarts,reaction_center,y
i64,i64,str,str,list[f32],str,str,list[list[list[i64]]],i64
0,0,"""Q9H3Z7""","""b974ffacf1d79e2635848eb4228153…","[0.03637, 0.16163, … 0.000506]","""*C(=O)OCC(O)CO.O>>*C(=O)O.OCC(…","""[*:3][C:1](=[O:4])[O:5][CH2:6]…","[[[1, 3], [0]], [[1, 3], [0]]]",1
1,1,"""Q0RWC9""","""1a55f3c111dc32bc80f27e9d4874ea…","[-0.023623, 0.137708, … 0.061511]","""O=C(O)c1ccc(O)c(O)c1.O=C=O>>O=…","""[O:12]=[C:8]([OH:13])[c:5]1[cH…","[[[10], [1, 0]], [[10, 11, 13]]]",1
2,2,"""Q80XH4""","""46b8f933837da28efe31dccc0dc939…","[-0.046594, 0.372996, … -0.210981]","""[1*]C(O)C(COC1OC(CO)C(OC2OC(CO…","""[1*:73][CH:71]([OH:74])[CH:70]…","[[[53, 56], [17, 15]], [[53, 54], [15, 18]]]",1
3,3,"""H2LNR5""","""9851de4d3521c90a80bc8cf68a948f…","[-0.062493, 0.197662, … 0.024808]","""NC(CCC(=O)NC(CS)C(=O)NCC(=O)O)…","""[NH2:17][CH:16]([CH2:15][CH2:1…","[[[14, 16], [21], [2, 1]], [[14, 16], [0], [22, 23]]]",1
4,4,"""Q8NKB0""","""17bb99c7a7a73802ae1dc5760af1cb…","[0.072061, 0.220584, … 0.06905]","""CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c…","""[CH3:4][CH:2]1[CH2:5][CH2:6][C…","[[[1, 22], [0]], [[1, 2, 23]]]",1


In [18]:
toc = pl.read_csv(Path(cfg.data) / "time_split" / "250915_time_split.csv", separator='\t')
toc.head()


Entry,Label,Sequence
str,str,str
"""P9WEM0""","""772cec55ef0385f46f9c59a741eff3…","""MCTTFKAAIFDMGGVLFTWNPIVDTQVSLK…"
"""A0A085GHR3""","""6b5456b8e1371df3f1326343255cbc…","""MFNVSNNVAPSRYQGPSSTSVTPNAFHDVP…"
"""O24769""","""181b940baaf957ae014788e3f2b289…","""MSSLLDIIYQLRQVPRWDGSFQFEKEDVSQ…"
"""A0A397HK53""","""e083443eeced3a35861687bd0dbcf4…","""MYESIIPFPDETASTVDKYCCSHSTSLPPL…"
"""Q4WBI5""","""d1e49e481f532c530e85f82580657a…","""MCKMYKVVETDASPGQRSVLQLVKDLLILS…"


In [19]:
clip_dir = "/home/stef/clipzyme/files"
af2_whitelist = {}
for file in (Path(cfg.data) / "time_split" / "af2").iterdir():
    af2_whitelist[file.stem.split("-")[1]] = str(file).replace("/home/stef/quest_data", "/projects/p30041/spn1560")

In [20]:
arc_clip = arc_negs.filter(
    pl.col("pid").is_in(list(af2_whitelist.keys()))
).join(
    toc,
    how="left",
    left_on="pid",
    right_on="Entry",
).select(
    ["am_smarts", "Sequence", "pid", "y"]
).with_columns(
    pl.col("pid").replace(af2_whitelist).alias("cif")
).rename(
    {
        "am_smarts": "reaction",
        "Sequence": "sequence",
        "pid": "protein_id",
    }
)
print(f"# pairs post af2 filtering: {arc_clip.height} vs pre-filtering: {arc_negs.height}")
arc_clip.head()

# pairs post af2 filtering: 6332 vs pre-filtering: 7234


reaction,sequence,protein_id,y,cif
str,str,str,i64,str
"""[*:3][C:1](=[O:4])[O:5][CH2:6]…","""MCVICFVKALVRVFKIYLTASYTYPFRGWP…","""Q9H3Z7""",1,"""/projects/p30041/spn1560/hiec/…"
"""[O:12]=[C:8]([OH:13])[c:5]1[cH…","""MNETDIEGARVVVAEACRVAAARGLMEGIL…","""Q0RWC9""",1,"""/projects/p30041/spn1560/hiec/…"
"""[1*:73][CH:71]([OH:74])[CH:70]…","""MAKPFFRLQKFLRRTQFLLLFLTAAYLMTG…","""Q80XH4""",1,"""/projects/p30041/spn1560/hiec/…"
"""[NH2:17][CH:16]([CH2:15][CH2:1…","""MAPMLVSLNCGIRVQRRTLTLLIRQTSSYH…","""H2LNR5""",1,"""/projects/p30041/spn1560/hiec/…"
"""[CH3:4][CH:2]1[CH2:5][CH2:6][C…","""MRTRSTISTPNGITWYYEQEGTGPDVVLVP…","""Q8NKB0""",1,"""/projects/p30041/spn1560/hiec/…"


In [21]:
arc_clip.write_csv(Path(clip_dir) / "time_split_arc.csv")

In [22]:
random_clip = rand_negs.filter(
    pl.col("pid").is_in(list(af2_whitelist.keys()))
).join(
    toc,
    how="left",
    left_on="pid",
    right_on="Entry",
).select(
    ["am_chiral_smarts", "Sequence", "pid", "y"]
).with_columns(
    pl.col("pid").replace(af2_whitelist).alias("cif")
).rename(
    {
        "am_chiral_smarts": "reaction",
        "Sequence": "sequence",
        "pid": "protein_id",
    }
)
print(f"# pairs post af2 filtering: {random_clip.height} vs pre-filtering: {rand_negs.height}")
random_clip.head()

# pairs post af2 filtering: 5285 vs pre-filtering: 6072


reaction,sequence,protein_id,y,cif
str,str,str,i64,str
"""[*:1][NH:2][C@@H:3]([CH2:4][O:…","""MFNVSNNVAPSRYQGPSSTSVTPNAFHDVP…","""A0A085GHR3""",1,"""/projects/p30041/spn1560/hiec/…"
"""[NH2:1][c:2]1[n:3][cH:4][n:5][…","""MSSLLDIIYQLRQVPRWDGSFQFEKEDVSQ…","""O24769""",1,"""/projects/p30041/spn1560/hiec/…"
"""[CH3:1][O:45][c:44]1[cH:43][cH…","""MYESIIPFPDETASTVDKYCCSHSTSLPPL…","""A0A397HK53""",1,"""/projects/p30041/spn1560/hiec/…"
"""[CH3:1][C:2]([CH3:3])=[CH:4][C…","""MCKMYKVVETDASPGQRSVLQLVKDLLILS…","""Q4WBI5""",1,"""/projects/p30041/spn1560/hiec/…"
"""[NH2:1][C:2](=[O:3])[C:4]1=[CH…","""MNEILKKRLKLLKNNFGTHINKIANKKILI…","""A0A0U3AP28""",1,"""/projects/p30041/spn1560/hiec/…"


In [23]:
random_clip.write_csv(Path(clip_dir) / "time_split_random.csv")

In [24]:
random_clip['y'].unique()

y
i64
0
1


In [25]:
random_clip['reaction'].unique().to_list()

['[O:1]=[C:2]([OH:3])[c:4]1[cH:5][cH:6][c:7]([OH:8])[c:9]([OH:10])[cH:11]1.[C:12](=[O:13])=[O:14]>>[O:1]=[C:2]([OH:3])[c:4]1[cH:5][cH:6][c:7]([OH:8])[c:9]([OH:10])[c:11]1[C:12](=[O:13])[OH:14]',
 '[NH2:1][c:2]1[cH:3][cH:4][n:5]([C@@H:6]2[O:7][C@H:8]([CH2:9][O:10][P:11](=[O:12])([OH:13])[O:14][P:15](=[O:16])([OH:17])[O:46][P:47](=[O:48])([OH:49])[OH:50])[C@@H:19]([OH:20])[C@H:21]2[OH:22])[c:23](=[O:24])[n:25]1.[OH:18][P:43]([O:42][P:39]([O:38][CH2:37][C@H:36]1[O:35][C@@H:34]([n:33]2[c:29]3[n:28][c:27]([NH2:26])[nH:57][c:55](=[O:56])[c:30]3[n:31][cH:32]2)[C@H:53]([OH:54])[C@@H:51]1[OH:52])(=[O:40])[OH:41])(=[O:44])[OH:45]>>[NH2:1][c:2]1[cH:3][cH:4][n:5]([C@@H:6]2[O:7][C@H:8]([CH2:9][O:10][P:11](=[O:12])([OH:13])[O:14][P:15](=[O:16])([OH:17])[OH:18])[C@@H:19]([OH:20])[C@H:21]2[OH:22])[c:23](=[O:24])[n:25]1.[NH2:26][c:27]1[n:28][c:29]2[c:30]([n:31][cH:32][n:33]2[C@@H:34]2[O:35][C@H:36]([CH2:37][O:38][P:39](=[O:40])([OH:41])[O:42][P:43](=[O:44])([OH:45])[O:46][P:47](=[O:48])([OH:49])[OH:50]