In [17]:
from hydra import initialize, compose
import polars as pl
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from tqdm import tqdm
from src.ergochemics.standardize import standardize_rxn, hash_reaction
import json
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
import pickle

In [18]:
# Compose the Hydra "base" config from the filepaths config directory.
# The cells below read `cfg.scratch` and `cfg.data` from it.
filepaths_config_dir = "../configs/filepaths"
with initialize(config_path=filepaths_config_dir, version_base=None):
    cfg = compose(config_name="base")

In [19]:
# Test split stored under the "alternate_reaction_center" directory.
# NOTE(review): the literal "None" segments in the path presumably come from
# unset fields when the split was written -- confirm upstream.
arc_test_path = Path(cfg.scratch).joinpath(
    "time_split_250915_time_split/None_alternate_reaction_center/Nonefold/test.parquet"
)
arc_negs = pl.read_parquet(arc_test_path)
arc_negs.head()

protein_idx,reaction_idx,pid,rid,protein_embedding,smarts,am_smarts,reaction_center,y
i64,i64,str,str,list[f32],str,str,list[list[list[i64]]],i64
0,0,"""Q9H3Z7""","""b974ffacf1d79e2635848eb4228153…","[0.03637, 0.16163, … 0.000506]","""*C(=O)OCC(O)CO.O>>*C(=O)O.OCC(…","""[*:3][C:1](=[O:4])[O:5][CH2:6]…","[[[1, 3], [0]], [[1, 3], [0]]]",1
1,1,"""Q0RWC9""","""1a55f3c111dc32bc80f27e9d4874ea…","[-0.023623, 0.137708, … 0.061511]","""O=C(O)c1ccc(O)c(O)c1.O=C=O>>O=…","""[O:12]=[C:8]([OH:13])[c:5]1[cH…","[[[10], [1, 0]], [[10, 11, 13]]]",1
2,2,"""Q80XH4""","""46b8f933837da28efe31dccc0dc939…","[-0.046594, 0.372996, … -0.210981]","""[1*]C(O)C(COC1OC(CO)C(OC2OC(CO…","""[1*:73][CH:71]([OH:74])[CH:70]…","[[[53, 56], [17, 15]], [[53, 54], [15, 18]]]",1
3,3,"""H2LNR5""","""9851de4d3521c90a80bc8cf68a948f…","[-0.062493, 0.197662, … 0.024808]","""NC(CCC(=O)NC(CS)C(=O)NCC(=O)O)…","""[NH2:17][CH:16]([CH2:15][CH2:1…","[[[14, 16], [21], [2, 1]], [[14, 16], [0], [22, 23]]]",1
4,4,"""Q8NKB0""","""17bb99c7a7a73802ae1dc5760af1cb…","[0.072061, 0.220584, … 0.06905]","""CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c…","""[CH3:4][CH:2]1[CH2:5][CH2:6][C…","[[[1, 22], [0]], [[1, 2, 23]]]",1


In [20]:
# Test split stored under the "random" directory (counterpart of `arc_negs`).
rand_negs = pl.read_parquet(Path(cfg.scratch) / "time_split_250915_time_split/None_random/Nonefold/test.parquet")
# Preview the frame loaded in this cell, not `arc_negs`, so the displayed rows
# actually reflect the random-negative split.
rand_negs.head()

protein_idx,reaction_idx,pid,rid,protein_embedding,smarts,am_smarts,reaction_center,y
i64,i64,str,str,list[f32],str,str,list[list[list[i64]]],i64
0,0,"""Q9H3Z7""","""b974ffacf1d79e2635848eb4228153…","[0.03637, 0.16163, … 0.000506]","""*C(=O)OCC(O)CO.O>>*C(=O)O.OCC(…","""[*:3][C:1](=[O:4])[O:5][CH2:6]…","[[[1, 3], [0]], [[1, 3], [0]]]",1
1,1,"""Q0RWC9""","""1a55f3c111dc32bc80f27e9d4874ea…","[-0.023623, 0.137708, … 0.061511]","""O=C(O)c1ccc(O)c(O)c1.O=C=O>>O=…","""[O:12]=[C:8]([OH:13])[c:5]1[cH…","[[[10], [1, 0]], [[10, 11, 13]]]",1
2,2,"""Q80XH4""","""46b8f933837da28efe31dccc0dc939…","[-0.046594, 0.372996, … -0.210981]","""[1*]C(O)C(COC1OC(CO)C(OC2OC(CO…","""[1*:73][CH:71]([OH:74])[CH:70]…","[[[53, 56], [17, 15]], [[53, 54], [15, 18]]]",1
3,3,"""H2LNR5""","""9851de4d3521c90a80bc8cf68a948f…","[-0.062493, 0.197662, … 0.024808]","""NC(CCC(=O)NC(CS)C(=O)NCC(=O)O)…","""[NH2:17][CH:16]([CH2:15][CH2:1…","[[[14, 16], [21], [2, 1]], [[14, 16], [0], [22, 23]]]",1
4,4,"""Q8NKB0""","""17bb99c7a7a73802ae1dc5760af1cb…","[0.072061, 0.220584, … 0.06905]","""CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c…","""[CH3:4][CH:2]1[CH2:5][CH2:6][C…","[[[1, 22], [0]], [[1, 2, 23]]]",1


In [21]:
# Tab-separated table for the time split (despite the .csv extension).
# It is used below to look up each protein's sequence by its `Entry` id.
toc_path = Path(cfg.data, "time_split", "250915_time_split.csv")
toc = pl.read_csv(toc_path, separator="\t")
toc.head()


Entry,Label,Sequence
str,str,str
"""Q9H3Z7""","""b974ffacf1d79e2635848eb4228153…","""MCVICFVKALVRVFKIYLTASYTYPFRGWP…"
"""Q0RWC9""","""1a55f3c111dc32bc80f27e9d4874ea…","""MNETDIEGARVVVAEACRVAAARGLMEGIL…"
"""Q80XH4""","""46b8f933837da28efe31dccc0dc939…","""MAKPFFRLQKFLRRTQFLLLFLTAAYLMTG…"
"""H2LNR5""","""9851de4d3521c90a80bc8cf68a948f…","""MAPMLVSLNCGIRVQRRTLTLLIRQTSSYH…"
"""Q8NKB0""","""17bb99c7a7a73802ae1dc5760af1cb…","""MRTRSTISTPNGITWYYEQEGTGPDVVLVP…"


In [22]:
# Directory the CLIPZyme input CSVs are written to.
clip_dir = "/home/stef/clipzyme/files"

# Map an id parsed from each structure file name (the second "-"-separated
# token of the stem) to that file's path, with the local path prefix swapped
# for the remote one.
# NOTE(review): presumably the key is the protein accession and the rewritten
# prefix is where the files live on the machine running CLIPZyme -- confirm.
af2_dir = Path(cfg.data) / "time_split" / "af2"
af2_whitelist = {
    entry.stem.split("-")[1]: str(entry).replace("/home/stef/quest_data", "/projects/p30041/spn1560")
    for entry in af2_dir.iterdir()
}

In [23]:
# Build the CLIPZyme input table for the alternate-reaction-center split:
# keep only pairs whose protein has a structure file in `af2_whitelist`,
# attach the protein sequence and structure path, and rename the columns to
# reaction / sequence / protein_id / y / cif.

# Sequence lookup keyed by protein id. It is deduplicated on the join key so
# the left join below cannot multiply rows in case `toc` has more than one row
# per `Entry`.
seq_by_pid = toc.select(["Entry", "Sequence"]).unique(
    subset="Entry", keep="first", maintain_order=True
)

arc_with_structure = arc_negs.filter(
    pl.col("pid").is_in(list(af2_whitelist.keys()))
)

arc_clip = (
    arc_with_structure
    .join(seq_by_pid, how="left", left_on="pid", right_on="Entry")
    .select(["am_smarts", "Sequence", "pid", "y"])
    # Every remaining pid is a key of af2_whitelist, so this maps pid -> path.
    .with_columns(pl.col("pid").replace(af2_whitelist).alias("cif"))
    .rename(
        {
            "am_smarts": "reaction",
            "Sequence": "sequence",
            "pid": "protein_id",
        }
    )
)

# The join only adds a column; the number of pairs must be unchanged.
assert arc_clip.height == arc_with_structure.height, "sequence join changed the number of pairs"

print(f"# pairs post af2 filtering: {arc_clip.height} vs pre-filtering: {arc_negs.height}")
arc_clip.head()

# pairs post af2 filtering: 6332 vs pre-filtering: 7234


reaction,sequence,protein_id,y,cif
str,str,str,i64,str
"""[*:3][C:1](=[O:4])[O:5][CH2:6]…","""MCVICFVKALVRVFKIYLTASYTYPFRGWP…","""Q9H3Z7""",1,"""/projects/p30041/spn1560/hiec/…"
"""[O:12]=[C:8]([OH:13])[c:5]1[cH…","""MNETDIEGARVVVAEACRVAAARGLMEGIL…","""Q0RWC9""",1,"""/projects/p30041/spn1560/hiec/…"
"""[1*:73][CH:71]([OH:74])[CH:70]…","""MAKPFFRLQKFLRRTQFLLLFLTAAYLMTG…","""Q80XH4""",1,"""/projects/p30041/spn1560/hiec/…"
"""[NH2:17][CH:16]([CH2:15][CH2:1…","""MAPMLVSLNCGIRVQRRTLTLLIRQTSSYH…","""H2LNR5""",1,"""/projects/p30041/spn1560/hiec/…"
"""[CH3:4][CH:2]1[CH2:5][CH2:6][C…","""MRTRSTISTPNGITWYYEQEGTGPDVVLVP…","""Q8NKB0""",1,"""/projects/p30041/spn1560/hiec/…"


In [24]:
# Persist the alternate-reaction-center table as a CSV in the CLIPZyme directory.
arc_csv_path = Path(clip_dir, "time_split_arc.csv")
arc_clip.write_csv(arc_csv_path)

In [25]:
# Build the CLIPZyme input table for the random-negative split:
# keep only pairs whose protein has a structure file in `af2_whitelist`,
# attach the protein sequence and structure path, and rename the columns to
# reaction / sequence / protein_id / y / cif.

# Sequence lookup keyed by protein id. It is deduplicated on the join key so
# the left join below cannot multiply rows in case `toc` has more than one row
# per `Entry`.
seq_by_pid = toc.select(["Entry", "Sequence"]).unique(
    subset="Entry", keep="first", maintain_order=True
)

rand_with_structure = rand_negs.filter(
    pl.col("pid").is_in(list(af2_whitelist.keys()))
)

random_clip = (
    rand_with_structure
    .join(seq_by_pid, how="left", left_on="pid", right_on="Entry")
    .select(["am_smarts", "Sequence", "pid", "y"])
    # Every remaining pid is a key of af2_whitelist, so this maps pid -> path.
    .with_columns(pl.col("pid").replace(af2_whitelist).alias("cif"))
    .rename(
        {
            "am_smarts": "reaction",
            "Sequence": "sequence",
            "pid": "protein_id",
        }
    )
)

# The join only adds a column; the number of pairs must be unchanged.
assert random_clip.height == rand_with_structure.height, "sequence join changed the number of pairs"

print(f"# pairs post af2 filtering: {random_clip.height} vs pre-filtering: {rand_negs.height}")
random_clip.head()

# pairs post af2 filtering: 5302 vs pre-filtering: 6072


reaction,sequence,protein_id,y,cif
str,str,str,i64,str
"""[*:3][C:1](=[O:4])[O:5][CH2:6]…","""MCVICFVKALVRVFKIYLTASYTYPFRGWP…","""Q9H3Z7""",1,"""/projects/p30041/spn1560/hiec/…"
"""[O:12]=[C:8]([OH:13])[c:5]1[cH…","""MNETDIEGARVVVAEACRVAAARGLMEGIL…","""Q0RWC9""",1,"""/projects/p30041/spn1560/hiec/…"
"""[1*:73][CH:71]([OH:74])[CH:70]…","""MAKPFFRLQKFLRRTQFLLLFLTAAYLMTG…","""Q80XH4""",1,"""/projects/p30041/spn1560/hiec/…"
"""[NH2:17][CH:16]([CH2:15][CH2:1…","""MAPMLVSLNCGIRVQRRTLTLLIRQTSSYH…","""H2LNR5""",1,"""/projects/p30041/spn1560/hiec/…"
"""[CH3:4][CH:2]1[CH2:5][CH2:6][C…","""MRTRSTISTPNGITWYYEQEGTGPDVVLVP…","""Q8NKB0""",1,"""/projects/p30041/spn1560/hiec/…"


In [26]:
# Persist the random-negative table as a CSV in the CLIPZyme directory.
random_csv_path = Path(clip_dir, "time_split_random.csv")
random_clip.write_csv(random_csv_path)

In [29]:
# Sanity check: list the distinct label values present in the exported table.
random_clip.get_column("y").unique()

y
i64
0
1
