In [30]:
import polars as pl
from ergochemics.standardize import hash_compound, hash_reaction
import json
from pathlib import Path

In [2]:
krs = pl.read_parquet("/home/stef/enz_rxn_data/data/processed/pathway/known_reactions.parquet")
kcs = pl.read_parquet("/home/stef/enz_rxn_data/data/processed/pathway/known_compounds.parquet")
old_krs = pl.read_parquet("/home/stef/enz_rxn_data/data/raw/old_known_reactions.parquet")
old_kcs = pl.read_parquet("/home/stef/enz_rxn_data/data/raw/old_known_compounds.parquet")

In [3]:
print(f"Old known reactions: {old_krs.shape[0]}")
print(f"New known reactions: {krs.shape[0]}")

Old known reactions: 28142
New known reactions: 28142


In [5]:
print(f"Old known compounds: {old_kcs.shape[0]}")
print(f"New known compounds: {kcs.shape[0]}")

Old known compounds: 10594
New known compounds: 10594


In [6]:
print(f"Old reactions {old_krs.columns}")
print(f"New reactions {krs.columns}")

Old reactions ['id', 'smarts', 'enzymes', 'reverse', 'db_ids']
New reactions ['id', 'smarts', 'enzymes', 'reverse', 'db_ids']


In [7]:
print(f"Old known compounds: {old_kcs.columns}")
print(f"New known compounds: {kcs.columns}")

Old known compounds: ['id', 'smiles', 'name', 'chebi_id', 'n_atoms']
New known compounds: ['id', 'smiles', 'name', 'chebi_id', 'n_atoms']


In [None]:
old_smi_hashes = old_kcs.select(
    pl.col("smiles").map_elements(hash_compound, return_dtype=pl.String)
).to_series().to_list()
old_smi_hashes = set(old_smi_hashes)
print(len(old_smi_hashes))
new_smi_hashes = kcs.select(
    pl.col("id")
).to_series().to_list()
new_smi_hashes = set(new_smi_hashes)
print(len(new_smi_hashes))

assert len(old_smi_hashes ^ new_smi_hashes) == 0
assert len(old_smi_hashes) == len(new_smi_hashes)
assert len(new_smi_hashes) == kcs.shape[0]

10594
10594
xor: 0


In [21]:
old_sma_hashes = old_krs.select(
    pl.col("smarts").map_elements(hash_reaction, return_dtype=pl.String)
).to_series().to_list()
old_sma_hashes = set(old_sma_hashes)
print(len(old_sma_hashes))
new_sma_hashes = krs.select(
    pl.col("id")
).to_series().to_list()
new_sma_hashes = set(new_sma_hashes)
print(len(new_sma_hashes))

assert len(old_sma_hashes ^ new_sma_hashes) == 0
assert len(old_sma_hashes) == len(new_sma_hashes)
assert len(new_sma_hashes) == krs.shape[0]

28142
28142


In [22]:
old_krs = old_krs.with_columns(
    pl.col("smarts").map_elements(hash_reaction, return_dtype=pl.String).alias("hash_id")
)
old_krs.head()

id,smarts,enzymes,reverse,db_ids,hash_id
i32,str,list[str],i32,list[str],str
0,"""**.NC(CCC(=O)NC(CS)C(=O)NCC(=O…","[""O75881"", ""Q60991"", ""Q63688""]",4980,"[""RHEA:16438""]","""e908a824c912d1e39c46de92d1f738…"
1,"""*.*.*.*.*.*.*.*.*.*.CC(C)(COP(…","[""P36328"", ""P08411"", … ""Q6NY98""]",7915,"[""RHEA:51350""]","""d86d99a8143d3be8fc861a5de5e625…"
2,"""*.*.*.*.*.*.*.*.*NC(COP(=O)(O)…",[],3251,"[""RHEA:64546""]","""2972b2db66715ba6f3d6aeaf07f31b…"
3,"""*.*.*.*.*.*.*.*.*OP(=O)(O)OCC(…",[],4792,"[""RHEA:64377""]","""572e4a84a946af07c9ab6fd1c0347b…"
4,"""*.*.*.*.*.*.*.*.CC(C)CCCC(C)CC…",[],10358,"[""RHEA:64369""]","""dfe7fc761de3c213bf2b18fae4f8a5…"


In [23]:
kr_convert = dict(zip(old_krs["id"], old_krs["hash_id"]))

In [24]:
old_kcs = old_kcs.with_columns(
    pl.col("smiles").map_elements(hash_compound, return_dtype=pl.String).alias("hash_id")
)
old_kcs.head()

id,smiles,name,chebi_id,n_atoms,hash_id
i32,str,str,str,i32,str
0,"""*""","""A""","""CHEBI:13193""",1,"""df58248c414f342c81e056b40bee12…"
1,"""**""","""RX""","""CHEBI:17792""",2,"""bc2f74c22f98f7b6ffbc2f67453dbf…"
2,"""*C""","""an alkane""","""CHEBI:18310""",2,"""e8ecd17bfc980925ff0b634bd13104…"
3,"""*C#N""","""a nitrile""","""CHEBI:18379""",3,"""b219689f21c828e7f057ef453af118…"
4,"""*C(*)(O)C(*)(*)O""","""an ethanediol""","""CHEBI:140594""",8,"""94295b0379859ca57b86c85b076852…"


In [25]:
kc_convert = dict(zip(old_kcs["id"], old_kcs["hash_id"]))

In [26]:
print(kr_convert)

{0: 'e908a824c912d1e39c46de92d1f738c92b01fdbd', 1: 'd86d99a8143d3be8fc861a5de5e625c502ef3124', 2: '2972b2db66715ba6f3d6aeaf07f31bea536176aa', 3: '572e4a84a946af07c9ab6fd1c0347b96aa6c4575', 4: 'dfe7fc761de3c213bf2b18fae4f8a56ff79abee1', 5: 'c12f7fecace87b6ca7d7373a399971a9fd71cd2e', 6: '5e7d7b49efeea559a16a246ab941986f2dd5ea37', 7: 'ae4c10e88391a484a7e32aec813103cd3b982047', 8: '0319fabd8e67cf3d58bbdff5a8ebddad203bca58', 9: 'cd9b69e95b7ef2d3fa64bc5c0936b4414a4d75ce', 10: '01e44f9e7267f006a4def12abebde2f5d7af306b', 11: 'a2ec55ca35ab810c2adc91ad0624da3cbdc8a7d8', 12: '4f91357591a5f9da5db622a39b835e74fc5c7ee0', 13: '7d8d2e73eee7b9b23fef5a74ec072c49eee7d38d', 14: 'b2147829aea1d5a986b429c98cc74004cf2b9270', 15: '933f2e220e8aa04b9a04d3a73751a8c9b42524c6', 16: 'f5dfa40b575d2a0c14395ba45db50ea2db4c2e37', 17: '88dbea04bde078f98772d4788cb0fed6b96834e2', 18: '0d3fee595ee3acae5ab9629befb742b8d7b2f2c7', 19: '8f1d86a61fdb36054cea33ebb3408c9d3193b938', 20: 'e66033d391151b14996e34f0f59b82530181c591', 2

In [28]:
with open("kr_convert.json", "w") as f:
    json.dump(kr_convert, f)

In [29]:
with open("kc_convert.json", "w") as f:
    json.dump(kc_convert, f)

In [35]:
for p in Path("./data/interim/pathway").glob("mappings*.parquet"):
    df = pl.read_parquet(p)
    df = df.with_columns(
        pl.col("rxn_id").replace_strict(kr_convert)
    )
    df.head()
    df.write_parquet(p)

In [39]:
for p in Path("./data/processed/pathway").glob("mapped*.parquet"):
    df = pl.read_parquet(p)
    df = df.with_columns(
        pl.col("rxn_id").replace_strict(kr_convert)
    )
    df.write_parquet(p)