In [75]:
import polars as pl
from pathlib import Path
import json
from tqdm import tqdm
from ergochemics.standardize import standardize_smiles, standardize_rxn
from functools import lru_cache
from rdkit import Chem

In [76]:
@lru_cache(maxsize=10000)
def std_smi(smi: str) -> str:
    return standardize_smiles(
        smiles=smi,
        do_canon_taut=True,
        neutralization_method="simple",
        quiet=True,
        max_tautomers=100,
    )

@lru_cache(maxsize=10000)
def std_rxn(smi: str) -> str:
    return standardize_rxn(
        smiles=smi,
        do_canon_taut=False, # Worried that canonicalizing tautomers could prevent reaction recapitulation
        neutralization_method="simple",
        quiet=True,
    )

In [None]:
# # Re-name mapping files
# map_dir = Path().cwd().parent / "artifacts/rxn_x_rule_mapping"
# for file in map_dir.glob("*.parquet"):
#     df = pl.read_parquet(file)
#     fn = file.name.replace("mapped_sprhea_240310_v3_mapped_no_subunits", "sprhea_v3_ns")
#     out = file.parent / fn
#     print(f"Writing {out}")
#     df.write_parquet(out)

In [51]:
known_dir = Path().cwd().parent / "artifacts/known"
with open("/home/stef/quest_data/bottle/data/sprhea/sprhea_240310_v3_mapped_no_subunits.json", "r") as f:
    sprhea = json.load(f)

In [73]:
k = list(sprhea.keys())[0]
sprhea[k].keys()

dict_keys(['smarts', 'min_rule', 'imt_rules', 'smi2name', 'enzymes', 'reaction_center', 'reverse', 'rhea_ids'])

In [57]:
# Create known compounds DataFrame

smi2name = {smi: name for entry in sprhea.values() for smi, name in entry['smi2name'].items()}
compounds = sorted(smi2name.items())
smiles, rxn_counts = zip(*compounds)
compounds_df = pl.DataFrame(
    {
        'id': range(len(compounds)),
        'smiles': smiles,
        'names': [smi2name.get(smi, '') for smi in smiles],
    }
)

compounds_df = compounds_df.with_columns(
    pl.col('smiles').map_elements(lambda smi: Chem.MolFromSmiles(smi).GetNumAtoms(), return_dtype=pl.Int32).alias('n_atoms')
)

compounds_df = compounds_df.with_columns(
    pl.col('names').str.split(';').alias('names'),
)

compounds_df.write_parquet(
    known_dir / "known_compounds.parquet",
)

compounds_df.head()

[15:17:29] Unusual charge on atom 0 number of radical electrons set to zero
[15:17:29] Unusual charge on atom 0 number of radical electrons set to zero


id,smiles,names,n_atoms
i64,str,list[str],i32
0,"""*""","[""A""]",1
1,"""**""","[""RX""]",2
2,"""*C""","[""an alkane""]",2
3,"""*C#N""","[""a nitrile""]",3
4,"""*C(*)(N)C(=O)O""","[""2,2-dialkylglycine""]",7


In [82]:
# Collect enzyme data

enz_data = {}
for k, v in sprhea.items():
    for enz in v['enzymes']:
        enz_data[enz['uniprot_id']] = enz

enz_data = list(enz_data.values())
enz_df = pl.DataFrame(enz_data)
enz_df.head()

uniprot_id,sequence,existence,reviewed,ec,organism,name
str,str,str,str,str,str,str
"""A2RJM8""","""MIEELGLKVKTASKEAAKLSTAEKNTFLQK…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Lactococcus lactis subsp. crem…","""Gamma-glutamyl phosphate reduc…"
"""Q1B639""","""MSVHAPAAPDLRTEVHDAARRARVASRTLA…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Mycobacterium sp. (strain MCS)""","""Gamma-glutamyl phosphate reduc…"
"""A2SCE2""","""MNAPDATPVIALMDRLGSAARSASTAMAAA…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Methylibium petroleiphilum (st…","""Gamma-glutamyl phosphate reduc…"
"""A1TC11""","""MSVQAPSVPDLRQQVHDAARRARGAARALA…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Mycolicibacterium vanbaalenii …","""Gamma-glutamyl phosphate reduc…"
"""A4G8E9""","""MDIKQYMKEVGQRARKASRAMAKADTAAKN…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Herminiimonas arsenicoxydans""","""Gamma-glutamyl phosphate reduc…"


In [83]:
existence_enum = pl.Enum(enz_df['existence'].unique())
reviewed_enum = pl.Enum(enz_df['reviewed'].unique())


In [84]:
enz_df['organism'].unique()

organism
str
"""Buchnera aphidicola subsp. Pem…"
"""Gnetum parvifolium (Small-leav…"
"""Marinomonas sp. (strain MWYL1)"""
"""Paraburkholderia xenovorans (s…"
"""Cutaneotrichosporon moniliifor…"
…
"""Anaeromyxobacter sp. (strain F…"
"""Simmondsia chinensis (Jojoba) …"
"""Escherichia coli O157:H7"""
"""Chrysanthemum virus B (CVB)"""


In [85]:
enz_df['name'].unique()

name
str
"""Ceramide kinase (OsCERK) (EC 2…"
"""Lipoyl synthase, mitochondrial…"
"""Actin-42 (EC 3.6.4.-)"""
"""Probable GMP synthase [glutami…"
"""1-acyl-sn-glycerol-3-phosphate…"
…
"""UTP--glucose-1-phosphate uridy…"
"""Type II methyltransferase M.Mj…"
"""Putative NADPH dehydrogenase C…"
"""10-deoxymethynolide desosaminy…"


In [86]:
enz_df.shape

(180013, 7)

In [88]:
# Use categoricals and enums

enz_df = enz_df.with_columns(
    pl.col('existence').cast(existence_enum),
    pl.col('reviewed').cast(reviewed_enum),
    pl.col('organism').cast(pl.Categorical),
    pl.col('name').cast(pl.Categorical),
)
enz_df = enz_df.rename({'uniprot_id': 'id'})

enz_df.write_parquet(
    known_dir / "known_enzymes.parquet",
)

enz_df.head()

id,sequence,existence,reviewed,ec,organism,name
str,str,enum,enum,str,cat,cat
"""A2RJM8""","""MIEELGLKVKTASKEAAKLSTAEKNTFLQK…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Lactococcus lactis subsp. crem…","""Gamma-glutamyl phosphate reduc…"
"""Q1B639""","""MSVHAPAAPDLRTEVHDAARRARVASRTLA…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Mycobacterium sp. (strain MCS)""","""Gamma-glutamyl phosphate reduc…"
"""A2SCE2""","""MNAPDATPVIALMDRLGSAARSASTAMAAA…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Methylibium petroleiphilum (st…","""Gamma-glutamyl phosphate reduc…"
"""A1TC11""","""MSVQAPSVPDLRQQVHDAARRARGAARALA…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Mycolicibacterium vanbaalenii …","""Gamma-glutamyl phosphate reduc…"
"""A4G8E9""","""MDIKQYMKEVGQRARKASRAMAKADTAAKN…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Herminiimonas arsenicoxydans""","""Gamma-glutamyl phosphate reduc…"


In [92]:
kr_data = []
for k, v in sprhea.items():
    kr_data.append(
        {
            'id': k,
            'smarts': v['smarts'],
            'enzymes': list(set(enz['uniprot_id'] for enz in v['enzymes'])),
            'reverse': v['reverse'],
            'db_ids': [f"rhea:{id}" for id in v['rhea_ids']],
        }
    )

kr_df = pl.DataFrame(kr_data)
kr_df = kr_df.with_columns(
    pl.col('reverse').cast(pl.String)
)
kr_df.write_parquet(
    known_dir / "known_reactions.parquet",
)

kr_df.head()

id,smarts,enzymes,reverse,db_ids
str,str,list[str],str,list[str]
"""0""","""NC(CCC=O)C(=O)O.NC(=O)c1ccc[n+…","[""B1ZFJ0"", ""B1XDY6"", … ""C0QLF1""]","""1651""","[""rhea:19542""]"
"""1""","""CC(O)C(O)C(O)C(O)C(=O)O>>CC(O)…","[""A8MA91"", ""A9BQY2"", … ""B5BCQ4""]","""1166""","[""rhea:23081"", ""rhea:22773"", ""rhea:12950""]"
"""3""","""CSCCC(=O)C(=O)O.O=CO>>CSCCC(=O…","[""A9VCM7"", ""A7E4S9"", … ""Q8YTJ3""]","""1126""","[""rhea:24506""]"
"""4""","""O.O.[Fe+3].[Fe+3]>>OO.[Fe+2].[…","[""A8GBU5"", ""P00431"", … ""B1LMA4""]","""1923""","[""rhea:48714"", ""rhea:16583""]"
"""5""","""Nc1nc2c(ncn2C2OC(COP(=O)(O)OP(…","[""O67214"", ""Q9WYJ4""]","""15231""","[""rhea:59085""]"
