In [1]:
import polars as pl
from pathlib import Path
import json
from tqdm import tqdm
from ergochemics.standardize import standardize_smiles, standardize_rxn
from src.schemas import (
    known_compounds_schema,
    known_reactions_schema,
    enzymes_schema
)
from functools import lru_cache
from rdkit import Chem

In [2]:
@lru_cache(maxsize=10000)
def std_smi(smi: str) -> str:
    return standardize_smiles(
        smiles=smi,
        do_canon_taut=True,
        neutralization_method="simple",
        quiet=True,
        max_tautomers=100,
    )

# TODO: Must use! Put in place once externalize this to enz-rxn-data repo
@lru_cache(maxsize=10000)
def std_rxn(smi: str) -> str:
    return standardize_rxn(
        smiles=smi,
        do_canon_taut=False, # Worried that canonicalizing tautomers could prevent reaction recapitulation
        neutralization_method="simple",
        quiet=True,
    )

In [3]:
# # Re-name mapping files
# map_dir = Path().cwd().parent / "artifacts/rxn_x_rule_mapping"
# for file in map_dir.glob("*.parquet"):
#     df = pl.read_parquet(file)
#     fn = file.name.replace("mapped_sprhea_240310_v3_mapped_no_subunits", "sprhea_v3_ns")
#     out = file.parent / fn
#     print(f"Writing {out}")
#     df.write_parquet(out)

In [4]:
known_dir = Path().cwd().parent / "artifacts/known"
with open("/home/stef/quest_data/bottle/data/sprhea/sprhea_240310_v3_mapped_no_subunits.json", "r") as f:
    sprhea = json.load(f)

In [5]:
# Create known compounds DataFrame

smi2name = {smi: name for entry in sprhea.values() for smi, name in entry['smi2name'].items()}
compounds = sorted(smi2name.items())
smiles, rxn_counts = zip(*compounds)
compounds_df = pl.DataFrame(
    {
        'id': range(len(compounds)),
        'smiles': smiles,
        'names': [smi2name.get(smi, '') for smi in smiles],
    },
)

compounds_df = compounds_df.with_columns(
    pl.col('smiles').map_elements(lambda smi: Chem.MolFromSmiles(smi).GetNumAtoms(), return_dtype=pl.Int32).alias('n_atoms'),
    pl.col('names').str.split(';').alias('names'),
    pl.col('id').cast(pl.String),
)

assert compounds_df.schema == pl.Schema(known_compounds_schema)

compounds_df.write_parquet(
    known_dir / "known_compounds.parquet",
)

compounds_df.head()

[09:18:43] Unusual charge on atom 0 number of radical electrons set to zero
[09:18:43] Unusual charge on atom 0 number of radical electrons set to zero


id,smiles,names,n_atoms
str,str,list[str],i32
"""0""","""*""","[""A""]",1
"""1""","""**""","[""RX""]",2
"""2""","""*C""","[""an alkane""]",2
"""3""","""*C#N""","[""a nitrile""]",3
"""4""","""*C(*)(N)C(=O)O""","[""2,2-dialkylglycine""]",7


In [6]:
# Collect enzyme data

enz_data = {}
for k, v in sprhea.items():
    for enz in v['enzymes']:
        enz['id'] = enz['uniprot_id']
        enz.pop('uniprot_id', None)
        enz_data[enz['id']] = enz

enz_data = list(enz_data.values())
enz_df = pl.DataFrame(data=enz_data, schema=enzymes_schema)
enz_df.write_parquet(
    known_dir / "known_enzymes.parquet",
)

enz_df.head()

id,sequence,existence,reviewed,ec,organism,name
str,str,enum,enum,str,str,str
"""A2RJM8""","""MIEELGLKVKTASKEAAKLSTAEKNTFLQK…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Lactococcus lactis subsp. crem…","""Gamma-glutamyl phosphate reduc…"
"""Q1B639""","""MSVHAPAAPDLRTEVHDAARRARVASRTLA…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Mycobacterium sp. (strain MCS)""","""Gamma-glutamyl phosphate reduc…"
"""A2SCE2""","""MNAPDATPVIALMDRLGSAARSASTAMAAA…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Methylibium petroleiphilum (st…","""Gamma-glutamyl phosphate reduc…"
"""A1TC11""","""MSVQAPSVPDLRQQVHDAARRARGAARALA…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Mycolicibacterium vanbaalenii …","""Gamma-glutamyl phosphate reduc…"
"""A4G8E9""","""MDIKQYMKEVGQRARKASRAMAKADTAAKN…","""Inferred from homology""","""reviewed""","""1.2.1.41""","""Herminiimonas arsenicoxydans""","""Gamma-glutamyl phosphate reduc…"


In [7]:
kr_data = []
for k, v in sprhea.items():
    kr_data.append(
        {
            'id': k,
            'smarts': v['smarts'],
            'enzymes': list(set(enz['id'] for enz in v['enzymes'])),
            'reverse': v['reverse'],
            'db_ids': [f"rhea:{id}" for id in v['rhea_ids']],
        }
    )

kr_df = pl.DataFrame(kr_data)
kr_df = kr_df.with_columns(
    pl.col('reverse').cast(pl.String)
)

assert kr_df.schema == known_reactions_schema

kr_df.write_parquet(
    known_dir / "known_reactions.parquet",
)

kr_df.head()

id,smarts,enzymes,reverse,db_ids
str,str,list[str],str,list[str]
"""0""","""NC(CCC=O)C(=O)O.NC(=O)c1ccc[n+…","[""C1EZ15"", ""A5VIE0"", … ""Q3Z6Z9""]","""1651""","[""rhea:19542""]"
"""1""","""CC(O)C(O)C(O)C(O)C(=O)O>>CC(O)…","[""B7UFQ9"", ""A0KDV2"", … ""B1LLK1""]","""1166""","[""rhea:23081"", ""rhea:22773"", ""rhea:12950""]"
"""3""","""CSCCC(=O)C(=O)O.O=CO>>CSCCC(=O…","[""Q8H185"", ""A7FLL2"", … ""A1JP10""]","""1126""","[""rhea:24506""]"
"""4""","""O.O.[Fe+3].[Fe+3]>>OO.[Fe+2].[…","[""A0R692"", ""Q5AEN1"", … ""B7UM08""]","""1923""","[""rhea:48714"", ""rhea:16583""]"
"""5""","""Nc1nc2c(ncn2C2OC(COP(=O)(O)OP(…","[""Q9WYJ4"", ""O67214""]","""15231""","[""rhea:59085""]"
