In [32]:
from tqdm import tqdm
import polars as pl
import json

In [33]:
# Load the known-reactions table (the "new" dataset in this comparison).
# Later cells read its `enzymes` list column and its `smarts` column.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
krs = pl.read_parquet(
    "/home/stef/enz_rxn_data/data/processed/pathway/known_reactions.parquet"
)

In [34]:
# Load the older reaction dataset (the "old" side of this comparison) from JSON.
# Later cells treat it as a mapping whose values each carry an 'enzymes' list.
# JSON is UTF-8 by specification, so the encoding is pinned explicitly instead
# of depending on the platform's locale default.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
with open(
    "/home/stef/enz_rxn_data/data/raw/pathway/sprhea_240310_v3_mapped_no_subunits.json",
    "r",
    encoding="utf-8",
) as f:
    old = json.load(f)

In [35]:
# Old-dataset reaction entries that list at least one enzyme.
old_rxns_w_enz = [
    entry
    for entry in old.values()
    if len(entry['enzymes']) > 0
]

In [36]:
# New-dataset rows whose `enzymes` list holds at least one entry.
new_rxns_w_enz = krs.filter(
    pl.col("enzymes").list.len() > 0
)

In [37]:
# Tally the old dataset:
#   old_enz          - every UniProt id seen on any reaction
#   old_pairs        - every (reaction key, UniProt id) combination
#   old_proper_*     - the same two tallies, restricted to enzymes whose
#                      'existence' is protein-level or transcript-level evidence
old_enz, old_pairs = set(), set()
old_proper_enz, old_proper_pairs = set(), set()
for rxn_id, rxn_entry in tqdm(old.items()):
    for enz_rec in rxn_entry['enzymes']:
        uid = enz_rec['uniprot_id']
        pair = (rxn_id, uid)

        old_enz.add(uid)
        old_pairs.add(pair)

        # Anything below the two accepted evidence levels is not "proper".
        if enz_rec['existence'] not in ("Evidence at protein level", "Evidence at transcript level"):
            continue

        old_proper_enz.add(uid)
        old_proper_pairs.add(pair)

100%|██████████| 18954/18954 [00:01<00:00, 15718.02it/s]


In [38]:
# Load the known-enzymes table; a later cell maps its `id` column to its
# `existence` column to look up evidence levels for the new dataset.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
kes = pl.read_parquet(
    "/home/stef/enz_rxn_data/data/processed/pathway/known_enzymes.parquet"
)
kes.head()

id,sequence,existence,reviewed,ec,organism,name
str,str,enum,enum,str,str,str
"""A0A009IHW8""","""MSLEQKKGADIISKILQIQNSIGKTTSPST…","""Evidence at protein level""","""reviewed""","""3.2.2.-; 3.2.2.6""","""Acinetobacter baumannii (strai…","""2' cyclic ADP-D-ribose synthas…"
"""A0A059TC02""","""MRSVSGQVVCVTGAGGFIASWLVKILLEKG…","""Evidence at protein level""","""reviewed""","""1.2.1.44""","""Petunia hybrida (Petunia)""","""Cinnamoyl-CoA reductase 1 (Ph-…"
"""A0A061I403""","""MPMASVIAVAEPKWISVWGRFLWLTLLSMA…","""Evidence at protein level""","""reviewed""","""2.7.7.108; 3.1.4.-""","""Cricetulus griseus (Chinese ha…","""Protein adenylyltransferase FI…"
"""A0A067XGX8""","""MALTATATTRGGSALPNSCLQTPKFQSLQK…","""Evidence at transcript level""","""reviewed""","""2.5.1.54""","""Petunia hybrida (Petunia)""","""Phospho-2-dehydro-3-deoxyhepto…"
"""A0A067XH53""","""MALSTNSTTSSLLPKTPLVQQPLLKNASLP…","""Evidence at transcript level""","""reviewed""","""2.5.1.54""","""Petunia hybrida (Petunia)""","""Phospho-2-dehydro-3-deoxyhepto…"


In [39]:
# Preview only the enzymes backed by protein- or transcript-level evidence.
# The result is displayed, not stored.
kes.filter(
    pl.col("existence").is_in(
        ["Evidence at protein level", "Evidence at transcript level"]
    )
)

id,sequence,existence,reviewed,ec,organism,name
str,str,enum,enum,str,str,str
"""A0A009IHW8""","""MSLEQKKGADIISKILQIQNSIGKTTSPST…","""Evidence at protein level""","""reviewed""","""3.2.2.-; 3.2.2.6""","""Acinetobacter baumannii (strai…","""2' cyclic ADP-D-ribose synthas…"
"""A0A059TC02""","""MRSVSGQVVCVTGAGGFIASWLVKILLEKG…","""Evidence at protein level""","""reviewed""","""1.2.1.44""","""Petunia hybrida (Petunia)""","""Cinnamoyl-CoA reductase 1 (Ph-…"
"""A0A061I403""","""MPMASVIAVAEPKWISVWGRFLWLTLLSMA…","""Evidence at protein level""","""reviewed""","""2.7.7.108; 3.1.4.-""","""Cricetulus griseus (Chinese ha…","""Protein adenylyltransferase FI…"
"""A0A067XGX8""","""MALTATATTRGGSALPNSCLQTPKFQSLQK…","""Evidence at transcript level""","""reviewed""","""2.5.1.54""","""Petunia hybrida (Petunia)""","""Phospho-2-dehydro-3-deoxyhepto…"
"""A0A067XH53""","""MALSTNSTTSSLLPKTPLVQQPLLKNASLP…","""Evidence at transcript level""","""reviewed""","""2.5.1.54""","""Petunia hybrida (Petunia)""","""Phospho-2-dehydro-3-deoxyhepto…"
…,…,…,…,…,…,…
"""P80580""","""MKLYSFFNXRA""","""Evidence at protein level""","""reviewed""","""5.2.1.4""","""Klebsiella pneumoniae""","""Maleylpyruvate isomerase (EC 5…"
"""P83329""","""XXFENFLNANXAYVNLD""","""Evidence at protein level""","""reviewed""","""4.2.1.1""","""Streptococcus thermophilus""","""Putative carbonic anhydrase (E…"
"""P83855""","""GVYMEIGRCRXEAXRRRKEAV""","""Evidence at protein level""","""reviewed""","""4.6.1.1""","""Mus musculus (Mouse)""","""Putative sperm adenylate cycla…"
"""P85957""","""AMAGTATVQGQGTR""","""Evidence at protein level""","""reviewed""","""1.11.1.7""","""Pseudotsuga menziesii (Douglas…","""Peroxidase (EC 1.11.1.7)"""


In [40]:
# Tally the new dataset the same way the old one was tallied:
#   enzymes / pairs               - everything seen
#   proper_enzymes / proper_pairs - only enzymes whose evidence level (looked up
#                                   in the known-enzymes table) is protein- or
#                                   transcript-level
# Enzyme ids absent from the known-enzymes table fall back to "Unknown" and are
# therefore never counted as proper.
# NOTE(review): pairs here are keyed by the reaction's `smarts` string, whereas
# the old-dataset tally keys pairs by the JSON dict key — confirm the two pair
# counts are meant to be compared like-for-like.
enzymes, proper_enzymes = set(), set()
pairs, proper_pairs = set(), set()
evidence_dict = dict(zip(kes["id"], kes["existence"]))
for rxn_row in tqdm(krs.iter_rows(named=True), total=len(krs)):
    smarts = rxn_row['smarts']
    for enz_id in rxn_row['enzymes']:
        pair = (smarts, enz_id)
        enzymes.add(enz_id)
        pairs.add(pair)

        level = evidence_dict.get(enz_id, "Unknown")
        if level in ("Evidence at protein level", "Evidence at transcript level"):
            proper_enzymes.add(enz_id)
            proper_pairs.add(pair)

len(pairs), len(proper_pairs)

100%|██████████| 28142/28142 [00:00<00:00, 52638.85it/s]


(518578, 164968)

In [41]:
# Headline reaction counts, old vs new: first every reaction in each dataset,
# then only the reactions that list at least one enzyme.
# The old "total" counts every entry in `old`; the enzyme-bearing subset
# (`old_rxns_w_enz`) belongs only under the second heading.
print("total reactions")
print(f"old: {len(old)} vs new: {krs.shape[0]}")
print("\nreactions with enzymes")
print(f"old: {len(old_rxns_w_enz)} vs new: {new_rxns_w_enz.shape[0]}")

total reacitions
old: 18954 vs new: 28142

reactions with enzymes
old: 18954 vs new: 21034


In [42]:
# Old-vs-new sizes of each tally set, printed as a heading line followed by a
# counts line. Headings after the first carry a leading newline so the
# sections are separated by a blank line.
count_sections = (
    ("number of enzymes", old_enz, enzymes),
    ("\nnumber of proper enzymes", old_proper_enz, proper_enzymes),
    ("\nnumber of pairs", old_pairs, pairs),
    ("\nnumber of proper pairs", old_proper_pairs, proper_pairs),
)
for heading, old_set, new_set in count_sections:
    print(heading)
    print(f"old: {len(old_set)} vs new: {len(new_set)}")

number of enzymes
old: 180013 vs new: 188937

number of proper enzymes
old: 36531 vs new: 41238

number of pairs
old: 485570 vs new: 518578

number of proper pairs
old: 140234 vs new: 164968
