In [78]:
from collections import defaultdict

from oaklib import get_adapter
from semsql.sqla.relation_graph import EntailedEdge, RdfsLabelStatement
from sqlalchemy import alias

chebi = get_adapter("sqlite:obo:chebi")
session = chebi.session

Downloading chebi.db.gz: 0.00B [00:00, ?B/s]

In [80]:
from semsql.sqla.semsql import Statements
from sqlalchemy.orm import aliased

child_label = aliased(RdfsLabelStatement)
parent_label = aliased(RdfsLabelStatement)
child_smiles = aliased(Statements)
parent_smiles = aliased(Statements)

q = session.query(
    EntailedEdge.subject,
    EntailedEdge.object,
    EntailedEdge.subject.label("child"),
    EntailedEdge.object.label("parent"),
    child_label.value.label("child_label"),
    parent_label.value.label("parent_label"),
    child_smiles.value.label("child_smiles"),
    parent_smiles.value.label("parent_smiles"),
).join(
    child_label,
    EntailedEdge.subject == child_label.subject,
).join(
    parent_label,
    EntailedEdge.object == parent_label.subject,
).join(
    child_smiles,
    EntailedEdge.subject == child_smiles.subject,
).join(
    parent_smiles,
    EntailedEdge.object == parent_smiles.subject,
).filter(
    child_smiles.predicate == "obo:chebi/smiles",
).filter(
    parent_smiles.predicate == "obo:chebi/smiles",
).filter(
    EntailedEdge.predicate == "rdfs:subClassOf",
)
    


In [81]:
import pandas as pd

df = pd.read_sql(q.statement, session.bind)

In [82]:
df

Unnamed: 0,subject,object,child,parent,child_label,parent_label,child_smiles,parent_smiles
0,CHEBI:10,CHEBI:10,CHEBI:10,CHEBI:10,(+)-Atherospermoline,(+)-Atherospermoline,COc1cc2CCN(C)[C@H]3Cc4ccc(Oc5cc(C[C@@H]6N(C)CC...,COc1cc2CCN(C)[C@H]3Cc4ccc(Oc5cc(C[C@@H]6N(C)CC...
1,CHEBI:100,CHEBI:100,CHEBI:100,CHEBI:100,(-)-medicarpin,(-)-medicarpin,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21
2,CHEBI:100,CHEBI:16114,CHEBI:100,CHEBI:16114,(-)-medicarpin,medicarpin,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1
3,CHEBI:100,CHEBI:26377,CHEBI:100,CHEBI:26377,(-)-medicarpin,pterocarpans,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21,[*]c1c([*])c([*])c2c(OC([*])([*])C3([*])c4c([*...
4,CHEBI:10000,CHEBI:10000,CHEBI:10000,CHEBI:10000,Vismione D,Vismione D,CC(C)=CCC\C(C)=C\COc1cc(O)c2c(O)c3C(=O)CC(C)(O...,CC(C)=CCC\C(C)=C\COc1cc(O)c2c(O)c3C(=O)CC(C)(O...
...,...,...,...,...,...,...,...,...
396276,CHEBI:99997,CHEBI:99997,CHEBI:99997,CHEBI:99997,"N-[(2S,4aS,12aS)-2-[2-(cyclohexylmethylamino)-...","N-[(2S,4aS,12aS)-2-[2-(cyclohexylmethylamino)-...",CN1[C@H]2CC[C@H](O[C@@H]2COC3=C(C1=O)C=C(C=C3)...,CN1[C@H]2CC[C@H](O[C@@H]2COC3=C(C1=O)C=C(C=C3)...
396277,CHEBI:99998,CHEBI:37622,CHEBI:99998,CHEBI:37622,"N-[[(3S,9S,10R)-16-(dimethylamino)-12-[(2S)-1-...",carboxamide,C[C@H]1CCCCO[C@@H]([C@@H](CN(C(=O)C2=C(O1)C=CC...,[*]C(=O)N([*])[*]
396278,CHEBI:99998,CHEBI:99998,CHEBI:99998,CHEBI:99998,"N-[[(3S,9S,10R)-16-(dimethylamino)-12-[(2S)-1-...","N-[[(3S,9S,10R)-16-(dimethylamino)-12-[(2S)-1-...",C[C@H]1CCCCO[C@@H]([C@@H](CN(C(=O)C2=C(O1)C=CC...,C[C@H]1CCCCO[C@@H]([C@@H](CN(C(=O)C2=C(O1)C=CC...
396279,CHEBI:99999,CHEBI:37622,CHEBI:99999,CHEBI:37622,"N-[(5S,6S,9S)-5-methoxy-3,6,9-trimethyl-2-oxo-...",carboxamide,C[C@H]1CN[C@H](COC2=C(C=CC(=C2)NC(=O)C3=NC4=CC...,[*]C(=O)N([*])[*]


In [83]:
df.to_csv("smarts-analysis/chebi-ancestor-smiles.csv", index=False)

In [85]:
cls_id_to_smarts = {row["parent"]: row["parent_smiles"] for _, row in df.iterrows()}

In [84]:
from c3p.datamodel import Dataset

with open("../results/2025/benchmark/dataset.json", "r") as f:
    dataset = Dataset.model_validate_json(f.read())

In [93]:
from typing import Set
from c3p.datamodel import ChemicalClass, Result, Outcome, EvaluationResult, ResultSet
from rdkit import Chem

from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

def validate_single_class(cls: ChemicalClass, dataset: Dataset):
    if cls.id not in cls_id_to_smarts:
        return None
    smarts = cls_id_to_smarts[cls.id]
    smarts_pattern = Chem.MolFromSmarts(smarts)
    cls_lite = cls.lite_copy()
    results = []
    pattern = Chem.MolFromSmarts("{smarts}")
    true_positives = []
    false_positives = []
    true_negatives = []
    false_negatives = []
    for smiles in dataset.validation_examples:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            false_negatives.append(smiles)
            continue
        prediction =  mol.HasSubstructMatch(smarts_pattern)
        actual = smiles in cls.all_positive_examples
        if prediction and actual:
            true_positives.append(smiles)
        elif prediction and not actual:
            false_positives.append(smiles)
        elif not prediction and actual:
            false_negatives.append(smiles)
        else:
            true_negatives.append(smiles)
            
    def outcomes(smiles_list: Set[str]):
        return [Outcome(smiles=x) for x in smiles_list]
        
    result = Result(
                chemical_class=cls_lite,
                code="",
                num_true_positives=len(true_positives),
                num_false_positives=len(false_positives),
                num_false_negatives=len(false_negatives),
                num_true_negatives=len(true_negatives),
                true_positives=outcomes(true_positives),
                false_positives=outcomes(false_positives),
                sample_true_negatives=outcomes(true_negatives)[:10],
                sample_false_negatives=outcomes(false_negatives)[:10],
                best=True,
            )
    result.calculate()
    return result


In [94]:
result = validate_single_class(dataset.classes[0], dataset)

In [97]:
result.f1, result.recall, result.precision

(0.005571030640668524, 0.45652173913043476, 0.002802615774723075)

In [98]:
ers = []
for cls in dataset.classes:
    result = validate_single_class(cls, dataset)
    if not result:
        continue
    er = EvaluationResult(
                train_results=ResultSet(results=[]),
                test_result=result
            )
    ers.append(er)

In [99]:
len(ers)

342

In [100]:
from c3p.learn import safe_name
from pathlib import Path

output_dir = Path("smartifier")
output_dir.mkdir(exist_ok=True, parents=True)
for er in ers:
    sn = safe_name(er.test_result.chemical_class.name)
    with open(output_dir / f"{sn}.json", "w") as f:
        f.write(er.model_dump_json(indent=2))

In [None]:
TMPL = """
from rdkit import Chem

def {func_name}(smiles: str):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"
    pattern = Chem.MolFromSmarts("{smarts}")
    matches = mol.GetSubstructMatches(pattern)
    if len(matches) == 0:
        return False, f"No {smarts} match group found"
    return True, f"Found {len(matches)} {smarts} match groups"
"""

In [21]:
from rdkit import Chem


In [22]:
parents = list(df["parent_smiles"].unique())

In [23]:
query_map = {smiles: Chem.MolFromSmarts(smiles) for smiles in parents}

In [32]:
assert all(query_map.values())

In [24]:
leaf_node_q = session.query(
    RdfsSubclassOfStatement.subject.label("id"),
    child_label.value.label("label"),
    child_smiles.value.label("smiles"),
).join(
    child_label,
    RdfsSubclassOfStatement.subject == child_label.subject,
).join(
    child_smiles,
    RdfsSubclassOfStatement.subject == child_smiles.subject,
).filter(
    child_smiles.predicate == "obo:chebi/smiles",
)

In [25]:
leaf_node_df = pd.read_sql(leaf_node_q.statement, session.bind)

In [26]:
leaf_node_df

Unnamed: 0,id,label,smiles
0,CHEBI:10,(+)-Atherospermoline,COc1cc2CCN(C)[C@H]3Cc4ccc(Oc5cc(C[C@@H]6N(C)CC...
1,CHEBI:100,(-)-medicarpin,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21
2,CHEBI:100,(-)-medicarpin,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21
3,CHEBI:100,(-)-medicarpin,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21
4,CHEBI:10000,Vismione D,CC(C)=CCC\C(C)=C\COc1cc(O)c2c(O)c3C(=O)CC(C)(O...
...,...,...,...
348062,CHEBI:99997,"N-[(2S,4aS,12aS)-2-[2-(cyclohexylmethylamino)-...",CN1[C@H]2CC[C@H](O[C@@H]2COC3=C(C1=O)C=C(C=C3)...
348063,CHEBI:99998,"N-[[(3S,9S,10R)-16-(dimethylamino)-12-[(2S)-1-...",C[C@H]1CCCCO[C@@H]([C@@H](CN(C(=O)C2=C(O1)C=CC...
348064,CHEBI:99998,"N-[[(3S,9S,10R)-16-(dimethylamino)-12-[(2S)-1-...",C[C@H]1CCCCO[C@@H]([C@@H](CN(C(=O)C2=C(O1)C=CC...
348065,CHEBI:99999,"N-[(5S,6S,9S)-5-methoxy-3,6,9-trimethyl-2-oxo-...",C[C@H]1CN[C@H](COC2=C(C=CC(=C2)NC(=O)C3=NC4=CC...


In [28]:
children = list(leaf_node_df["smiles"].unique())
len(children)

185103

In [29]:
smiles_map = {smiles: Chem.MolFromSmiles(smiles) for smiles in children}

[11:04:04] Explicit valence for atom # 0 F, 5, is greater than permitted
[11:04:10] Conflicting single bond directions around double bond at index 20.
[11:04:10]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:04:12] Explicit valence for atom # 2 O, 3, is greater than permitted
[11:04:12] Explicit valence for atom # 0 Br, 5, is greater than permitted
[11:04:12] Explicit valence for atom # 22 N, 4, is greater than permitted
[11:04:13] Explicit valence for atom # 22 C, 6, is greater than permitted
[11:04:16] Explicit valence for atom # 0 C, 5, is greater than permitted
[11:04:16] Explicit valence for atom # 21 N, 4, is greater than permitted
[11:04:16] Explicit valence for atom # 22 N, 4, is greater than permitted
[11:04:16] Explicit valence for atom # 0 O, 3, is greater than permitted
[11:04:16] Explicit valence for atom # 17 N, 4, is greater than permitted
[11:04:16] Explicit valence for atom # 1 C, 6, is greater than permitted
[11:04:16] Explicit valence for

In [30]:
bad_smiles = {smiles: mol for smiles, mol in smiles_map.items() if mol is None}
len(bad_smiles)

113

In [31]:
smiles = {smiles: mol for smiles, mol in smiles_map.items() if mol is not None}

In [35]:
classifications = []
n = 0
for qs, q in query_map.items():
    n += 1
    if n % 50 == 0:
        print(f"Processed {n} queries of {len(query_map)} :: {qs}")
    for ss, s in smiles.items():
        if s.HasSubstructMatch(q):
            classifications.append((qs, ss))
            

Processed 50 queries of 4692 :: C12C(C3C(C(CC3)*)(C)CC1)CCC4C2(CCCC4)C
Processed 100 queries of 4692 :: N(N=C(*)*)=C(*)*
Processed 150 queries of 4692 :: CN(C)C(=O)NC1=CC([*])=C([*])C=C1
Processed 200 queries of 4692 :: Oc1ccccc1*
Processed 250 queries of 4692 :: [H][C@]1(OC(O)(C[C@H](O)[C@H]1O)C(O)=O)[C@H](O)[C@H](O)CO
Processed 300 queries of 4692 :: C(S([O-])(=O)=O)*
Processed 350 queries of 4692 :: [C@H]1(O[C@@H]([C@@H](O)[C@@H]([C@H]1O)O)CO)S/C(=N\OS(O)(=O)=O)/*
Processed 400 queries of 4692 :: [NH3+]C(C(NC(C([O-])=O)*)=O)*
Processed 450 queries of 4692 :: O1[C@@H]([C@@H]([C@@H]([C@H]([C@@H]1OC[C@@H](CO*)O*)O)O)O)CO
Processed 500 queries of 4692 :: CCCCCC(OO)\C=C\C=C/CCCCCCCC([O-])=O
Processed 550 queries of 4692 :: [*][C@@H]1[C@@H]([*])C=CC1=O
Processed 600 queries of 4692 :: CN1C(CNC2=CC=C(C=C2)C(=O)N[C@@H](CCC(O)=O)C(O)=O)CNC2=C1C(=O)NC(N)=N2
Processed 650 queries of 4692 :: CC(=O)CC(N)C(O)=O
Processed 700 queries of 4692 :: CSCCCCCC(NO)C(O)=O
Processed 750 queries of 4692 :: C

In [36]:
len(classifications)

10087350

In [37]:
len(classifications) / len(smiles)

54.529163738580465

In [38]:
!mkdir smarts-analysis

In [56]:
cs_df = pd.DataFrame(classifications, columns=['class', 'structure'])

In [60]:
cs_df.head(20)

Unnamed: 0,class,structure
0,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21
1,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,C12=C3C(=CC=C2OC(CC1)(C)C)[C@]4([C@@](O3)(C=5C...
2,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,COC1=CC2=C(C=C1)[C@@H]3COC4=C([C@@H]3O2)C=CC(=...
3,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,COc1ccc2c(O[C@@H]3c4ccc(OC)cc4OC[C@]23O)c1
4,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1
5,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,O1C2C(C3=C1C4=C(OC(C=C4)(C)C)C=C3)COC5=C2C=CC(...
6,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,O1C2C(C=3C1=CC=4OCOC4C3)COC5=C2C(O)=C(OC)C(O)=C5
7,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,O1[C@@]2([C@](C3=C1C=C(OC)C=C3)(COC4=C2C=CC(O)...
8,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,O1C2C(C=3C1=CC(OC)=C(O)C3)COC4=C2C=CC(O)=C4
9,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,O1C2C(C3=C1C=C(OC)C=C3)COC4=C2C=CC(O)=C4O


In [51]:
cs_df.tail(20)

Unnamed: 0,class,structure
10087330,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,P(OC[C@H]1O[C@@H](N2C3=C(C(=NC=N3)N)N=C2)[C@@H...
10087331,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,C(CCCCCCCCCCCCCCCCC1=CC=C(C=C1)O)(=O)OP(OC[C@H...
10087332,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,O(P(OC[C@H]1O[C@@H](N2C=3N=CN=C(N)C3N=C2)[C@@H...
10087333,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,CCCCC#CC1=NC2=C(C(=N1)NC)N=CN2C3C(C(C(O3)CO)O)O
10087334,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,CCCC(=O)NC1=NC=NC2=C1N=CN2C3C(C4C(O3)COP(=O)(O...
10087335,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,CCCC(=O)NC1=NC=NC2=C1N=CN2C3[C@@H]([C@H]4[C@H]...
10087336,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,COC1C(C(OC1N2C=NC3=C2N=CN=C3NC4CCCCC4)CO)O
10087337,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,C1=CC=C(C=C1)NC2=NC3=C(C(=N2)N)N=CN3C4C(C(C(O4...
10087338,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,C1CCC(C1)NC2=NC=NC3=C2N=CN3C4C(C(C(O4)CO)O)O
10087339,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,C1C[C@@H]([C@H](C1)O)NC2=NC=NC3=C2N=CN3[C@H]4[...


In [43]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Downloading pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-19.0.1


In [44]:
cs_df.to_parquet("smarts-analysis/classifications.parquet", compression='snappy')

In [42]:
cs_df.to_csv("smarts-analysis/classifications.csv", index=False)

In [54]:
from collections import defaultdict
struct2class = defaultdict(list)
for c, s in classifications:
    struct2class[s].append(c)

In [55]:
struct2class["COc1ccc2c(O[C@@H]3c4ccc(OC)cc4OC[C@]23O)c1"]

['COc1ccc2C3COc4cc(O)ccc4C3Oc2c1',
 '**',
 '*C(*)O',
 '[*]O[*]',
 'Oc1ccccc1*',
 'O*',
 '*O',
 'C[*]',
 '*CO',
 'O[*]',
 'OC[*]',
 'OCC(O)CO[*]',
 'CC(O)c1ccccc1',
 '*C(O)*',
 'COc1ccccc1',
 'C(O)*',
 'CC(O)CO',
 '[O]',
 'C1Oc2ccccc2CC1c1ccccc1',
 '[C]',
 'CCCO',
 'OC(C(O)c1ccccc1)c1ccccc1',
 '[CH2]',
 'C(*)O',
 'Oc1cccc(O)c1',
 'COc1ccc(C2COc3cc(O)ccc3C2)c(O)c1']

In [None]:
# do a left join to make a new df that is like df, 
# with cs_df added. Left join: df.child_smiles==cs_df.structure, parent_smiles=cs_df.class


In [61]:
results_df = df.merge(
    cs_df.rename(columns={'class': 'child_class'}),
    left_on='child_smiles',
    right_on='structure',
    how='left'
).merge(
    cs_df.rename(columns={'class': 'parent_class'}),
    left_on='parent_smiles',
    right_on='structure',
    how='left'
)

In [64]:
results_df = df.merge(
    cs_df,
    left_on=['child_smiles', 'parent_smiles'],
    right_on=['structure', 'class'],
    how='left'
)

In [65]:
results_df

Unnamed: 0,subject,object,child,parent,child_label,parent_label,child_smiles,parent_smiles,class,structure
0,CHEBI:100,CHEBI:16114,CHEBI:100,CHEBI:16114,(-)-medicarpin,medicarpin,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,[H][C@@]12COc3cc(O)ccc3[C@]1([H])Oc1cc(OC)ccc21
1,CHEBI:100005,CHEBI:35358,CHEBI:100005,CHEBI:35358,"N-[(1S,3S,4aS,9aR)-1-(hydroxymethyl)-3-[2-oxo-...",sulfonamide,COC1=CC(=CC=C1)S(=O)(=O)NC2=CC3=C(C=C2)O[C@@H]...,[*]S(=O)(=O)N([*])[*],,
2,CHEBI:100011,CHEBI:17792,CHEBI:100011,CHEBI:17792,"2-[(3R,6aR,8R,10aR)-1-[(4-fluorophenyl)-oxomet...",organohalogen compound,C1CCN(CC1)C(=O)C[C@H]2CC[C@@H]3[C@@H](O2)COC[C...,**,**,C1CCN(CC1)C(=O)C[C@H]2CC[C@@H]3[C@@H](O2)COC[C...
3,CHEBI:100012,CHEBI:140326,CHEBI:100012,CHEBI:140326,"N-[(2R,4aR,12aR)-2-[2-(cyclohexylmethylamino)-...",tertiary carboxamide,CN1[C@@H]2CC[C@@H](O[C@H]2COC3=C(C1=O)C=C(C=C3...,N(C(*)=O)(*)*,N(C(*)=O)(*)*,CN1[C@@H]2CC[C@@H](O[C@H]2COC3=C(C1=O)C=C(C=C3...
4,CHEBI:100012,CHEBI:140325,CHEBI:100012,CHEBI:140325,"N-[(2R,4aR,12aR)-2-[2-(cyclohexylmethylamino)-...",secondary carboxamide,CN1[C@@H]2CC[C@@H](O[C@H]2COC3=C(C1=O)C=C(C=C3...,N(C(*)=O)(*)[H],,
...,...,...,...,...,...,...,...,...,...,...
56775,CHEBI:99977,CHEBI:35358,CHEBI:99977,CHEBI:35358,"2-[(2S,4aS,12aS)-8-[(4-fluorophenyl)sulfonylam...",sulfonamide,CN1[C@H]2CC[C@H](O[C@@H]2COC3=C(C1=O)C=C(C=C3)...,[*]S(=O)(=O)N([*])[*],,
56776,CHEBI:99978,CHEBI:35358,CHEBI:99978,CHEBI:35358,"2-[(2S,4aS,12aS)-8-(ethylsulfonylamino)-5-meth...",sulfonamide,CCS(=O)(=O)NC1=CC2=C(C=C1)OC[C@@H]3[C@H](CC[C@...,[*]S(=O)(=O)N([*])[*],,
56777,CHEBI:99990,CHEBI:35358,CHEBI:99990,CHEBI:35358,"N-[(2R,3R)-4-[(2,5-difluorophenyl)methyl-methy...",sulfonamide,CC1=CC=C(C=C1)S(=O)(=O)N(C[C@@H](C)[C@H](CN(C)...,[*]S(=O)(=O)N([*])[*],[*]S(=O)(=O)N([*])[*],CC1=CC=C(C=C1)S(=O)(=O)N(C[C@@H](C)[C@H](CN(C)...
56778,CHEBI:99992,CHEBI:35358,CHEBI:99992,CHEBI:35358,"2-[(3S,6aR,8S,10aR)-1-(3-chlorophenyl)sulfonyl...",sulfonamide,C1CCN(CC1)C(=O)C[C@@H]2CC[C@@H]3[C@@H](O2)COC[...,[*]S(=O)(=O)N([*])[*],[*]S(=O)(=O)N([*])[*],C1CCN(CC1)C(=O)C[C@@H]2CC[C@@H]3[C@@H](O2)COC[...


In [72]:
missing_from_smarts = df[
    ~df.merge(
        cs_df,
        left_on=['child_smiles', 'parent_smiles'],
        right_on=['structure', 'class'],
        how='left'
    )['structure'].notna()
]

In [73]:
missing_from_smarts

Unnamed: 0,subject,object,child,parent,child_label,parent_label,child_smiles,parent_smiles
1,CHEBI:100005,CHEBI:35358,CHEBI:100005,CHEBI:35358,"N-[(1S,3S,4aS,9aR)-1-(hydroxymethyl)-3-[2-oxo-...",sulfonamide,COC1=CC(=CC=C1)S(=O)(=O)NC2=CC3=C(C=C2)O[C@@H]...,[*]S(=O)(=O)N([*])[*]
4,CHEBI:100012,CHEBI:140325,CHEBI:100012,CHEBI:140325,"N-[(2R,4aR,12aR)-2-[2-(cyclohexylmethylamino)-...",secondary carboxamide,CN1[C@@H]2CC[C@@H](O[C@H]2COC3=C(C1=O)C=C(C=C3...,N(C(*)=O)(*)[H]
10,CHEBI:100054,CHEBI:35358,CHEBI:100054,CHEBI:35358,"2-[(1S,3S,4aR,9aS)-1-(hydroxymethyl)-6-[(3-met...",sulfonamide,COC1=CC(=CC=C1)S(=O)(=O)NC2=CC3=C(C=C2)O[C@H]4...,[*]S(=O)(=O)N([*])[*]
15,CHEBI:100076,CHEBI:35358,CHEBI:100076,CHEBI:35358,"2-[(1R,3S,4aS,9aR)-1-(hydroxymethyl)-6-[(3-met...",sulfonamide,COC1=CC(=CC=C1)S(=O)(=O)NC2=CC3=C(C=C2)O[C@@H]...,[*]S(=O)(=O)N([*])[*]
16,CHEBI:100079,CHEBI:35358,CHEBI:100079,CHEBI:35358,"2-[(2S,4aR,12aR)-8-(ethylsulfonylamino)-5-meth...",sulfonamide,CCS(=O)(=O)NC1=CC2=C(C=C1)OC[C@H]3[C@@H](CC[C@...,[*]S(=O)(=O)N([*])[*]
...,...,...,...,...,...,...,...,...
56769,CHEBI:99969,CHEBI:35358,CHEBI:99969,CHEBI:35358,"2-[(2R,4aS,12aS)-8-(methanesulfonamido)-5-meth...",sulfonamide,CN1[C@H]2CC[C@@H](O[C@@H]2COC3=C(C1=O)C=C(C=C3...,[*]S(=O)(=O)N([*])[*]
56770,CHEBI:9997,CHEBI:51751,CHEBI:9997,CHEBI:51751,pristinamycin IIA,enamide,CC(C)[C@H]1OC(=O)C2=CCCN2C(=O)c2coc(CC(=O)C[C@...,[*]\C([*])=C(\[*])C(=O)N([*])[*]
56773,CHEBI:9997,CHEBI:140325,CHEBI:9997,CHEBI:140325,pristinamycin IIA,secondary carboxamide,CC(C)[C@H]1OC(=O)C2=CCCN2C(=O)c2coc(CC(=O)C[C@...,N(C(*)=O)(*)[H]
56775,CHEBI:99977,CHEBI:35358,CHEBI:99977,CHEBI:35358,"2-[(2S,4aS,12aS)-8-[(4-fluorophenyl)sulfonylam...",sulfonamide,CN1[C@H]2CC[C@H](O[C@@H]2COC3=C(C1=O)C=C(C=C3)...,[*]S(=O)(=O)N([*])[*]


In [76]:
# Pairs in SMARTS that don't exist in CHEBI
missing_from_chebi = cs_df[
    ~cs_df.merge(
        df,
        left_on=['structure', 'class'],
        right_on=['child_smiles', 'parent_smiles'],
        how='left'
    )['child_smiles'].notna()
]

  missing_from_chebi = cs_df[


In [77]:
missing_from_chebi

Unnamed: 0,class,structure
1,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,C12=C3C(=CC=C2OC(CC1)(C)C)[C@]4([C@@](O3)(C=5C...
2,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,COC1=CC2=C(C=C1)[C@@H]3COC4=C([C@@H]3O2)C=CC(=...
3,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,COc1ccc2c(O[C@@H]3c4ccc(OC)cc4OC[C@]23O)c1
4,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1
5,COc1ccc2C3COc4cc(O)ccc4C3Oc2c1,O1C2C(C3=C1C4=C(OC(C=C4)(C)C)C=C3)COC5=C2C=CC(...
...,...,...
10087345,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,C1=CC(=CC=C1COC[C@@H]2[C@H]([C@H]([C@@H](O2)N3...
10087346,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,C1=COC(=C1)CNC2=NC=NC3=C2N=CN3[C@H]4[C@@H]([C@...
10087347,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,C1=NC2=C(C(=N1)N)N=CN2C3[C@@H]([C@@H]([C@H](O3...
10087348,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@@...,C1=NC2=C(C(=N1)N)N=CN2[C@H]3C([C@H]([C@H](O3)C...


In [69]:
# Option 1: Use string concatenation with nunique()
all_pairs = pd.concat([chebi_pairs, smarts_pairs])
summary_df = all_pairs.groupby(['child_smiles', 'parent_smiles']).agg(
    source_count=('source', 'nunique'),
    sources=('source', set)
).assign(
    source=lambda x: np.where(x['source_count'] == 2, 'CHEBI/SMARTS',
           np.where(x['sources'].str.contains('CHEBI'), 'CHEBI', 'SMARTS'))
).reset_index()

# Option 2: Even simpler - use value_counts
summary_df = (all_pairs.groupby(['child_smiles', 'parent_smiles'])
             ['source'].value_counts()
             .unstack(fill_value=0)
             .assign(source=lambda x: np.where((x['CHEBI'] > 0) & (x['SMARTS'] > 0), 
                                             'CHEBI/SMARTS',
                                             np.where(x['CHEBI'] > 0, 'CHEBI', 'SMARTS')))
             .reset_index())

KeyboardInterrupt: 