# Learning Chemical Classification Programs

This uses LLMs to learn programs for classifying chemical structures (SMILES strings) into chemical classes or groupings

In [3]:
from pydantic import BaseModel

from chebi_llm_classifier.datamodel import ChemicalStructure, ChemicalClass, Dataset

In [4]:
dataset = Dataset.parse_file("inputs/dataset.json")
filtered_classes = dataset.classes

## Utils

In [6]:
from chebi_llm_classifier.evaluation import split_to_training_test

In [7]:
a, b = split_to_training_test(filtered_classes, n=3)

In [8]:
a[0].instances

[ChemicalStructure(name='all-trans-3,4-didehydroretinol', smiles='C1(C)(C)C(\\C=C\\C(=C\\C=C\\C(=C\\CO)\\C)\\C)=C(C)C=CC1'),
 ChemicalStructure(name='all-trans-retinol', smiles='C\\C(=C/CO)\\C=C\\C=C(/C)\\C=C\\C1=C(C)CCCC1(C)C'),
 ChemicalStructure(name='all-trans-retinyl ester', smiles='CC(\\C=C\\C=C(C)\\C=C\\C1=C(C)CCCC1(C)C)=C/COC([*])=O'),
 ChemicalStructure(name='all-trans-3,4-didehydroretinoic acid', smiles='C1(C)(C)CC=CC(=C1\\C=C\\C(=C\\C=C\\C(=C\\C(=O)O)\\C)\\C)C'),
 ChemicalStructure(name='all-trans-retinoic acid', smiles='CC(\\C=C\\C1=C(C)CCCC1(C)C)=C/C=C/C(C)=C/C(O)=O')]

In [9]:
b[0].instances

[ChemicalStructure(name='all-trans-retinal', smiles='[H]C(=O)\\C=C(/C)\\C=C\\C=C(/C)\\C=C\\C1=C(C)CCCC1(C)C')]

## Run an individual experiment

In [37]:
from chebi_llm_classifier.datamodel import Config

# claude-sonnet seems best so far
config = Config(llm_model_name="lbl/claude-sonnet", max_attempts=5, accuracy_threshold=0.95)

In [38]:
len(filtered_classes)

615

In [39]:
training_set, test_set = split_to_training_test(filtered_classes, n=200, start=0)
len(training_set)


200

In [40]:
from chebi_llm_classifier.generator import generate_and_test_classifier

results = []
for test_cls in training_set:
    print("##", test_cls.name)
    for result in generate_and_test_classifier(test_cls, config=config):
        print(result.attempt, result.num_true_positives, result.num_true_negatives, result.num_false_positives, result.f1)
        results.append(result)
        result.calculate()

## vitamin A
0 0 19 0 0
1 0 19 0 0
2 2 19 0 0.5714285714285715
3 3 19 0 0.7499999999999999
4 3 19 0 0.7499999999999999
FAILED: vitamin A err=
Attempt failed: F1 score of 0.749999999
## pyrrolobenzodiazepine
0 None None None None
1 None None None None
2 0 20 0 0
3 0 20 0 0
4 0 20 0 0
FAILED: pyrrolobenzodiazepine err=
Attempt failed: F1 score of 0 is too lo
## steroid aldehyde
0 0 20 0 0
1 1 20 0 0.04878048780487806
2 0 20 0 0
3 0 20 0 0
4 0 20 0 0
FAILED: steroid aldehyde err=
Attempt failed: F1 score of 0 is too lo
## C27-steroid
0 15 18 2 0.8108108108108107
1 1 20 0 0.09523809523809523
2 18 18 2 0.9
3 18 18 2 0.9
4 18 14 6 0.8181818181818182
FAILED: C27-steroid err=
Attempt failed: F1 score of 0.818181818
## C24-steroid
0 None None None None
1 0 16 0 0
2 0 16 0 0
3 0 16 0 0
4 0 16 0 0
FAILED: C24-steroid err=
Attempt failed: F1 score of 0 is too lo
## C19-steroid
0 0 20 0 0
1 5 18 2 0.7692307692307692
2 None None None None
3 5 18 2 0.7692307692307692
4 0 20 0 0
FAILED: C19-steroid er

KeyboardInterrupt: 

In [41]:
print(len(results))

379


In [42]:
def calculate_best():
    best_by_cls = {}
    for r in results:
        cid = r.chemical_class.id
        if r.f1 and (cid not in best_by_cls or r.f1 > best_by_cls[cid]):
            best_by_cls[cid] = r.f1
    for r in results:
        r.best = False
        cid = r.chemical_class.id
        if cid in best_by_cls and best_by_cls[cid] == r.f1:
            r.best = True
            
calculate_best()

In [43]:
import pandas as pd


def calc_eval_results(results, min_f1=0):
    eval_results = []
    for result in results:
        if result.f1 < min_f1:
            continue
        # print(result.f1)
        train_cls = result.chemical_class
        code = result.code
        [test_cls] = [c for c in test_set if c.id == train_cls.id]
        for eval_result in generate_and_test_classifier(test_cls, suppress_llm=True, prog=code, config=config):
            eval_results.append(eval_result)
            eval_result.calculate()
            # print(eval_result.f1)
    return pd.DataFrame([r.model_dump() for r in eval_results])



    

In [44]:
eval_df = calc_eval_results([r for r in results if r.best]) 

In [45]:
from pathlib import Path

results_dir = Path("latest")
results_dir.mkdir(parents=True, exist_ok=True)
with open(results_dir / "results.json", "w") as f:
    import json
    results_objs = [r.model_dump() for r in results]
    f.write(json.dumps(results_objs, indent=2))

In [46]:
def results_as_df(results):
    rows = []
    for r in results:
        r.calculate()
        row = r.model_dump()
        rows.append(row)
    return pd.DataFrame(rows)
        

In [47]:
#eval_df = results_as_df(eval_results)

In [48]:
eval_df.to_csv( results_dir / "eval_results.csv")

In [49]:
from chebi_llm_classifier.stats import calculate_metrics_pandas


def df_stats(df):
    return calculate_metrics_pandas(df.aggregate({"num_true_positives": "sum", "num_true_negatives": "sum", "num_false_positives": "sum",  "num_false_negatives": "sum"}))

In [50]:
df_stats(calc_eval_results([r for r in results if r.best]))

total                        1596.0000
positives                     297.0000
negatives                    1299.0000
actual_positives              268.0000
actual_negatives             1328.0000
accuracy                        0.9530
precision                       0.8249
recall                          0.9142
specificity                     0.9608
f1_score                        0.8673
false_positive_rate             0.0392
negative_predictive_value       0.9823
balanced_accuracy               0.9375
dtype: float64

In [51]:
df_stats(calc_eval_results(results, min_f1=1.0))

total                        1013.0000
positives                     138.0000
negatives                     875.0000
actual_positives              134.0000
actual_negatives              879.0000
accuracy                        0.9882
precision                       0.9420
recall                          0.9701
specificity                     0.9909
f1_score                        0.9559
false_positive_rate             0.0091
negative_predictive_value       0.9954
balanced_accuracy               0.9805
dtype: float64

In [52]:
results_df = results_as_df(results)

In [53]:
results_df.query('best == True')


Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
3,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",[],[(CC[C@@H](C)C(=O)O[C@@H]1[C@H](OC(C)=O)C2=C(O...,[(C1(C)(C)C(\C=C\C(=C\C=C\C(=C\CO)\C)\C)=C(C)C...,3,True,True,,,3,0,19,2,1.0,0.600,0.75000
4,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",[],[(CC[C@@H](C)C(=O)O[C@@H]1[C@H](OC(C)=O)C2=C(O...,[(C1(C)(C)C(\C=C\C(=C\C=C\C(=C\CO)\C)\C)=C(C)C...,4,True,True,,,3,0,19,2,1.0,0.600,0.75000
11,"{'id': 'CHEBI:131565', 'name': 'steroid aldehy...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(CC(=C)[C@@H]1CC[C@@]2(CC[C@]3(C)[C@H](CC[C@@...,[],[(C[C@@H]([C@H]1CC[C@H]2[C@@H]3C[C@H]4O[C@]44[...,[(C[C@@H]([C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)C=C[...,1,True,True,,,1,0,20,39,1.0,0.025,0.04878
17,"{'id': 'CHEBI:131619', 'name': 'C27-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[([H][C@]12C[C@@]3([H])[C@]4([H])CC=C5C[C@@H](...,[([H][C@@]12C[C@@H](C)[C@](O)(C(=O)COC(=O)C(C)...,[(C[C@H]([C@@H]1OC(=O)C(C)=C(C)[C@@H]1O)[C@@]1...,[([H][C@@]1(CC[C@@]2([H])[C@]3([H])CC(C)=C4C(C...,2,True,True,,,18,2,18,2,0.9,0.900,0.90000
18,"{'id': 'CHEBI:131619', 'name': 'C27-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[([H][C@]12C[C@@]3([H])[C@]4([H])CC=C5C[C@@H](...,[([H][C@@]12C[C@@H](C)[C@](O)(C(=O)COC(=O)C(C)...,[(C[C@H]([C@@H]1OC(=O)C(C)=C(C)[C@@H]1O)[C@@]1...,[([H][C@@]1(CC[C@@]2([H])[C@]3([H])CC(C)=C4C(C...,3,True,True,,,18,2,18,2,0.9,0.900,0.90000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,"{'id': 'CHEBI:197290', 'name': 'tetradecanedio...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCCCCC(O)CO, Tetradecanediol with OH ...",[],"[(CCCCCCCCCCCCC(C)O, Must have exactly 2 hydro...",[],0,True,True,,,6,0,20,0,1.0,1.000,1.00000
374,"{'id': 'CHEBI:197387', 'name': 'hexadecanol', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCCCCC(O)CCC, Hexadecan-4-ol), (CCCCC...",[],"[(CCCCCCCCCCCCCCCCCCCCCCCCCO, Contains 25 carb...",[],2,True,True,,,7,0,20,0,1.0,1.000,1.00000
375,"{'id': 'CHEBI:197399', 'name': 'heptadecanol',...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCC(O)CCCCCCC, Molecule is a heptadec...",[],"[(CCCCCCCCCCCCC(O)CCCCCCCCCC, Contains 23 carb...",[],0,True,True,[21:27:45] Initializing MetalDisconnector\n[21...,,8,0,20,0,1.0,1.000,1.00000
376,"{'id': 'CHEBI:197457', 'name': 'octadecanol', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCCCCC(O)CCCCC, Octadecan-6-ol), (C(C...",[],"[(CCCCCCCCCCCCCC(C)O, Incorrect molecular form...",[],0,True,True,,,8,0,20,0,1.0,1.000,1.00000


In [54]:
results_df.query('best == True').aggregate({"precision": "mean", "recall": "mean", "f1": "mean"})


precision    0.943424
recall       0.939089
f1           0.929124
dtype: float64

In [55]:
results_df.query('best == True and precision == 1.0')

Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
3,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",[],[(CC[C@@H](C)C(=O)O[C@@H]1[C@H](OC(C)=O)C2=C(O...,[(C1(C)(C)C(\C=C\C(=C\C=C\C(=C\CO)\C)\C)=C(C)C...,3,True,True,,,3,0,19,2,1.0,0.600,0.75000
4,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",[],[(CC[C@@H](C)C(=O)O[C@@H]1[C@H](OC(C)=O)C2=C(O...,[(C1(C)(C)C(\C=C\C(=C\C=C\C(=C\CO)\C)\C)=C(C)C...,4,True,True,,,3,0,19,2,1.0,0.600,0.75000
11,"{'id': 'CHEBI:131565', 'name': 'steroid aldehy...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[(CC(=C)[C@@H]1CC[C@@]2(CC[C@]3(C)[C@H](CC[C@@...,[],[(C[C@@H]([C@H]1CC[C@H]2[C@@H]3C[C@H]4O[C@]44[...,[(C[C@@H]([C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)C=C[...,1,True,True,,,1,0,20,39,1.0,0.025,0.04878
30,"{'id': 'CHEBI:131697', 'name': 'pyrimidotriazi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CN1C2=C(C(=O)N(C1=O)C)N=NC=N2, Contains orth...",[],[(COC(=O)C[C@@H]1C[C@H]2[C@@H]([C@@H](O1)CO)OC...,[],0,True,True,,,6,0,20,0,1.0,1.000,1.00000
41,"{'id': 'CHEBI:131862', 'name': 'HPODE(1-)', 'd...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCC\C=C/C\C=C/[C@@H](CCCCCCC([O-])=O)OO, M...",[],[(C(CCCCCCC/C=C\[C@H](/C=C\C/C=C\CC)OO)(=O)[O-...,[],0,True,True,,,8,0,20,0,1.0,1.000,1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,"{'id': 'CHEBI:197290', 'name': 'tetradecanedio...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCCCCC(O)CO, Tetradecanediol with OH ...",[],"[(CCCCCCCCCCCCC(C)O, Must have exactly 2 hydro...",[],0,True,True,,,6,0,20,0,1.0,1.000,1.00000
374,"{'id': 'CHEBI:197387', 'name': 'hexadecanol', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCCCCC(O)CCC, Hexadecan-4-ol), (CCCCC...",[],"[(CCCCCCCCCCCCCCCCCCCCCCCCCO, Contains 25 carb...",[],2,True,True,,,7,0,20,0,1.0,1.000,1.00000
375,"{'id': 'CHEBI:197399', 'name': 'heptadecanol',...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCC(O)CCCCCCC, Molecule is a heptadec...",[],"[(CCCCCCCCCCCCC(O)CCCCCCCCCC, Contains 23 carb...",[],0,True,True,[21:27:45] Initializing MetalDisconnector\n[21...,,8,0,20,0,1.0,1.000,1.00000
376,"{'id': 'CHEBI:197457', 'name': 'octadecanol', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCCCCC(O)CCCCC, Octadecan-6-ol), (C(C...",[],"[(CCCCCCCCCCCCCC(C)O, Incorrect molecular form...",[],0,True,True,,,8,0,20,0,1.0,1.000,1.00000


In [56]:
results_df.query('success == True')

Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
0,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(CC[C@@H](C)C(=O)O[C@@H]1[C@H](OC(C)=O)C2=C(O...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",0,True,False,,,0,0,19,5,0.0,0.0,0.000000
1,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],[(CC[C@@H](C)C(=O)O[C@@H]1[C@H](OC(C)=O)C2=C(O...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",1,True,False,,,0,0,19,5,0.0,0.0,0.000000
2,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",[],[(CC[C@@H](C)C(=O)O[C@@H]1[C@H](OC(C)=O)C2=C(O...,[(CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/COC([...,2,True,False,,,2,0,19,3,1.0,0.4,0.571429
3,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",[],[(CC[C@@H](C)C(=O)O[C@@H]1[C@H](OC(C)=O)C2=C(O...,[(C1(C)(C)C(\C=C\C(=C\C=C\C(=C\CO)\C)\C)=C(C)C...,3,True,True,,,3,0,19,2,1.0,0.6,0.750000
4,"{'id': 'CHEBI:12777', 'name': 'vitamin A', 'de...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(C\C(=C/CO)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C,...",[],[(CC[C@@H](C)C(=O)O[C@@H]1[C@H](OC(C)=O)C2=C(O...,[(C1(C)(C)C(\C=C\C(=C\C=C\C(=C\CO)\C)\C)=C(C)C...,4,True,True,,,3,0,19,2,1.0,0.6,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,"{'id': 'CHEBI:197387', 'name': 'hexadecanol', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCCCCC(O)CCC, Hexadecan-4-ol), (CCCCC...",[],"[(CCCCCCCCCCCCCCCCCCCCCCCCCO, Contains 25 carb...",[],2,True,True,,,7,0,20,0,1.0,1.0,1.000000
375,"{'id': 'CHEBI:197399', 'name': 'heptadecanol',...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCC(O)CCCCCCC, Molecule is a heptadec...",[],"[(CCCCCCCCCCCCC(O)CCCCCCCCCC, Contains 23 carb...",[],0,True,True,[21:27:45] Initializing MetalDisconnector\n[21...,,8,0,20,0,1.0,1.0,1.000000
376,"{'id': 'CHEBI:197457', 'name': 'octadecanol', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,"[(CCCCCCCCCCCCC(O)CCCCC, Octadecan-6-ol), (C(C...",[],"[(CCCCCCCCCCCCCC(C)O, Incorrect molecular form...",[],0,True,True,,,8,0,20,0,1.0,1.0,1.000000
377,"{'id': 'CHEBI:197468', 'name': 'nonadecanol', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,[],[],"[(CCCCCCCCCCCCCCCC(O)CCCCCCCCCC, Incorrect mol...","[(CCCCCCCCCC(O)CCCCCCCCC, Contains 0 OH groups...",0,True,False,,,0,0,20,8,0.0,0.0,0.000000


In [57]:
results_df.query('success == False')

Unnamed: 0,chemical_class,config,code,true_positives,false_positives,true_negatives,false_negatives,attempt,success,best,error,stdout,num_true_positives,num_false_positives,num_true_negatives,num_false_negatives,precision,recall,f1
5,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,cannot import name 'GetSubstructMatches' from ...,,0,0,0,0,0.0,0.0,0.0
6,"{'id': 'CHEBI:131437', 'name': 'pyrrolobenzodi...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,1,False,False,Python argument types in\n Mol.HasSubstruct...,,0,0,0,0,0.0,0.0,0.0
20,"{'id': 'CHEBI:131620', 'name': 'C24-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,module 'rdkit.Chem.rdMolDescriptors' has no at...,,0,0,0,0,0.0,0.0,0.0
27,"{'id': 'CHEBI:131621', 'name': 'C19-steroid', ...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,2,False,False,module 'rdkit.Chem.rdqueries' has no attribute...,,0,0,0,0,0.0,0.0,0.0
31,"{'id': 'CHEBI:131858', 'name': 'HETE anion', '...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,module 'rdkit.Chem.rdMolDescriptors' has no at...,,0,0,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,"{'id': 'CHEBI:19281', 'name': '2,2'-bithiophen...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,cannot import name 'rdDecomposition' from 'rdk...,,0,0,0,0,0.0,0.0,0.0
360,"{'id': 'CHEBI:195550', 'name': 'tetradecanol',...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,module 'rdkit.Chem.rdMolDescriptors' has no at...,,0,0,0,0,0.0,0.0,0.0
363,"{'id': 'CHEBI:195608', 'name': 'undecanol', 'd...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,module 'rdkit.Chem.rdMolDescriptors' has no at...,,0,0,0,0,0.0,0.0,0.0
368,"{'id': 'CHEBI:195629', 'name': 'pentadecanol',...","{'llm_model_name': 'lbl/claude-sonnet', 'accur...",from rdkit import Chem\nfrom rdkit.Chem import...,,,,,0,False,False,'_vecti' object has no attribute 'index',,0,0,0,0,0.0,0.0,0.0


In [58]:
slim_df = results_df.copy()
slim_df["code"] = ""

In [59]:
slim_df.to_csv(results_dir / "results.csv")

In [60]:
from chebi_llm_classifier.generator import safe_name

for r in results:
    cn = safe_name(r.chemical_class.name)
    prog_dir = results_dir / "programs"
    prog_dir.mkdir(exist_ok=True, parents=True)
    prog_path = f"{prog_dir / cn}.py"
    #print(prog_path)
    with open(prog_path, "w") as f:
        f.write(r.code)
        f.write(f"\n# Pr={r.precision}")
        f.write(f"\n# Recall={r.recall}")
    