In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import bootstrap
from tqdm import tqdm

tqdm.pandas()

# Set random seeds for reproducibility
seed = 0
random.seed(seed)
np.random.seed(seed)
pd.set_option('compute.use_numexpr', False)  # Disable numexpr to ensure reproducibility
pd.set_option('compute.use_bottleneck', False)  # Disable bottleneck to ensure reproducibility

This notebooks takes pickled test dataframes automatically stored during testing of the models (i.e., running `trainer.test(model, ...)`) and calculates means and confidence intervals for all metrics. The cell below shows an example of a test dataframe.


In [2]:
# Example of a test dataframe for the retrieval challenge:
pd.read_pickle('../data/test_results/retrieval/rebuttal_MIST_test_formula_2024-08-13_15-07-19.pkl')

Unnamed: 0,identifier,sorted_candidate_smiles,test_hit_rate@1,test_hit_rate@5,test_hit_rate@20,test_mces@1
0,MassSpecGymID0000201,[O=C(NC1CC2(CC(O)C2)C1)OCc1ccccc1.O=C(NC1CC2(C...,0.0,0.0,1.0,18.0
1,MassSpecGymID0000202,[O=C(NC1CC2(CC(O)C2)C1)OCc1ccccc1.O=C(NC1CC2(C...,0.0,0.0,0.0,18.0
2,MassSpecGymID0000203,[O=C(NC1CC2(CC(O)C2)C1)OCc1ccccc1.O=C(NC1CC2(C...,0.0,0.0,1.0,18.0
3,MassSpecGymID0000204,[O=C(NC1CC2(CC(O)C2)C1)OCc1ccccc1.O=C(NC1CC2(C...,0.0,0.0,1.0,18.0
4,MassSpecGymID0000205,[O=C(NC1CC2(CC(O)C2)C1)OCc1ccccc1.O=C(NC1CC2(C...,0.0,0.0,1.0,18.0
...,...,...,...,...,...,...
17551,MassSpecGymID0414164,[CCCN1C(=O)C(=O)/C(=C(/O)c2cc(Cl)c(OC)cc2OC)C1...,0.0,0.0,0.0,15.5
17552,MassSpecGymID0414165,[COc1c2c(c(CNC(=O)COc3ccc4c(c3)OCO4)c3c1C(=O)N...,0.0,0.0,0.0,18.5
17553,MassSpecGymID0414166,[CCCC(=O)O[C@H]1CC[C@@]2(C)C(=CC[C@@H]3C2CC[C@...,0.0,0.0,0.0,15.0
17554,MassSpecGymID0414167,[CCCC(=O)O[C@H]1CC[C@@]2(C)C(=CC[C@@H]3C2CC[C@...,0.0,0.0,0.0,15.0


In [3]:
def evaluate(dir_results, task, seed=0):
    np.random.seed(seed)

    # Prepare
    if task == 'retrieval':
        metric_cols = ['test_hit_rate@1', 'test_hit_rate@5', 'test_hit_rate@20', 'test_mces@1']
    elif task == 'de_novo':
        metric_cols = [
            'test_top_1_accuracy', 'test_top_1_mces_dist', 'test_top_1_max_tanimoto_sim',
            'test_top_10_accuracy', 'test_top_10_mces_dist', 'test_top_10_max_tanimoto_sim'
        ]

    # Load all data into a single data frame
    df = []
    for path in dir_results.glob('*.pkl'):
        df_method = pd.read_pickle(path)
        df_method['method'] = path.stem
        df_method = df_method.rename(columns={'test_mces_at_1': 'test_mces@1'})  # compatibility
        df.append(df_method)
    df = pd.concat(df)

    # Preprocess data frame
    for col in [c for c in df.columns if ('hit_rate' in c or 'accuracy' in c)]:
        df[col] *= 100

    # Calculate means for all metrics into a single table
    df_mean = df.groupby('method')[metric_cols].mean().round(2)

    # Calculate confidence intervals for all metrics into a single table
    def get_ci(col_vals, confidence_level=0.999, n_resamples=20_000):
        res = bootstrap((col_vals,), np.mean, confidence_level=confidence_level, n_resamples=n_resamples, random_state=seed)
        ci = res.confidence_interval
        return f'{ci.low:.2f}-{ci.high:.2f}'
    def get_ci_for_each_col(df_method):
        return df_method.apply(get_ci, axis=0)
    tqdm.pandas(desc="Bootstrapping predictions for each method", postfix=None)
    df_ci = df.groupby('method')[metric_cols].progress_apply(lambda df_method: get_ci_for_each_col(df_method))

    # Merge tables with means and confidence intervals
    for col in metric_cols:
        df_mean[col] = df_mean[col].astype(str) + ' (' + df_ci[col] + ')'
    return df_mean

## Evaluation for the retrieval challenge

In [4]:
dir_results = Path('../data/test_results/retrieval')
task = 'retrieval'

df = evaluate(dir_results, task)

Bootstrapping predictions for each method: 100%|██████████| 13/13 [07:49<00:00, 36.10s/it]


### Main challenge

In [5]:
df_paper = df.reset_index()
df_paper = df_paper[(~df_paper['method'].str.contains('formula')) | (df_paper['method'].str.contains('no_formula'))]
df_paper = df_paper.sort_values('test_hit_rate@1', ascending=True, key=lambda x: x.str.split(' ').str[0].astype(float))
print(df_paper.to_markdown(index=False))

| method                                                                           | test_hit_rate@1     | test_hit_rate@5     | test_hit_rate@20    | test_mces@1         |
|:---------------------------------------------------------------------------------|:--------------------|:--------------------|:--------------------|:--------------------|
| rebuttal_random_test_mass_2024-08-13_17-08-09                                    | 0.37 (0.24-0.54)    | 2.01 (1.68-2.39)    | 8.22 (7.53-8.89)    | 30.81 (30.40-31.21) |
| rebuttal_deepsets_test_mass_2024-08-14_22-51-05                                  | 1.47 (1.18-1.77)    | 6.21 (5.64-6.82)    | 19.23 (18.24-20.26) | 25.11 (24.84-25.39) |
| rebuttal_fingerprint_ffn_sigmoid_mist_canopus_1550_test_mass_2024-08-17_02-30-13 | 1.65 (1.36-1.98)    | 5.45 (4.89-6.02)    | 15.15 (14.29-16.05) | 26.76 (26.47-27.06) |
| rebuttal_fingerprint_ffn_test_mass_2024-08-15_15-39-32                           | 2.54 (2.17-2.99)    | 7.59 (6.96-8.28)    | 20.0 (

### Bonus chemical formulae challenge

In [6]:
df_paper = df.reset_index()
df_paper = df_paper[(df_paper['method'].str.contains('formula')) & (~df_paper['method'].str.contains('no_formula'))]
df_paper = df_paper.sort_values('test_hit_rate@1', ascending=True, key=lambda x: x.str.split(' ').str[0].astype(float))
print(df_paper.to_markdown(index=False))

| method                                                                              | test_hit_rate@1   | test_hit_rate@5     | test_hit_rate@20    | test_mces@1         |
|:------------------------------------------------------------------------------------|:------------------|:--------------------|:--------------------|:--------------------|
| rebuttal_random_test_formula_2024-08-13_16-14-07                                    | 3.06 (2.64-3.52)  | 11.35 (10.60-12.12) | 27.74 (26.52-28.84) | 13.87 (13.70-14.03) |
| rebuttal_random_test_formula_2024-08-13_17-08-09                                    | 3.06 (2.64-3.52)  | 11.35 (10.60-12.12) | 27.74 (26.52-28.84) | 13.87 (13.70-14.03) |
| rebuttal_fingerprint_ffn_sigmoid_mist_canopus_1550_test_formula_2024-08-17_02-30-13 | 4.07 (3.61-4.54)  | 13.13 (12.33-13.95) | 29.44 (28.32-30.53) | 15.5 (15.34-15.64)  |
| rebuttal_deepsets_test_formula_2024-08-15_16-45-06                                  | 4.42 (3.92-4.97)  | 14.46 (13.58-15.36) | 

## Evaluation for the de novo challenge

In [8]:
dir_results = Path('../data/test_results/de_novo')
task = 'de_novo'

df = evaluate(dir_results, task)

  a_hat = 1/6 * sum(nums) / sum(dens)**(3/2)
Bootstrapping predictions for each method: 100%|██████████| 11/11 [10:12<00:00, 55.66s/it]


### Main challenge

In [16]:
df_paper = df.reset_index()
df_paper = df_paper[(~df_paper['method'].str.contains('formula')) | (df_paper['method'].str.contains('no_formula'))]
df_paper = df_paper.sort_values('test_top_10_mces_dist', ascending=False, key=lambda x: x.str.split(' ').str[0].astype(float))
print(df_paper.to_markdown(index=False))

| method                                                                  | test_top_1_accuracy   | test_top_1_mces_dist   | test_top_1_max_tanimoto_sim   | test_top_10_accuracy   | test_top_10_mces_dist   | test_top_10_max_tanimoto_sim   |
|:------------------------------------------------------------------------|:----------------------|:-----------------------|:------------------------------|:-----------------------|:------------------------|:-------------------------------|
| rebuttal_smiles_transformer_mist_canopus_1550_test_2024-08-17_02-30-13  | 0.0 (nan-nan)         | 96.17 (95.78-96.53)    | 0.01 (0.00-0.01)              | 0.0 (nan-nan)          | 70.88 (70.09-71.68)     | 0.04 (0.04-0.04)               |
| rebuttal_smiles_transformer_mist_canopus_test_2024-08-16_22-34-55       | 0.0 (nan-nan)         | 96.06 (95.67-96.43)    | 0.01 (0.00-0.01)              | 0.0 (nan-nan)          | 70.77 (69.96-71.53)     | 0.04 (0.04-0.04)               |
| rebuttal_selfies_transformer_mist_

### Bonus chemical formulae challenge

In [18]:
df_paper = df.reset_index()
df_paper = df_paper[(df_paper['method'].str.contains('formula')) & (~df_paper['method'].str.contains('no_formula'))]
df_paper = df_paper.sort_values('test_top_10_mces_dist', ascending=False, key=lambda x: x.str.split(' ').str[0].astype(float))
print(df_paper.to_markdown(index=False))

| method                                                           | test_top_1_accuracy   | test_top_1_mces_dist   | test_top_1_max_tanimoto_sim   | test_top_10_accuracy   | test_top_10_mces_dist   | test_top_10_max_tanimoto_sim   |
|:-----------------------------------------------------------------|:----------------------|:-----------------------|:------------------------------|:-----------------------|:------------------------|:-------------------------------|
| rebuttal_smiles_transformer_formula_test_2024-08-17_02-30-13     | 0.0 (nan-nan)         | 79.39 (78.64-80.08)    | 0.03 (0.03-0.04)              | 0.0 (nan-nan)          | 52.13 (51.45-52.81)     | 0.1 (0.09-0.10)                |
| rebuttal_selfies_transformer_formula_test_2024-08-17_02-30-13    | 0.0 (nan-nan)         | 38.88 (38.57-39.20)    | 0.08 (0.08-0.08)              | 0.0 (nan-nan)          | 26.87 (26.66-27.11)     | 0.13 (0.13-0.13)               |
| rebuttal_selfies_transformer_formula_v2_test_2024-08-18_14-28-