In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import yaml
from ms_pred.common.plot_utils import *
# set_style is not imported by name; it is provided by the star import from
# ms_pred.common.plot_utils. Presumably it applies the project's plotting
# defaults -- TODO confirm against plot_utils.
set_style()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Datasets to summarise; each name is substituted into the result file paths.
dataset_names = [
    "nist20",
    "canopus_train_public",
]

In [37]:

# Display names of the compared methods. The order matters twice: entries are
# paired positionally with the per-method result files, and the same order is
# used for the rows of the summary tables.
names = ["CFM-ID", "3DMolMS", "FixedVocab", "NEIMS (FFN)", "NEIMS (GNN)", "SCARF"]


# Parsed evaluation summaries, keyed as {dataset_name: {method_name: pred_eval.yaml contents}}.
dataset_to_res = {}
for dataset_name in dataset_names:
    # One pred_eval.yaml per method; the order must match `names`, because the
    # two lists are paired positionally below.
    results_files = [
        f"../results/cfm_id_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/molnetms_baseline_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/graff_ms_baseline_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/ffn_baseline_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/gnn_baseline_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/scarf_inten_{dataset_name}/split_1/preds/pred_eval.yaml",
    ]
    # zip() would silently drop trailing entries if the lists drifted apart.
    if len(results_files) != len(names):
        raise ValueError(
            f"Expected {len(names)} result files (one per method), got {len(results_files)}"
        )

    yaml_files = {}
    for method_name, results_file in zip(names, results_files):
        # Context manager closes every handle instead of leaving it to the GC.
        with open(results_file, "r") as results_fh:
            yaml_files[method_name] = yaml.safe_load(results_fh)
    dataset_to_res[dataset_name] = yaml_files

In [38]:

# Timing output file per method. Only nist20 runs are listed here, so the
# resulting timings describe nist20 regardless of which dataset they are
# later attached to.
name_to_time = {
    "CFM-ID": "../results/cfm_id_nist20_timer/time_out.json", 
    "3DMolMS": "../results/molnetms_baseline_nist20/split_1/time_out.json",
    "FixedVocab": "../results/graff_ms_baseline_nist20/split_1/time_out.json",
    "NEIMS (FFN)": "../results/ffn_baseline_nist20/split_1/time_out.json",
    "NEIMS (GNN)": "../results/gnn_baseline_nist20/split_1/time_out.json",
    "SCARF": "../results/scarf_inten_nist20/split_1/time_out.json"
}

# {method_name: value stored under the "time (s)" key of its timing file}.
name_to_seconds = {}
for method_name, time_file in name_to_time.items():
    # The .json files are parsed with the YAML loader, as before; the context
    # manager makes sure each handle is closed once it has been read.
    with open(time_file, "r") as time_fh:
        name_to_seconds[method_name] = yaml.safe_load(time_fh)["time (s)"]

In [39]:
# Long-format results: one row per (dataset, method) pair holding the mean and
# sem of each metric plus the method's timing from name_to_seconds.
out_df = pd.DataFrame(
    [
        {
            "Method": method_name,
            "Cosine sim.": eval_res["avg_cos_sim"],
            "Cosine sim. sem": eval_res["sem_cos_sim"],
            "Coverage": eval_res["avg_coverage"],
            "Coverage sem": eval_res["sem_coverage"],
            "Valid": eval_res["avg_frac_valid"],
            "Valid sem": eval_res["sem_frac_valid"],
            "Time (s)": name_to_seconds[method_name],
            "Dataset": res_dataset,
        }
        for res_dataset, method_to_eval in dataset_to_res.items()
        for method_name, eval_res in method_to_eval.items()
    ]
)
    

In [40]:
# Wide table of the mean metrics: one row per method, columns keyed by
# (metric, dataset). pivot_table's default aggregation (mean) is used.
out_df_pivot = out_df.pivot_table(
    index="Method",
    columns="Dataset",
    values=["Cosine sim.", "Coverage", "Valid", "Time (s)"],
)
# Put the rows back into the presentation order given by `names`.
out_df_pivot_sorted = out_df_pivot.loc[names]

# Move the dataset to the outer column level and round for display.
out_df_pivot_sorted = out_df_pivot_sorted.swaplevel(0, 1, axis=1).round(3)

# Column presentation order; "Time (s)" is listed last so it ends up rightmost
# within each dataset.
metric_order = ["Cosine sim.", "Coverage", "Valid", "Time (s)"]
dataset_order = ["canopus_train_public", "nist20"]

# Select the columns through a MultiIndex built in that custom order.
new_index = pd.MultiIndex.from_product([dataset_order, metric_order], names=['Dataset', 'Metric'])
out_df_pivot_sorted = out_df_pivot_sorted.loc[:, new_index]

# "Time (s)" is filled from name_to_seconds, which is loaded from nist20 timing
# files only, so the copy under canopus_train_public is dropped.
out_df_pivot_sorted = out_df_pivot_sorted.drop(columns=[('canopus_train_public', "Time (s)")])

# Hide the "Method" label above the row index in the rendered table.
out_df_pivot_sorted.index.name = None
display(out_df_pivot_sorted)
latex = out_df_pivot_sorted.to_latex(caption="Spectra prediction accuracy", label="tab:spec_acc")
print(latex)

Dataset,canopus_train_public,canopus_train_public,canopus_train_public,nist20,nist20,nist20,nist20
Unnamed: 0_level_1,Cosine sim.,Coverage,Valid,Cosine sim.,Coverage,Valid,Time (s)
CFM-ID,0.368,0.232,1.0,0.371,0.273,1.0,1114.652
3DMolMS,0.394,0.507,0.921,0.508,0.731,0.946,3.447
FixedVocab,0.479,0.548,0.997,0.587,0.749,0.996,4.149
NEIMS (FFN),0.494,0.528,0.948,0.614,0.739,0.951,3.439
NEIMS (GNN),0.52,0.552,0.942,0.689,0.777,0.949,4.328
SCARF,0.534,0.553,1.0,0.713,0.797,1.0,21.458


\begin{table}
\centering
\caption{Spectra prediction accuracy}
\label{tab:spec_acc}
\begin{tabular}{lrrrrrrr}
\toprule
Dataset & \multicolumn{3}{l}{canopus\_train\_public} & \multicolumn{4}{l}{nist20} \\
{} &          Cosine sim. & Coverage &  Valid & Cosine sim. & Coverage &  Valid &  Time (s) \\
\midrule
CFM-ID      &                0.368 &    0.232 &  1.000 &       0.371 &    0.273 &  1.000 &  1114.652 \\
3DMolMS     &                0.394 &    0.507 &  0.921 &       0.508 &    0.731 &  0.946 &     3.447 \\
FixedVocab  &                0.479 &    0.548 &  0.997 &       0.587 &    0.749 &  0.996 &     4.149 \\
NEIMS (FFN) &                0.494 &    0.528 &  0.948 &       0.614 &    0.739 &  0.951 &     3.439 \\
NEIMS (GNN) &                0.520 &    0.552 &  0.942 &       0.689 &    0.777 &  0.949 &     4.328 \\
SCARF       &                0.534 &    0.553 &  1.000 &       0.713 &    0.797 &  1.000 &    21.458 \\
\bottomrule
\end{tabular}
\end{table}



  latex = out_df_pivot_sorted.to_latex(caption="Spectra prediction accuracy", label="tab:spec_acc")


NIST results:

```
\begin{table}
\centering
\caption{Spectra prediction accuracy}
\label{tab:spec_acc}
\begin{tabular}{lrrrr}
\toprule
{} &  Cosine sim. &  Coverage &  Valid &  Time (s) / 100 \\
\midrule
CFM-ID      &        0.371 &     0.273 &  1.000 &        1114.652 \\
NEIMS (FFN) &        0.614 &     0.739 &  0.951 &           3.439 \\
NEIMS (GNN) &        0.689 &     0.777 &  0.949 &           4.328 \\
SCARF       &        0.713 &     0.797 &  1.000 &          21.458 \\
\bottomrule
\end{tabular}
\end{table}
```

Canopus results:

```
\begin{table}
\centering
\caption{Spectra prediction accuracy}
\label{tab:spec_acc}
\begin{tabular}{lrrrr}
\toprule
{} &  Cosine sim. &  Coverage &  Valid &  Time (s) / 100 \\
\midrule
CFM-ID      &        0.368 &     0.232 &  1.000 &        1114.652 \\
NEIMS (FFN) &        0.494 &     0.528 &  0.948 &           3.439 \\
NEIMS (GNN) &        0.520 &     0.552 &  0.942 &           4.328 \\
SCARF       &        0.534 &     0.553 &  1.000 &          21.458 \\
\bottomrule
\end{tabular}
\end{table}

```

In [41]:
# Work on a copy so the numeric columns of out_df stay untouched.
out_df_temp = out_df.copy()

# Render each metric as a LaTeX math cell of the form "$mean \pm sem$".
out_df_temp["Cosine sim."] = [
    fr"${mean:.3f} \pm {sem:.3f}$"
    for mean, sem in zip(out_df_temp["Cosine sim."], out_df_temp["Cosine sim. sem"])
]
out_df_temp["Coverage"] = [
    fr"${mean:.3f} \pm {sem:.3f}$"
    for mean, sem in zip(out_df_temp["Coverage"], out_df_temp["Coverage sem"])
]
# NOTE(review): the mean is printed with 2 decimals but its sem with 3, unlike
# the other metrics. Kept as-is to preserve the published table -- confirm intent.
out_df_temp["Valid"] = [
    fr"${mean:.2f} \pm {sem:.3f}$"
    for mean, sem in zip(out_df_temp["Valid"], out_df_temp["Valid sem"])
]
out_df_temp["Time (s)"] = [fr"${seconds:.2f}$" for seconds in out_df_temp["Time (s)"]]

# The cells are strings now, so the default mean aggregation cannot be used;
# the join simply concatenates whatever strings fall into a cell.
out_df_pivot = out_df_temp.pivot_table(
    index="Method",
    columns="Dataset",
    values=["Cosine sim.", "Coverage", "Valid", "Time (s)"],
    aggfunc=lambda x: "&".join(x),
)

# Put the rows back into the presentation order given by `names`, then move
# the dataset to the outer column level. No rounding here: the cells are
# already formatted strings.
out_df_pivot_sorted = out_df_pivot.loc[names]
out_df_pivot_sorted = out_df_pivot_sorted.swaplevel(0, 1, axis=1)

# Column presentation order; "Time (s)" is listed last so it ends up rightmost
# within each dataset.
metric_order = ["Cosine sim.", "Coverage", "Valid", "Time (s)"]
dataset_order = ["canopus_train_public", "nist20"]

# Select the columns through a MultiIndex built in that custom order.
new_index = pd.MultiIndex.from_product([dataset_order, metric_order], names=['Dataset', 'Metric'])
out_df_pivot_sorted = out_df_pivot_sorted.loc[:, new_index]

# "Time (s)" is filled from name_to_seconds, which is loaded from nist20 timing
# files only, so the copy under canopus_train_public is dropped.
out_df_pivot_sorted = out_df_pivot_sorted.drop(columns=[('canopus_train_public', "Time (s)")])

# Hide the "Method" label above the row index in the rendered table.
out_df_pivot_sorted.index.name = None
display(out_df_pivot_sorted)
# escape=False keeps the "$...\pm...$" cells as raw LaTeX math.
latex = out_df_pivot_sorted.to_latex(caption="Spectra prediction accuracy", label="tab:spec_acc", escape=False)
print(latex)

MultiIndex([('canopus_train_public', 'Cosine sim.'),
            (              'nist20', 'Cosine sim.'),
            ('canopus_train_public',    'Coverage'),
            (              'nist20',    'Coverage'),
            ('canopus_train_public',    'Time (s)'),
            (              'nist20',    'Time (s)'),
            ('canopus_train_public',       'Valid'),
            (              'nist20',       'Valid')],
           names=['Dataset', None])


Dataset,canopus_train_public,canopus_train_public,canopus_train_public,nist20,nist20,nist20,nist20
Unnamed: 0_level_1,Cosine sim.,Coverage,Valid,Cosine sim.,Coverage,Valid,Time (s)
CFM-ID,$0.368 \pm 0.008$,$0.232 \pm 0.005$,$1.00 \pm 0.000$,$0.371 \pm 0.004$,$0.273 \pm 0.003$,$1.00 \pm 0.000$,$1114.65$
3DMolMS,$0.394 \pm 0.009$,$0.507 \pm 0.008$,$0.92 \pm 0.003$,$0.508 \pm 0.004$,$0.731 \pm 0.004$,$0.95 \pm 0.002$,$3.45$
FixedVocab,$0.479 \pm 0.008$,$0.548 \pm 0.007$,$1.00 \pm 0.000$,$0.587 \pm 0.004$,$0.749 \pm 0.004$,$1.00 \pm 0.000$,$4.15$
NEIMS (FFN),$0.494 \pm 0.008$,$0.528 \pm 0.007$,$0.95 \pm 0.002$,$0.614 \pm 0.004$,$0.739 \pm 0.004$,$0.95 \pm 0.001$,$3.44$
NEIMS (GNN),$0.520 \pm 0.008$,$0.552 \pm 0.008$,$0.94 \pm 0.003$,$0.689 \pm 0.003$,$0.777 \pm 0.003$,$0.95 \pm 0.001$,$4.33$
SCARF,$0.534 \pm 0.008$,$0.553 \pm 0.008$,$1.00 \pm 0.000$,$0.713 \pm 0.003$,$0.797 \pm 0.003$,$1.00 \pm 0.000$,$21.46$


\begin{table}
\centering
\caption{Spectra prediction accuracy}
\label{tab:spec_acc}
\begin{tabular}{llllllll}
\toprule
Dataset & \multicolumn{3}{l}{canopus_train_public} & \multicolumn{4}{l}{nist20} \\
{} &          Cosine sim. &           Coverage &             Valid &        Cosine sim. &           Coverage &             Valid &   Time (s) \\
\midrule
CFM-ID      &    $0.368 \pm 0.008$ &  $0.232 \pm 0.005$ &  $1.00 \pm 0.000$ &  $0.371 \pm 0.004$ &  $0.273 \pm 0.003$ &  $1.00 \pm 0.000$ &  $1114.65$ \\
3DMolMS     &    $0.394 \pm 0.009$ &  $0.507 \pm 0.008$ &  $0.92 \pm 0.003$ &  $0.508 \pm 0.004$ &  $0.731 \pm 0.004$ &  $0.95 \pm 0.002$ &     $3.45$ \\
FixedVocab  &    $0.479 \pm 0.008$ &  $0.548 \pm 0.007$ &  $1.00 \pm 0.000$ &  $0.587 \pm 0.004$ &  $0.749 \pm 0.004$ &  $1.00 \pm 0.000$ &     $4.15$ \\
NEIMS (FFN) &    $0.494 \pm 0.008$ &  $0.528 \pm 0.007$ &  $0.95 \pm 0.002$ &  $0.614 \pm 0.004$ &  $0.739 \pm 0.004$ &  $0.95 \pm 0.001$ &     $3.44$ \\
NEIMS (GNN) &    $0.520 \pm 

  latex = out_df_pivot_sorted.to_latex(caption="Spectra prediction accuracy", label="tab:spec_acc", escape=False)
