In [26]:
from pathlib import Path
import pandas as pd
import numpy as np
import yaml
from ms_pred.common.plot_utils import *
# NOTE(review): set_style is not defined in this notebook; presumably it comes
# from the star import of ms_pred.common.plot_utils and applies the shared
# plotting style — confirm.
set_style()

In [27]:
# Datasets whose evaluation results are summarized in this notebook.
dataset_names = [
    "nist20",
    "canopus_train_public",
]

In [28]:

# Display names of the compared methods. The order here must match the order
# of the paths in `results_files` below, because the two lists are zipped.
names = [
    "CFM-ID",
    "3DMolMS",
    "FixedVocab",
    "NEIMS (FFN)",
    "NEIMS (GNN)",
    "SCARF"
]


# dataset name -> {method display name -> parsed contents of pred_eval.yaml}
dataset_to_res = {}
for dataset_name in dataset_names:
    # One evaluation file per method, listed in the same order as `names`.
    results_files = [
        f"../results/cfm_id_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/molnetms_baseline_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/graff_ms_baseline_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/ffn_baseline_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/gnn_baseline_{dataset_name}/split_1/preds/pred_eval.yaml",
        f"../results/scarf_inten_{dataset_name}/split_1/preds/pred_eval.yaml",
    ]
    # zip() silently truncates to the shorter input; fail loudly instead if
    # the method names and result paths ever get out of sync.
    assert len(names) == len(results_files), "names and results_files must align"

    yaml_files = {}
    for method_name, results_file in zip(names, results_files):
        # Use a context manager so each file handle is closed right after
        # parsing instead of being left open.
        with open(results_file, "r") as fh:
            yaml_files[method_name] = yaml.safe_load(fh)
    dataset_to_res[dataset_name] = yaml_files

In [29]:

# Method display name -> path of its timing file. Every path points at a
# nist20 run, so there is a single timing value per method.
name_to_time = {
    "CFM-ID": "../results/cfm_id_nist20_timer/time_out.json",
    "3DMolMS": "../results/molnetms_baseline_nist20/split_1/time_out.json",
    "FixedVocab": "../results/graff_ms_baseline_nist20/split_1/time_out.json",
    "NEIMS (FFN)": "../results/ffn_baseline_nist20/split_1/time_out.json",
    "NEIMS (GNN)": "../results/gnn_baseline_nist20/split_1/time_out.json",
    "SCARF": "../results/scarf_inten_nist20/split_1/time_out.json"
}

# Method display name -> value stored under the "time (s)" key of its timing
# file. The .json files are parsed with yaml.safe_load, as before.
name_to_seconds = {}
for method_name, time_file in name_to_time.items():
    # Context manager closes the handle; the bare open() call leaked it.
    with open(time_file, "r") as fh:
        name_to_seconds[method_name] = yaml.safe_load(fh)["time (s)"]

In [30]:
# Long-format results table: one row per (dataset, method) pair. The timing
# value is looked up by method name only, so it is identical across datasets.
out_df = pd.DataFrame(
    [
        {
            "Method": method_name,
            "Cosine sim.": metrics["avg_cos_sim"],
            "Coverage": metrics["avg_coverage"],
            "Valid": metrics["avg_frac_valid"],
            "Time (s)": name_to_seconds[method_name],
            "Dataset": dataset_name,
        }
        for dataset_name, yaml_files in dataset_to_res.items()
        for method_name, metrics in yaml_files.items()
    ]
)
    

In [31]:
# Desired left-to-right order of the two column levels in the final table.
metric_order = ["Cosine sim.", "Coverage", "Valid", "Time (s)"]
dataset_order = ["canopus_train_public", "nist20"]

# Wide table: one row per method, columns keyed by (metric, dataset).
out_df_pivot = out_df.pivot_table(
    index="Method",
    columns="Dataset",
    values=["Cosine sim.", "Coverage", "Valid", "Time (s)"],
)

# Put rows in the order of `names`, move the dataset level to the top of the
# column index, and round for presentation.
out_df_pivot_sorted = out_df_pivot.loc[names]
out_df_pivot_sorted = out_df_pivot_sorted.swaplevel(0, 1, axis=1).round(3)

# Reorder columns to dataset_order x metric_order, which also places
# "Time (s)" last within each dataset.
new_index = pd.MultiIndex.from_product(
    [dataset_order, metric_order], names=["Dataset", "Metric"]
)
out_df_pivot_sorted = out_df_pivot_sorted.loc[:, new_index]

# Drop the canopus time column: the timing files are nist20 runs only, so the
# canopus column would just repeat the nist20 values.
out_df_pivot_sorted = out_df_pivot_sorted.drop(
    columns=[("canopus_train_public", "Time (s)")]
)

# Remove the "Method" index label so it does not appear in the table header.
out_df_pivot_sorted.index.name = None
display(out_df_pivot_sorted)
latex = out_df_pivot_sorted.to_latex(caption="Spectra prediction accuracy", label="tab:spec_acc")
print(latex)

Dataset,canopus_train_public,canopus_train_public,canopus_train_public,nist20,nist20,nist20,nist20
Unnamed: 0_level_1,Cosine sim.,Coverage,Valid,Cosine sim.,Coverage,Valid,Time (s)
CFM-ID,0.368,0.232,1.0,0.371,0.273,1.0,1114.652
3DMolMS,0.394,0.507,0.921,0.508,0.731,0.946,3.447
FixedVocab,0.479,0.548,0.997,0.587,0.749,0.996,4.149
NEIMS (FFN),0.494,0.528,0.948,0.614,0.739,0.951,3.439
NEIMS (GNN),0.52,0.552,0.942,0.689,0.777,0.949,4.328
SCARF,0.534,0.553,1.0,0.713,0.797,1.0,21.458


\begin{table}
\centering
\caption{Spectra prediction accuracy}
\label{tab:spec_acc}
\begin{tabular}{lrrrrrrr}
\toprule
Dataset & \multicolumn{3}{l}{canopus\_train\_public} & \multicolumn{4}{l}{nist20} \\
{} &          Cosine sim. & Coverage &  Valid & Cosine sim. & Coverage &  Valid &  Time (s) \\
\midrule
CFM-ID      &                0.368 &    0.232 &  1.000 &       0.371 &    0.273 &  1.000 &  1114.652 \\
3DMolMS     &                0.394 &    0.507 &  0.921 &       0.508 &    0.731 &  0.946 &     3.447 \\
FixedVocab  &                0.479 &    0.548 &  0.997 &       0.587 &    0.749 &  0.996 &     4.149 \\
NEIMS (FFN) &                0.494 &    0.528 &  0.948 &       0.614 &    0.739 &  0.951 &     3.439 \\
NEIMS (GNN) &                0.520 &    0.552 &  0.942 &       0.689 &    0.777 &  0.949 &     4.328 \\
SCARF       &                0.534 &    0.553 &  1.000 &       0.713 &    0.797 &  1.000 &    21.458 \\
\bottomrule
\end{tabular}
\end{table}



  latex = out_df_pivot_sorted.to_latex(caption="Spectra prediction accuracy", label="tab:spec_acc")


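NIST20 results (only CFM-ID, NEIMS (FFN), NEIMS (GNN), and SCARF; 3DMolMS and FixedVocab are not included in this table):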

```
\begin{table}
\centering
\caption{Spectra prediction accuracy}
\label{tab:spec_acc}
\begin{tabular}{lrrrr}
\toprule
{} &  Cosine sim. &  Coverage &  Valid &  Time (s) / 100 \\
\midrule
CFM-ID      &        0.371 &     0.273 &  1.000 &        1114.652 \\
NEIMS (FFN) &        0.614 &     0.739 &  0.951 &           3.439 \\
NEIMS (GNN) &        0.689 &     0.777 &  0.949 &           4.328 \\
SCARF       &        0.713 &     0.797 &  1.000 &          21.458 \\
\bottomrule
\end{tabular}
\end{table}
```

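Canopus (`canopus_train_public`) results for the same four methods; the time column repeats the values from the NIST20 table: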

```
\begin{table}
\centering
\caption{Spectra prediction accuracy}
\label{tab:spec_acc}
\begin{tabular}{lrrrr}
\toprule
{} &  Cosine sim. &  Coverage &  Valid &  Time (s) / 100 \\
\midrule
CFM-ID      &        0.368 &     0.232 &  1.000 &        1114.652 \\
NEIMS (FFN) &        0.494 &     0.528 &  0.948 &           3.439 \\
NEIMS (GNN) &        0.520 &     0.552 &  0.942 &           4.328 \\
SCARF       &        0.534 &     0.553 &  1.000 &          21.458 \\
\bottomrule
\end{tabular}
\end{table}

```