In [4]:
from pathlib import Path
import pandas as pd
import numpy as np
from ms_pred.common.plot_utils import *
# Apply the plotting style. `set_style` is not imported by name in this cell, so it
# presumably comes from the star import of ms_pred.common.plot_utils — TODO confirm.
set_style()

In [7]:
# Dataset whose results are tabulated in this notebook.
# Switch to "nist20" to build the table for that dataset instead.
dataset_name = "canopus_train_public"

# Dataset folder and the labels file stored inside it.
data_folder = Path(f"../data/spec_datasets/{dataset_name}/")
labels = data_folder.joinpath("labels.tsv")

In [8]:
# Create results table for fig inten

In [9]:
# Display name of each method -> path of its `inten_thresh_sweep` summary file.
# Keeping both in one mapping means a name can never be paired with the wrong file.
method_to_summary = {
    "SCARF": f"../results/scarf_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
    "SCARF-F": f"../results/scarf_{dataset_name}_ablate/forward/inten_thresh_sweep/summary.tsv",
    "SCARF-R": f"../results/scarf_{dataset_name}_ablate/reverse/inten_thresh_sweep/summary.tsv",
    "CFM-ID": f"../results/cfm_id_{dataset_name}/inten_thresh_sweep/summary.tsv",
    "Random": f"../results/rand_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
    "Frequency": f"../results/freq_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
}

# Parallel lists, in the mapping's insertion order.
results_files = list(method_to_summary.values())
names = list(method_to_summary.keys())

# Read every summary as a tab-separated table, keyed by method name.
cov_dfs = {
    method: pd.read_csv(summary_path, sep="\t")
    for method, summary_path in method_to_summary.items()
}


In [10]:
# Node cutoffs ("Coverage @ N") reported in the table.
# Alternative, denser set of cutoffs: [10, 20, 30, 40, 50, 100, 200, 300, 500, 1000]
max_preds = [10, 20, 30, 50, 100, 300, 1000]

# One record per (method, summary row) whose node count is one of the reported cutoffs;
# rows at any other node count are dropped.
combined_df = [
    {
        "Coverage": row["avg_coverage"],
        "Method": name,
        "Coverage (disc.)": row["avg_digitized_coverage"],
        "Num pred.": row["avg_num_pred"],
        "Nodes": row["nm_nodes"],
    }
    for name, sub_df in cov_dfs.items()
    for _, row in sub_df.iterrows()
    if row["nm_nodes"] in max_preds
]

# Long-format table: one row per method and cutoff.
new_df = pd.DataFrame(combined_df)



In [11]:
# Round the numeric columns to three decimals, then spread "Coverage" into a
# Method x Nodes grid. pivot_table aggregates with its default (mean), so any
# repeated Method/Nodes pair would be averaged.
new_df_round = new_df.round(decimals=3)
round_df_pivot = pd.pivot_table(
    new_df_round,
    values=["Coverage"],
    index="Method",
    columns=["Nodes"],
)

In [12]:
# Flatten the ("Coverage", <nodes>) column tuples produced by the pivot into plain
# integer-looking strings such as "10".
# Only tuple labels are converted: once the columns are already flat strings,
# indexing with [1] would pick the second *character* of the label ("10" -> "0"),
# so re-running this cell would silently corrupt the header. Leaving non-tuple
# labels untouched makes the cell safe to re-run.
round_df_pivot.columns = [
    str(int(col[1])) if isinstance(col, tuple) else str(col)
    for col in round_df_pivot.columns
]

# Drop the "Method" index title and use the column axis name as the header of the
# first column of the rendered table.
round_df_pivot.index.name = None
round_df_pivot.columns.name = "Coverage @"

In [13]:
# Show the pivoted coverage table (one row per method, one column per node cutoff).
# In the recorded output the rows appear in alphabetical method order.
round_df_pivot

Coverage @,10,20,30,50,100,300,1000
CFM-ID,0.169,0.229,0.267,0.298,0.303,,
Frequency,0.092,0.124,0.152,0.202,0.294,0.476,0.695
Random,0.003,0.008,0.015,0.021,0.046,0.13,0.337
SCARF,0.181,0.27,0.333,0.42,0.549,0.745,0.891
SCARF-F,0.163,0.251,0.319,0.409,0.538,0.721,0.868
SCARF-R,0.165,0.234,0.289,0.371,0.498,0.691,0.866


In [14]:
# Rank of each method in the final table; rows are arranged by ascending rank.
sort_order = {
    "Random": 1,
    "Frequency": 2,
    "CFM-ID": 3,
    "SCARF-R": 3.1,
    "SCARF-F": 3.5,
    "SCARF": 4,
}

# Order the method labels by rank (a method missing from sort_order raises KeyError),
# then pull the rows out in that order.
ordered_methods = sorted(round_df_pivot.index, key=sort_order.__getitem__)
round_df_pivot = round_df_pivot.loc[ordered_methods]

In [15]:
# LaTeX name of the dataset actually being tabulated. The caption used to hard-code
# \nistData, so tables built from any other dataset were captioned as NIST.
# "nist20" keeps the \nistData macro from the original caption; any other dataset
# is typeset by its raw name with underscores escaped. Add an entry here if the
# manuscript defines a macro for it.
dataset_tex_names = {"nist20": r"\nistData"}
dataset_tex = dataset_tex_names.get(
    dataset_name,
    r"\texttt{" + dataset_name.replace("_", r"\_") + "}",
)

# Render the coverage table as a LaTeX table; missing cells are shown as "--".
tex_table = round_df_pivot.to_latex(
    na_rep="--",
    label="tab:coverage",
    caption=(
        r"Model coverage of true peak formulae as determined by \MAGMA "
        f"at various max formula cutoffs for the {dataset_tex} dataset."
    ),
)

# print (not rich display) so the raw LaTeX source can be copied verbatim.
print(tex_table)

\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tab:coverage}
\begin{tabular}{lrrrrrrr}
\toprule
Coverage @ &     10 &     20 &     30 &     50 &    100 &    300 &   1000 \\
\midrule
Random    &  0.003 &  0.008 &  0.015 &  0.021 &  0.046 &  0.130 &  0.337 \\
Frequency &  0.092 &  0.124 &  0.152 &  0.202 &  0.294 &  0.476 &  0.695 \\
CFM-ID    &  0.169 &  0.229 &  0.267 &  0.298 &  0.303 &     -- &     -- \\
SCARF-R   &  0.165 &  0.234 &  0.289 &  0.371 &  0.498 &  0.691 &  0.866 \\
SCARF-F   &  0.163 &  0.251 &  0.319 &  0.409 &  0.538 &  0.721 &  0.868 \\
SCARF     &  0.181 &  0.270 &  0.333 &  0.420 &  0.549 &  0.745 &  0.891 \\
\bottomrule
\end{tabular}
\end{table}



  tex_table = round_df_pivot.to_latex(


NIST Output:

```
\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tbl:coverage}
\begin{tabular}{lrrrrrrr}
\toprule
Coverage @ &     10 &     20 &     30 &     50 &    100 &    300 &   1000 \\
\midrule
Random    &  0.008 &  0.017 &  0.024 &  0.042 &  0.085 &  0.232 &  0.533 \\
Frequency &  0.164 &  0.224 &  0.268 &  0.336 &  0.462 &  0.659 &  0.831 \\
CFM-ID    &  0.198 &  0.254 &  0.281 &  0.302 &  0.305 &     -- &     -- \\
SCARF-R   &  0.252 &  0.356 &  0.431 &  0.536 &  0.675 &  0.843 &  0.942 \\
SCARF-F   &  0.263 &  0.404 &  0.491 &  0.598 &  0.719 &  0.859 &  0.943 \\
SCARF     &  0.316 &  0.465 &  0.559 &  0.674 &  0.796 &  0.911 &  0.970 \\
\bottomrule
\end{tabular}
\end{table}
```


Canopus Output:
```
\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \texttt{canopus\_train\_public} dataset.}
\label{tab:coverage}
\begin{tabular}{lrrrrrrr}
\toprule
Coverage @ &     10 &     20 &     30 &     50 &    100 &    300 &   1000 \\
\midrule
Random    &  0.003 &  0.008 &  0.015 &  0.021 &  0.046 &  0.130 &  0.337 \\
Frequency &  0.092 &  0.124 &  0.152 &  0.202 &  0.294 &  0.476 &  0.695 \\
CFM-ID    &  0.169 &  0.229 &  0.267 &  0.298 &  0.303 &     -- &     -- \\
SCARF-R   &  0.165 &  0.234 &  0.289 &  0.371 &  0.498 &  0.691 &  0.866 \\
SCARF-F   &  0.163 &  0.251 &  0.319 &  0.409 &  0.538 &  0.721 &  0.868 \\
SCARF     &  0.181 &  0.270 &  0.333 &  0.420 &  0.549 &  0.745 &  0.891 \\
\bottomrule
\end{tabular}
\end{table}
```