In [4]:
from pathlib import Path
import pandas as pd
import numpy as np
from ms_pred.common.plot_utils import *

set_style()

In [5]:
dataset_names = ["nist20", "canopus_train_public"]
outfolder = "../results/figs_scarf/coverage/"
outfolder = Path(outfolder)
outfolder.mkdir(parents=True, exist_ok=True)

In [25]:
names = [
    "SCARF",
    "SCARF-F",
    "SCARF-R",
    "Autoregressive",
    "CFM-ID",
    "Random",
    "Frequency",
]
sort_order = {
    "CFM-ID": 3,
    "Frequency": 2,
    "Random": 1,
    "Autoregressive": 3.05,
    "SCARF": 4,
    "SCARF-R": 3.1,
    "SCARF-F": 3.5,
}

dataset_to_res = {}
for dataset_name in dataset_names:

    results_files = [
        f"../results/scarf_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
        f"../results/scarf_{dataset_name}_ablate/forward/inten_thresh_sweep/summary.tsv",
        f"../results/scarf_{dataset_name}_ablate/reverse/inten_thresh_sweep/summary.tsv",
        f"../results/autoregr_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
        f"../results/cfm_id_{dataset_name}/inten_thresh_sweep/summary.tsv",
        f"../results/rand_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
        f"../results/freq_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
    ]
    cov_dfs = {i: pd.read_csv(j, sep="\t") for i, j in zip(names, results_files)}
    dataset_to_res[dataset_name] = cov_dfs

In [26]:
dataset_to_res["canopus_train_public"];

In [27]:
combined_df = []
# max_preds = [10, 20, 30, 40, 50, 100, 200, 300, 500, 1000]
max_preds = [10, 20, 30, 50, 100, 300, 1000]
for dataset_name in dataset_names:
    cov_dfs = dataset_to_res[dataset_name]
    for name, sub_df in cov_dfs.items():
        for _, row in sub_df.iterrows():
            num_nodes = row["nm_nodes"]
            if num_nodes not in max_preds:
                continue
            coverage = row["avg_coverage"]
            digitized_coverage = row["avg_digitized_coverage"]
            sem_coverage = row["sem_coverage"]
            avg_num_pred = row["avg_num_pred"]
            new_entry = {
                "Coverage": coverage,
                "SEM Coverage": sem_coverage,
                "Method": name,
                "Coverage (disc.)": digitized_coverage,
                "Num pred.": avg_num_pred,
                "Nodes": num_nodes,
                "Dataset": dataset_name,
            }
            combined_df.append(new_entry)

new_df = pd.DataFrame(combined_df)

In [28]:
# Round coverage
new_df["Coverage"] = new_df["Coverage"].round(3)
new_df["SEM Coverage"] = new_df["SEM Coverage"].round(3)

# Create a single column that just has $Coverage \pm SEM Coverage$ using list comprehension
new_df["Coverage"] = [
    rf"${i} \pm {j}$" for i, j in zip(new_df["Coverage"], new_df["SEM Coverage"])
]

In [29]:
for dataset_name, temp_df in new_df.groupby("Dataset"):
    new_df_round = temp_df  # .round(3)

    # Filter df round to only have rows where Coverage is in [10, 30, 300, 10000]
    new_df_round = new_df_round[new_df_round["Nodes"].isin([10, 30, 300, 1000])]

    round_df_pivot = new_df_round.pivot_table(
        index="Method", columns=["Nodes"], values=["Coverage"], aggfunc=lambda x: x
    )
    display(round_df_pivot)
    round_df_pivot.columns = [f"{int(i[1])}" for i in round_df_pivot.columns]
    round_df_pivot.index.name = None
    round_df_pivot.columns.name = "Coverage @"
    round_df_pivot = round_df_pivot.sort_index(key=lambda x: [sort_order[i] for i in x])
    display(round_df_pivot)
    data_str = {"canopus_train_public": r"\gnpsData", "nist20": r"\nistData"}[
        dataset_name
    ]

    tex_table = round_df_pivot.to_latex(
        na_rep="--",
        label=f"tab:coverage_{dataset_name}",
        caption=rf"Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the {data_str} dataset.",
        escape=False,
    )
    print(tex_table)
    with open(outfolder / f"tab_coverage_{dataset_name}.tex", "w") as f:
        f.write(tex_table)

Unnamed: 0_level_0,Coverage,Coverage,Coverage,Coverage
Nodes,10.0,30.0,300.0,1000.0
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Autoregressive,$0.072 \pm 0.003$,$0.084 \pm 0.004$,$0.095 \pm 0.004$,$0.099 \pm 0.004$
CFM-ID,$0.17 \pm 0.005$,$0.267 \pm 0.006$,,
Frequency,$0.09 \pm 0.003$,$0.151 \pm 0.005$,$0.466 \pm 0.009$,$0.688 \pm 0.009$
Random,$0.004 \pm 0.001$,$0.014 \pm 0.002$,$0.126 \pm 0.006$,$0.336 \pm 0.011$
SCARF,$0.17 \pm 0.005$,$0.317 \pm 0.007$,$0.73 \pm 0.009$,$0.879 \pm 0.006$
SCARF-F,$0.155 \pm 0.004$,$0.316 \pm 0.007$,$0.719 \pm 0.009$,$0.865 \pm 0.007$
SCARF-R,$0.154 \pm 0.005$,$0.281 \pm 0.007$,$0.67 \pm 0.01$,$0.85 \pm 0.007$


Coverage @,10,30,300,1000
Random,$0.004 \pm 0.001$,$0.014 \pm 0.002$,$0.126 \pm 0.006$,$0.336 \pm 0.011$
Frequency,$0.09 \pm 0.003$,$0.151 \pm 0.005$,$0.466 \pm 0.009$,$0.688 \pm 0.009$
CFM-ID,$0.17 \pm 0.005$,$0.267 \pm 0.006$,,
Autoregressive,$0.072 \pm 0.003$,$0.084 \pm 0.004$,$0.095 \pm 0.004$,$0.099 \pm 0.004$
SCARF-R,$0.154 \pm 0.005$,$0.281 \pm 0.007$,$0.67 \pm 0.01$,$0.85 \pm 0.007$
SCARF-F,$0.155 \pm 0.004$,$0.316 \pm 0.007$,$0.719 \pm 0.009$,$0.865 \pm 0.007$
SCARF,$0.17 \pm 0.005$,$0.317 \pm 0.007$,$0.73 \pm 0.009$,$0.879 \pm 0.006$


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \gnpsData dataset.}
\label{tab:coverage_canopus_train_public}
\begin{tabular}{lllll}
\toprule
Coverage @ &                 10 &                 30 &                300 &               1000 \\
\midrule
Random         &  $0.004 \pm 0.001$ &  $0.014 \pm 0.002$ &  $0.126 \pm 0.006$ &  $0.336 \pm 0.011$ \\
Frequency      &   $0.09 \pm 0.003$ &  $0.151 \pm 0.005$ &  $0.466 \pm 0.009$ &  $0.688 \pm 0.009$ \\
CFM-ID         &   $0.17 \pm 0.005$ &  $0.267 \pm 0.006$ &                 -- &                 -- \\
Autoregressive &  $0.072 \pm 0.003$ &  $0.084 \pm 0.004$ &  $0.095 \pm 0.004$ &  $0.099 \pm 0.004$ \\
SCARF-R        &  $0.154 \pm 0.005$ &  $0.281 \pm 0.007$ &    $0.67 \pm 0.01$ &   $0.85 \pm 0.007$ \\
SCARF-F        &  $0.155 \pm 0.004$ &  $0.316 \pm 0.007$ &  $0.719 \pm 0.009$ &  $0.865 \pm 0.007$ \\
SCARF          &   $0.17 \pm 0.005$ &  $0.317 \pm 0.0

  tex_table = round_df_pivot.to_latex(


Unnamed: 0_level_0,Coverage,Coverage,Coverage,Coverage
Nodes,10.0,30.0,300.0,1000.0
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Autoregressive,$0.208 \pm 0.003$,$0.266 \pm 0.003$,$0.308 \pm 0.003$,$0.315 \pm 0.003$
CFM-ID,$0.197 \pm 0.002$,$0.282 \pm 0.002$,,
Frequency,$0.173 \pm 0.002$,$0.275 \pm 0.003$,$0.659 \pm 0.004$,$0.83 \pm 0.004$
Random,$0.009 \pm 0.001$,$0.026 \pm 0.001$,$0.232 \pm 0.004$,$0.532 \pm 0.006$
SCARF,$0.3 \pm 0.003$,$0.547 \pm 0.004$,$0.906 \pm 0.003$,$0.969 \pm 0.002$
SCARF-F,$0.248 \pm 0.003$,$0.473 \pm 0.004$,$0.855 \pm 0.003$,$0.943 \pm 0.002$
SCARF-R,$0.249 \pm 0.003$,$0.423 \pm 0.004$,$0.84 \pm 0.004$,$0.941 \pm 0.003$


Coverage @,10,30,300,1000
Random,$0.009 \pm 0.001$,$0.026 \pm 0.001$,$0.232 \pm 0.004$,$0.532 \pm 0.006$
Frequency,$0.173 \pm 0.002$,$0.275 \pm 0.003$,$0.659 \pm 0.004$,$0.83 \pm 0.004$
CFM-ID,$0.197 \pm 0.002$,$0.282 \pm 0.002$,,
Autoregressive,$0.208 \pm 0.003$,$0.266 \pm 0.003$,$0.308 \pm 0.003$,$0.315 \pm 0.003$
SCARF-R,$0.249 \pm 0.003$,$0.423 \pm 0.004$,$0.84 \pm 0.004$,$0.941 \pm 0.003$
SCARF-F,$0.248 \pm 0.003$,$0.473 \pm 0.004$,$0.855 \pm 0.003$,$0.943 \pm 0.002$
SCARF,$0.3 \pm 0.003$,$0.547 \pm 0.004$,$0.906 \pm 0.003$,$0.969 \pm 0.002$


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tab:coverage_nist20}
\begin{tabular}{lllll}
\toprule
Coverage @ &                 10 &                 30 &                300 &               1000 \\
\midrule
Random         &  $0.009 \pm 0.001$ &  $0.026 \pm 0.001$ &  $0.232 \pm 0.004$ &  $0.532 \pm 0.006$ \\
Frequency      &  $0.173 \pm 0.002$ &  $0.275 \pm 0.003$ &  $0.659 \pm 0.004$ &   $0.83 \pm 0.004$ \\
CFM-ID         &  $0.197 \pm 0.002$ &  $0.282 \pm 0.002$ &                 -- &                 -- \\
Autoregressive &  $0.208 \pm 0.003$ &  $0.266 \pm 0.003$ &  $0.308 \pm 0.003$ &  $0.315 \pm 0.003$ \\
SCARF-R        &  $0.249 \pm 0.003$ &  $0.423 \pm 0.004$ &   $0.84 \pm 0.004$ &  $0.941 \pm 0.003$ \\
SCARF-F        &  $0.248 \pm 0.003$ &  $0.473 \pm 0.004$ &  $0.855 \pm 0.003$ &  $0.943 \pm 0.002$ \\
SCARF          &    $0.3 \pm 0.003$ &  $0.547 \pm 0.004$ &  $0.906 

  tex_table = round_df_pivot.to_latex(
