In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from ms_pred.common.plot_utils import *

set_style()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_names = ["nist20", "canopus_train_public"]
outfolder = "../results/figs_scarf/coverage/"
outfolder = Path(outfolder)
outfolder.mkdir(parents=True, exist_ok=True)

In [3]:
names = [
    "SCARF",
    "SCARF-F",
    "SCARF-R",
    "Autoregressive",
    "CFM-ID",
    "Random",
    "Frequency",
]
sort_order = {
    "CFM-ID": 3,
    "Frequency": 2,
    "Random": 1,
    "Autoregressive": 3.05,
    "SCARF": 4,
    "SCARF-R": 3.1,
    "SCARF-F": 3.5,
}

dataset_to_res = {}
for dataset_name in dataset_names:

    results_files = [
        f"../results/scarf_{dataset_name}/split_1_rnd1/inten_thresh_sweep/summary.tsv",
        f"../results/scarf_{dataset_name}_ablate/forward/inten_thresh_sweep/summary.tsv",
        f"../results/scarf_{dataset_name}_ablate/reverse/inten_thresh_sweep/summary.tsv",
        f"../results/autoregr_{dataset_name}/split_1_rnd1/inten_thresh_sweep/summary.tsv",
        f"../results/cfm_id_{dataset_name}/inten_thresh_sweep/summary.tsv",
        f"../results/rand_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
        f"../results/freq_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
    ]
    cov_dfs = {i: pd.read_csv(j, sep="\t") for i, j in zip(names, results_files)}
    dataset_to_res[dataset_name] = cov_dfs

In [4]:
dataset_to_res["canopus_train_public"];

In [12]:
combined_df = []
# max_preds = [10, 20, 30, 40, 50, 100, 200, 300, 500, 1000]
max_preds = [10, 20, 30, 50, 100, 300, 1000]
for dataset_name in dataset_names:
    cov_dfs = dataset_to_res[dataset_name]
    for name, sub_df in cov_dfs.items():
        for _, row in sub_df.iterrows():
            num_nodes = row["nm_nodes"]
            if num_nodes not in max_preds:
                continue
            coverage = row["avg_coverage"]
            digitized_coverage = row["avg_digitized_coverage"]
            sem_coverage = row["sem_coverage"]
            avg_num_pred = row["avg_num_pred"]
            new_entry = {
                "Coverage": coverage,
                "SEM Coverage": sem_coverage,
                "Method": name,
                "Coverage (disc.)": digitized_coverage,
                "Num pred.": avg_num_pred,
                "Nodes": num_nodes,
                "Dataset": dataset_name,
            }
            combined_df.append(new_entry)

new_df = pd.DataFrame(combined_df)

In [13]:
# Round coverage
new_df["Coverage"] = new_df["Coverage"].round(3).fillna(0)
new_df["SEM Coverage"] = new_df["SEM Coverage"].round(3).fillna(0)

# Create a single column that just has $Coverage \pm SEM Coverage$ using list comprehension
# NO STD
new_df["Coverage 95%"] = [
   rf"${i:.3f} \pm {1.96 * j:.3f}$" for i, j in zip(new_df["Coverage"], new_df["SEM Coverage"])
]

In [14]:
for dataset_name, temp_df in new_df.groupby("Dataset"):
    new_df_round = temp_df  # .round(3)

    # Filter df round to only have rows where Coverage is in [10, 30, 300, 10000]
    new_df_round = new_df_round[new_df_round["Nodes"].isin([10, 30, 300, 1000])]

    round_df_pivot = new_df_round.pivot_table(
        index="Method", columns=["Nodes"], values=["Coverage"], aggfunc=lambda x: x
    )
    # display(round_df_pivot)
    round_df_pivot.columns = [f"{int(i[1])}" for i in round_df_pivot.columns]
    round_df_pivot.index.name = None
    round_df_pivot.columns.name = "Coverage @"
    round_df_pivot = round_df_pivot.sort_index(key=lambda x: [sort_order[i] for i in x])
    display(round_df_pivot)
    data_str = {"canopus_train_public": r"\gnpsData", "nist20": r"\nistData"}[
        dataset_name
    ]

    tex_table = round_df_pivot.to_latex(
        na_rep="--",
        label=f"tab:coverage_{dataset_name}",
        caption=rf"Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the {data_str} dataset.",
        escape=False,
    )
    print(tex_table)
    with open(outfolder / f"tab_coverage_{dataset_name}.tex", "w") as f:
        f.write(tex_table)

Coverage @,10,30,300,1000
Random,0.004,0.014,0.126,0.336
Frequency,0.09,0.151,0.466,0.688
CFM-ID,0.17,0.267,,
Autoregressive,0.07,0.079,0.093,0.098
SCARF-R,0.158,0.29,0.685,0.859
SCARF-F,0.158,0.313,0.714,0.859
SCARF,0.162,0.303,0.72,0.879


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \gnpsData dataset.}
\label{tab:coverage_canopus_train_public}
\begin{tabular}{lrrrr}
\toprule
Coverage @ &     10 &     30 &    300 &   1000 \\
\midrule
Random         &  0.004 &  0.014 &  0.126 &  0.336 \\
Frequency      &  0.090 &  0.151 &  0.466 &  0.688 \\
CFM-ID         &  0.170 &  0.267 &     -- &     -- \\
Autoregressive &  0.070 &  0.079 &  0.093 &  0.098 \\
SCARF-R        &  0.158 &  0.290 &  0.685 &  0.859 \\
SCARF-F        &  0.158 &  0.313 &  0.714 &  0.859 \\
SCARF          &  0.162 &  0.303 &  0.720 &  0.879 \\
\bottomrule
\end{tabular}
\end{table}



  tex_table = round_df_pivot.to_latex(


Coverage @,10,30,300,1000
Random,0.009,0.026,0.232,0.532
Frequency,0.173,0.275,0.659,0.83
CFM-ID,0.197,0.282,,
Autoregressive,0.204,0.261,0.308,0.317
SCARF-R,0.248,0.419,0.835,0.938
SCARF-F,0.246,0.472,0.858,0.945
SCARF,0.303,0.554,0.91,0.969


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tab:coverage_nist20}
\begin{tabular}{lrrrr}
\toprule
Coverage @ &     10 &     30 &    300 &   1000 \\
\midrule
Random         &  0.009 &  0.026 &  0.232 &  0.532 \\
Frequency      &  0.173 &  0.275 &  0.659 &  0.830 \\
CFM-ID         &  0.197 &  0.282 &     -- &     -- \\
Autoregressive &  0.204 &  0.261 &  0.308 &  0.317 \\
SCARF-R        &  0.248 &  0.419 &  0.835 &  0.938 \\
SCARF-F        &  0.246 &  0.472 &  0.858 &  0.945 \\
SCARF          &  0.303 &  0.554 &  0.910 &  0.969 \\
\bottomrule
\end{tabular}
\end{table}



  tex_table = round_df_pivot.to_latex(


In [15]:
for dataset_name, temp_df in new_df.groupby("Dataset"):
    new_df_round = temp_df  # .round(3)

    # Filter df round to only have rows where Coverage is in [10, 30, 300, 10000]
    new_df_round = new_df_round[new_df_round["Nodes"].isin([10, 30, 300, 1000])]

    round_df_pivot = new_df_round.pivot_table(
        index="Method", columns=["Nodes"], values=["Coverage 95%"], aggfunc=lambda x: x
    )
    # display(round_df_pivot)
    round_df_pivot.columns = [f"{int(i[1])}" for i in round_df_pivot.columns]
    round_df_pivot.index.name = None
    round_df_pivot.columns.name = "Coverage @"
    round_df_pivot = round_df_pivot.sort_index(key=lambda x: [sort_order[i] for i in x])
    display(round_df_pivot)
    data_str = {"canopus_train_public": r"\gnpsData", "nist20": r"\nistData"}[
        dataset_name
    ]

    tex_table = round_df_pivot.to_latex(
        na_rep="--",
        label=f"tab:coverage_{dataset_name}",
        caption=rf"Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the {data_str} dataset.",
        escape=False,
    )
    print(tex_table)
    with open(outfolder / f"tab_coverage_{dataset_name}_95.tex", "w") as f:
        f.write(tex_table)

Coverage @,10,30,300,1000
Random,$0.004 \pm 0.002$,$0.014 \pm 0.004$,$0.126 \pm 0.012$,$0.336 \pm 0.022$
Frequency,$0.090 \pm 0.006$,$0.151 \pm 0.010$,$0.466 \pm 0.018$,$0.688 \pm 0.018$
CFM-ID,$0.170 \pm 0.010$,$0.267 \pm 0.012$,,
Autoregressive,$0.070 \pm 0.006$,$0.079 \pm 0.006$,$0.093 \pm 0.006$,$0.098 \pm 0.006$
SCARF-R,$0.158 \pm 0.010$,$0.290 \pm 0.014$,$0.685 \pm 0.018$,$0.859 \pm 0.014$
SCARF-F,$0.158 \pm 0.010$,$0.313 \pm 0.014$,$0.714 \pm 0.018$,$0.859 \pm 0.014$
SCARF,$0.162 \pm 0.010$,$0.303 \pm 0.014$,$0.720 \pm 0.018$,$0.879 \pm 0.012$


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \gnpsData dataset.}
\label{tab:coverage_canopus_train_public}
\begin{tabular}{lllll}
\toprule
Coverage @ &                 10 &                 30 &                300 &               1000 \\
\midrule
Random         &  $0.004 \pm 0.002$ &  $0.014 \pm 0.004$ &  $0.126 \pm 0.012$ &  $0.336 \pm 0.022$ \\
Frequency      &  $0.090 \pm 0.006$ &  $0.151 \pm 0.010$ &  $0.466 \pm 0.018$ &  $0.688 \pm 0.018$ \\
CFM-ID         &  $0.170 \pm 0.010$ &  $0.267 \pm 0.012$ &                 -- &                 -- \\
Autoregressive &  $0.070 \pm 0.006$ &  $0.079 \pm 0.006$ &  $0.093 \pm 0.006$ &  $0.098 \pm 0.006$ \\
SCARF-R        &  $0.158 \pm 0.010$ &  $0.290 \pm 0.014$ &  $0.685 \pm 0.018$ &  $0.859 \pm 0.014$ \\
SCARF-F        &  $0.158 \pm 0.010$ &  $0.313 \pm 0.014$ &  $0.714 \pm 0.018$ &  $0.859 \pm 0.014$ \\
SCARF          &  $0.162 \pm 0.010$ &  $0.303 \pm 0.0

  tex_table = round_df_pivot.to_latex(


Coverage @,10,30,300,1000
Random,$0.009 \pm 0.002$,$0.026 \pm 0.002$,$0.232 \pm 0.008$,$0.532 \pm 0.012$
Frequency,$0.173 \pm 0.004$,$0.275 \pm 0.006$,$0.659 \pm 0.008$,$0.830 \pm 0.008$
CFM-ID,$0.197 \pm 0.004$,$0.282 \pm 0.004$,,
Autoregressive,$0.204 \pm 0.006$,$0.261 \pm 0.006$,$0.308 \pm 0.008$,$0.317 \pm 0.008$
SCARF-R,$0.248 \pm 0.006$,$0.419 \pm 0.008$,$0.835 \pm 0.008$,$0.938 \pm 0.006$
SCARF-F,$0.246 \pm 0.006$,$0.472 \pm 0.008$,$0.858 \pm 0.006$,$0.945 \pm 0.004$
SCARF,$0.303 \pm 0.006$,$0.554 \pm 0.008$,$0.910 \pm 0.006$,$0.969 \pm 0.004$


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tab:coverage_nist20}
\begin{tabular}{lllll}
\toprule
Coverage @ &                 10 &                 30 &                300 &               1000 \\
\midrule
Random         &  $0.009 \pm 0.002$ &  $0.026 \pm 0.002$ &  $0.232 \pm 0.008$ &  $0.532 \pm 0.012$ \\
Frequency      &  $0.173 \pm 0.004$ &  $0.275 \pm 0.006$ &  $0.659 \pm 0.008$ &  $0.830 \pm 0.008$ \\
CFM-ID         &  $0.197 \pm 0.004$ &  $0.282 \pm 0.004$ &                 -- &                 -- \\
Autoregressive &  $0.204 \pm 0.006$ &  $0.261 \pm 0.006$ &  $0.308 \pm 0.008$ &  $0.317 \pm 0.008$ \\
SCARF-R        &  $0.248 \pm 0.006$ &  $0.419 \pm 0.008$ &  $0.835 \pm 0.008$ &  $0.938 \pm 0.006$ \\
SCARF-F        &  $0.246 \pm 0.006$ &  $0.472 \pm 0.008$ &  $0.858 \pm 0.006$ &  $0.945 \pm 0.004$ \\
SCARF          &  $0.303 \pm 0.006$ &  $0.554 \pm 0.008$ &  $0.910 

  tex_table = round_df_pivot.to_latex(
