In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from ms_pred.common.plot_utils import *


from collections import defaultdict

# Import sem calc
from scipy.stats import sem


# `set_style` is not imported by name anywhere in this cell, so it can only
# come from the wildcard import of ms_pred.common.plot_utils above.
# NOTE(review): presumably configures the shared plotting style — confirm.
set_style()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Datasets that are tabulated, and the directory the LaTeX tables are written
# to. The directory (including any missing parents) is created up front.
dataset_names = ["nist20", "canopus_train_public"]
outfolder = Path("../results/figs_scarf/coverage/")
outfolder.mkdir(exist_ok=True, parents=True)

In [3]:
# Display names of the compared methods. The order matters: each name is
# paired by position with the path at the same index in `results_files` below.
names = [
    "SCARF",
    "SCARF-F",
    "SCARF-R",
    "Autoregressive",
    "CFM-ID",
    "Random",
    "Frequency",
]
# Sort key per method; the result tables list methods in ascending key order.
sort_order = {
    "CFM-ID": 3,
    "Frequency": 2,
    "Random": 1,
    "Autoregressive": 3.05,
    "SCARF": 4,
    "SCARF-R": 3.1,
    "SCARF-F": 3.5,
}

# Random seeds whose result folders (`split_1_rnd{seed}`) are loaded.
seeds = [1, 2, 3]

# dataset name -> {method name -> list of summary DataFrames, one per seed}
dataset_to_res = {}
for dataset_name in dataset_names:
    method_to_dfs = defaultdict(list)
    for seed in seeds:
        # The CFM-ID, Random and Frequency paths do not contain the seed, so
        # the same file is read once per seed and those methods end up with
        # identical per-seed entries.
        results_files = [
            f"../results/scarf_{dataset_name}/split_1_rnd{seed}/inten_thresh_sweep/summary.tsv",
            f"../results/scarf_{dataset_name}_ablate/forward/split_1_rnd{seed}/inten_thresh_sweep/summary.tsv",
            f"../results/scarf_{dataset_name}_ablate/reverse/split_1_rnd{seed}/inten_thresh_sweep/summary.tsv",
            f"../results/autoregr_{dataset_name}/split_1_rnd{seed}/inten_thresh_sweep/summary.tsv",
            f"../results/cfm_id_{dataset_name}/inten_thresh_sweep/summary.tsv",
            f"../results/rand_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
            f"../results/freq_baseline_{dataset_name}/split_1/inten_thresh_sweep/summary.tsv",
        ]
        # zip() would silently drop entries if the two lists drifted apart.
        assert len(names) == len(results_files), "names and results_files must align"

        for method_name, summary_path in zip(names, results_files):
            method_to_dfs[method_name].append(pd.read_csv(summary_path, sep="\t"))
    dataset_to_res[dataset_name] = method_to_dfs

In [4]:
# Lookup only: the trailing ';' suppresses the cell output, so this just
# raises KeyError if "canopus_train_public" was not loaded above.
dataset_to_res["canopus_train_public"];

In [5]:
# Node cutoffs ("nm_nodes" values) that are kept; all other rows are skipped.
max_preds = [10, 20, 30, 50, 100, 300, 1000]

# Flatten dataset -> method -> per-seed summary tables into one record per
# (dataset, method, seed, cutoff). "Seed" is the 0-based position of the
# summary table in the method's list, not the seed value itself.
combined_df = [
    {
        "Coverage": row["avg_coverage"],
        "Method": name,
        "Coverage (disc.)": row["avg_digitized_coverage"],
        "Num pred.": row["avg_num_pred"],
        "Nodes": row["nm_nodes"],
        "Dataset": dataset_name,
        "Seed": seed_num,
    }
    for dataset_name in dataset_names
    for name, sub_dfs in dataset_to_res[dataset_name].items()
    for seed_num, sub_df in enumerate(sub_dfs)
    for _, row in sub_df.iterrows()
    if row["nm_nodes"] in max_preds
]

new_df = pd.DataFrame(combined_df)

In [6]:
# Aggregate over seeds: mean and standard error of the mean (SEM) of both
# coverage metrics for every (Dataset, Nodes, Method) combination.
#
# The per-seed records are rebuilt from `combined_df` instead of being read
# from `new_df`, because this cell rebinds `new_df` to the aggregated table.
# Reading `new_df` here would, on a re-run, aggregate the already-aggregated
# frame (one row per group) and silently turn every SEM into NaN.
per_seed_df = pd.DataFrame(combined_df)
grouped_df = (
    per_seed_df.groupby(["Dataset", "Nodes", "Method"])
    .agg({"Coverage": ["mean", "sem"], "Coverage (disc.)": ["mean", "sem"]})
    .reset_index()
)

# Flatten the two-level (metric, statistic) columns into plain names; the
# order matches the group keys followed by the agg spec above.
grouped_df.columns = [
    "Dataset",
    "Nodes",
    "Method",
    "Coverage",
    "SEM Coverage",
    "Coverage (disc.)",
    "SEM Coverage (disc.)",
]
new_df = grouped_df

In [7]:
# Round the mean coverage and its SEM to three decimals; missing values
# become 0.
for stat_col in ("Coverage", "SEM Coverage"):
    new_df[stat_col] = new_df[stat_col].round(3).fillna(0)

# LaTeX-ready "$mean \pm sem$" string per row. Despite the column name, the
# number after \pm is the SEM computed above.
pm_template = r"${:.3f} \pm {:.3f}$"
new_df["Coverage 95%"] = [
    pm_template.format(mean_cov, sem_cov)
    for mean_cov, sem_cov in zip(new_df["Coverage"], new_df["SEM Coverage"])
]

In [8]:
# One LaTeX table per dataset: rows are methods, columns are node cutoffs,
# cells are the seed-averaged coverage. Each table is displayed, printed and
# written to `outfolder`.
table_cutoffs = [10, 30, 300, 1000]
for dataset_name, temp_df in new_df.groupby("Dataset"):
    # Keep only the rows whose "Nodes" cutoff is reported in the table.
    new_df_round = temp_df[temp_df["Nodes"].isin(table_cutoffs)]

    # `new_df` holds one row per (Dataset, Nodes, Method), so this is a pure
    # reshape; `pivot` raises on duplicate (Method, Nodes) pairs, so nothing
    # is ever aggregated silently.
    round_df_pivot = new_df_round.pivot(
        index="Method", columns="Nodes", values="Coverage"
    )
    # Column headers as integer strings ("10", not "10.0").
    round_df_pivot.columns = [f"{int(i)}" for i in round_df_pivot.columns]
    round_df_pivot.index.name = None
    round_df_pivot.columns.name = "Coverage @"
    # Order the method rows by their `sort_order` key.
    round_df_pivot = round_df_pivot.sort_index(key=lambda x: [sort_order[i] for i in x])
    display(round_df_pivot)
    # LaTeX macro that names the dataset in the caption.
    data_str = {"canopus_train_public": r"\gnpsData", "nist20": r"\nistData"}[
        dataset_name
    ]

    # Missing (method, cutoff) combinations are rendered as "--".
    tex_table = round_df_pivot.to_latex(
        na_rep="--",
        label=f"tab:coverage_{dataset_name}",
        caption=rf"Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the {data_str} dataset.",
        escape=False,
    )
    print(tex_table)
    with open(outfolder / f"tab_coverage_{dataset_name}.tex", "w") as f:
        f.write(tex_table)

Coverage @,10,30,300,1000
Random,0.004,0.014,0.126,0.336
Frequency,0.09,0.151,0.466,0.688
CFM-ID,0.17,0.267,,
Autoregressive,0.072,0.082,0.095,0.099
SCARF-R,0.158,0.284,0.681,0.856
SCARF-F,0.155,0.306,0.708,0.859
SCARF,0.164,0.309,0.724,0.879


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \gnpsData dataset.}
\label{tab:coverage_canopus_train_public}
\begin{tabular}{lrrrr}
\toprule
Coverage @ &     10 &     30 &    300 &   1000 \\
\midrule
Random         &  0.004 &  0.014 &  0.126 &  0.336 \\
Frequency      &  0.090 &  0.151 &  0.466 &  0.688 \\
CFM-ID         &  0.170 &  0.267 &     -- &     -- \\
Autoregressive &  0.072 &  0.082 &  0.095 &  0.099 \\
SCARF-R        &  0.158 &  0.284 &  0.681 &  0.856 \\
SCARF-F        &  0.155 &  0.306 &  0.708 &  0.859 \\
SCARF          &  0.164 &  0.309 &  0.724 &  0.879 \\
\bottomrule
\end{tabular}
\end{table}



  tex_table = round_df_pivot.to_latex(


Coverage @,10,30,300,1000
Random,0.009,0.026,0.232,0.532
Frequency,0.173,0.275,0.659,0.83
CFM-ID,0.197,0.282,,
Autoregressive,0.204,0.262,0.309,0.317
SCARF-R,0.248,0.425,0.839,0.941
SCARF-F,0.249,0.476,0.855,0.943
SCARF,0.308,0.552,0.907,0.968


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tab:coverage_nist20}
\begin{tabular}{lrrrr}
\toprule
Coverage @ &     10 &     30 &    300 &   1000 \\
\midrule
Random         &  0.009 &  0.026 &  0.232 &  0.532 \\
Frequency      &  0.173 &  0.275 &  0.659 &  0.830 \\
CFM-ID         &  0.197 &  0.282 &     -- &     -- \\
Autoregressive &  0.204 &  0.262 &  0.309 &  0.317 \\
SCARF-R        &  0.248 &  0.425 &  0.839 &  0.941 \\
SCARF-F        &  0.249 &  0.476 &  0.855 &  0.943 \\
SCARF          &  0.308 &  0.552 &  0.907 &  0.968 \\
\bottomrule
\end{tabular}
\end{table}



  tex_table = round_df_pivot.to_latex(


In [9]:
# One LaTeX table per dataset: rows are methods, columns are node cutoffs,
# cells are the pre-formatted "$mean \pm sem$" strings from the
# "Coverage 95%" column. Each table is displayed, printed and written to
# `outfolder` with a "_95" file suffix.
table_cutoffs = [10, 30, 300, 1000]
for dataset_name, temp_df in new_df.groupby("Dataset"):
    # Keep only the rows whose "Nodes" cutoff is reported in the table.
    new_df_round = temp_df[temp_df["Nodes"].isin(table_cutoffs)]

    # `new_df` holds one row per (Dataset, Nodes, Method), so this is a pure
    # reshape; `pivot` raises on duplicate (Method, Nodes) pairs, so nothing
    # is ever aggregated silently.
    round_df_pivot = new_df_round.pivot(
        index="Method", columns="Nodes", values="Coverage 95%"
    )
    # Column headers as integer strings ("10", not "10.0").
    round_df_pivot.columns = [f"{int(i)}" for i in round_df_pivot.columns]
    round_df_pivot.index.name = None
    round_df_pivot.columns.name = "Coverage @"
    # Order the method rows by their `sort_order` key.
    round_df_pivot = round_df_pivot.sort_index(key=lambda x: [sort_order[i] for i in x])
    display(round_df_pivot)
    # LaTeX macro that names the dataset in the caption.
    data_str = {"canopus_train_public": r"\gnpsData", "nist20": r"\nistData"}[
        dataset_name
    ]

    # Missing (method, cutoff) combinations are rendered as "--"; escape=False
    # keeps the "$...\pm...$" cells as raw LaTeX.
    # NOTE(review): the label is the same as the one used for the plain
    # coverage table, so the two .tex files would define a duplicate label if
    # both were included in one document — confirm this is intended.
    tex_table = round_df_pivot.to_latex(
        na_rep="--",
        label=f"tab:coverage_{dataset_name}",
        caption=rf"Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the {data_str} dataset.",
        escape=False,
    )
    print(tex_table)
    with open(outfolder / f"tab_coverage_{dataset_name}_95.tex", "w") as f:
        f.write(tex_table)

Coverage @,10,30,300,1000
Random,$0.004 \pm 0.000$,$0.014 \pm 0.000$,$0.126 \pm 0.000$,$0.336 \pm 0.000$
Frequency,$0.090 \pm 0.000$,$0.151 \pm 0.000$,$0.466 \pm 0.000$,$0.688 \pm 0.000$
CFM-ID,$0.170 \pm 0.000$,$0.267 \pm 0.000$,,
Autoregressive,$0.072 \pm 0.001$,$0.082 \pm 0.002$,$0.095 \pm 0.001$,$0.099 \pm 0.000$
SCARF-R,$0.158 \pm 0.001$,$0.284 \pm 0.003$,$0.681 \pm 0.002$,$0.856 \pm 0.002$
SCARF-F,$0.155 \pm 0.002$,$0.306 \pm 0.002$,$0.708 \pm 0.003$,$0.859 \pm 0.001$
SCARF,$0.164 \pm 0.009$,$0.309 \pm 0.014$,$0.724 \pm 0.013$,$0.879 \pm 0.004$


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \gnpsData dataset.}
\label{tab:coverage_canopus_train_public}
\begin{tabular}{lllll}
\toprule
Coverage @ &                 10 &                 30 &                300 &               1000 \\
\midrule
Random         &  $0.004 \pm 0.000$ &  $0.014 \pm 0.000$ &  $0.126 \pm 0.000$ &  $0.336 \pm 0.000$ \\
Frequency      &  $0.090 \pm 0.000$ &  $0.151 \pm 0.000$ &  $0.466 \pm 0.000$ &  $0.688 \pm 0.000$ \\
CFM-ID         &  $0.170 \pm 0.000$ &  $0.267 \pm 0.000$ &                 -- &                 -- \\
Autoregressive &  $0.072 \pm 0.001$ &  $0.082 \pm 0.002$ &  $0.095 \pm 0.001$ &  $0.099 \pm 0.000$ \\
SCARF-R        &  $0.158 \pm 0.001$ &  $0.284 \pm 0.003$ &  $0.681 \pm 0.002$ &  $0.856 \pm 0.002$ \\
SCARF-F        &  $0.155 \pm 0.002$ &  $0.306 \pm 0.002$ &  $0.708 \pm 0.003$ &  $0.859 \pm 0.001$ \\
SCARF          &  $0.164 \pm 0.009$ &  $0.309 \pm 0.0

  tex_table = round_df_pivot.to_latex(


Coverage @,10,30,300,1000
Random,$0.009 \pm 0.000$,$0.026 \pm 0.000$,$0.232 \pm 0.000$,$0.532 \pm 0.000$
Frequency,$0.173 \pm 0.000$,$0.275 \pm 0.000$,$0.659 \pm 0.000$,$0.830 \pm 0.000$
CFM-ID,$0.197 \pm 0.000$,$0.282 \pm 0.000$,,
Autoregressive,$0.204 \pm 0.001$,$0.262 \pm 0.002$,$0.309 \pm 0.005$,$0.317 \pm 0.006$
SCARF-R,$0.248 \pm 0.001$,$0.425 \pm 0.002$,$0.839 \pm 0.002$,$0.941 \pm 0.001$
SCARF-F,$0.249 \pm 0.001$,$0.476 \pm 0.002$,$0.855 \pm 0.000$,$0.943 \pm 0.001$
SCARF,$0.308 \pm 0.002$,$0.552 \pm 0.001$,$0.907 \pm 0.002$,$0.968 \pm 0.001$


\begin{table}
\centering
\caption{Model coverage of true peak formulae as determined by \MAGMA at various max formula cutoffs for the \nistData dataset.}
\label{tab:coverage_nist20}
\begin{tabular}{lllll}
\toprule
Coverage @ &                 10 &                 30 &                300 &               1000 \\
\midrule
Random         &  $0.009 \pm 0.000$ &  $0.026 \pm 0.000$ &  $0.232 \pm 0.000$ &  $0.532 \pm 0.000$ \\
Frequency      &  $0.173 \pm 0.000$ &  $0.275 \pm 0.000$ &  $0.659 \pm 0.000$ &  $0.830 \pm 0.000$ \\
CFM-ID         &  $0.197 \pm 0.000$ &  $0.282 \pm 0.000$ &                 -- &                 -- \\
Autoregressive &  $0.204 \pm 0.001$ &  $0.262 \pm 0.002$ &  $0.309 \pm 0.005$ &  $0.317 \pm 0.006$ \\
SCARF-R        &  $0.248 \pm 0.001$ &  $0.425 \pm 0.002$ &  $0.839 \pm 0.002$ &  $0.941 \pm 0.001$ \\
SCARF-F        &  $0.249 \pm 0.001$ &  $0.476 \pm 0.002$ &  $0.855 \pm 0.000$ &  $0.943 \pm 0.001$ \\
SCARF          &  $0.308 \pm 0.002$ &  $0.552 \pm 0.001$ &  $0.907 

  tex_table = round_df_pivot.to_latex(
