## Fingerprint comparison on CANOPUS dataset

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
from mist.utils.plot_utils import *
from itertools import product
from mist.utils import analysis_utils 

from matplotlib.patches import Patch
import matplotlib.ticker as mtick

# Re-import analysis_utils so that edits made to the module on disk are picked
# up without restarting the kernel.
from importlib import reload
reload(analysis_utils)

# Apply the plotting style. `set_style` is not defined in this notebook; it is
# presumably supplied by the wildcard import from mist.utils.plot_utils.
set_style()

  from .autonotebook import tqdm as notebook_tqdm


## Load results and compute metrics

In [3]:
# Directory that receives the artifacts written by this notebook; created
# (together with any missing parents) if it does not exist yet.
out_folder = Path("../results/figures/canopus_fp_compare/")
out_folder.mkdir(exist_ok=True, parents=True)

# Method label -> folder that is searched for that method's result pickles.
res_folders = dict(
    mist="../results/canopus_fp_mist/",
    ffn="../results/canopus_fp_ffn/",
)

In [4]:
# Metric registries from analysis_utils: each maps a metric name to a function
# that is called below as fn(preds, targs).
bit_metrics = analysis_utils.bit_metrics
spec_metrics = analysis_utils.spec_metrics

# Long-format records: one per (spectrum, metric) and one per (bit, metric).
out_df = []
for method_name, res_folder in res_folders.items():
    res_folder = Path(res_folder)
    # Recursively collect every result pickle whose file name contains
    # "canopus_train" under this method's results folder.
    for res_file in res_folder.rglob("*canopus_train*.p"):
        print(method_name)
        # NOTE: pickle.load can execute arbitrary code, so only load result
        # files produced by trusted runs. The context manager guarantees the
        # file handle is closed once the payload has been read.
        with open(res_file, "rb") as res_fh:
            res_out = pickle.load(res_fh)
        # Runs that did not record a seed are attributed to seed 0.
        seed = res_out['args']['seed'] if 'seed' in res_out['args'] else 0
        split = res_out['split_name']

        # Compute all_metrics
        p, t = (np.array(res_out["preds"]), np.array(res_out["targs"]))
        names = res_out["names"]

        # Spectrum-level metrics: results are paired one-to-one with `names`.
        for spec_metric, val_fn in spec_metrics.items():
            extra_args = {}
            if spec_metric == "Tani":
                # The "Tani" metric additionally receives a `thresh` keyword
                # (presumably the cutoff used to binarize predictions -- confirm
                # against analysis_utils).
                extra_args['thresh'] = 0.2

            res = val_fn(p, t, **extra_args)
            for name, r in zip(names, res):
                new_out = {"name": name,
                           "val": r,
                           "metric": spec_metric,
                           "type": "spectra",
                           "method": method_name,
                           "seed": seed,
                           "split": split}
                out_df.append(new_out)

        # Bit-level metrics: results are keyed by their position in the output,
        # stored in the "name" field as an integer index.
        for bit_metric, val_fn in bit_metrics.items():
            res = val_fn(p, t)
            for bit_num, r in enumerate(res):
                new_out = {"name": bit_num,
                           "val": r,
                           "metric": bit_metric,
                           "type": "bits",
                           "method": method_name,
                           "seed": seed,
                           "split": split}
                out_df.append(new_out)

mist
ffn


  res = np.log(x)
  res = np.log(x)


In [5]:
# Materialize the accumulated records as a long-format frame.
out_df = pd.DataFrame(out_df)

# Parse the split index from the run of digits that ends the split name
# (e.g. "canopus_hplus_100_0" -> 0). Capturing the whole trailing run keeps
# multi-digit indices intact rather than only their last character.
split_num = out_df['split'].str.extract(r"(\d+)$", expand=False).astype(int)
out_df['split_num'] = split_num
# Unfiltered copy, so that later filtering can always start from the full table.
out_df_orig = out_df.copy()

In [6]:
# Show the distinct split names present in the loaded results.
out_df['split'].unique()

array(['canopus_hplus_100_0'], dtype=object)

In [7]:
# Split indices to analyze; rows from every other split are dropped. The filter
# always starts from the unfiltered copy, so this cell can be re-run safely.
keep_split_nums = [0]
out_df = out_df_orig.loc[out_df_orig['split_num'].isin(keep_split_nums)]

## Compute summary statistics for the results table

In [8]:
# Per-(method, metric) summary statistics of the metric values. The "val"
# column is selected before aggregating so that only the numeric column is
# reduced; the frame also carries string/mixed columns (e.g. "name", "split")
# that std()/mean() cannot aggregate.
grouped_vals = out_df.groupby(["method", "metric"])["val"]
stds = grouped_vals.std().reset_index()
counts = grouped_vals.count().reset_index()
means = grouped_vals.mean().reset_index()

In [9]:
# Bring all three summary frames into the same (method, metric) row order so
# that they can be combined row by row.
for summary_frame in (stds, means, counts):
    summary_frame.sort_values(by=["method", "metric"], inplace=True)

# Half-width of the confidence interval around each mean: 1.96 * std / sqrt(n),
# where 1.96 is the z-score of a two-sided 95% normal interval.
confs_vals = stds['val'].mul(1.96).div(np.sqrt(counts['val']))
confs = stds.copy()
confs['val'] = confs_vals

In [10]:
# Show the long-format means first, then the same numbers laid out as a
# method x metric grid (the final expression is rendered as the cell output).
display(means)
means.pivot_table(values='val', index='method', columns='metric')

Unnamed: 0,method,metric,val
0,ffn,Cosine,0.558328
1,ffn,LL_bit,-0.032357
2,ffn,LL_spec,-0.032357
3,ffn,Tani,0.35363
4,mist,Cosine,0.703056
5,mist,LL_bit,-0.022403
6,mist,LL_spec,-0.022403
7,mist,Tani,0.508239


metric,Cosine,LL_bit,LL_spec,Tani
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ffn,0.558328,-0.032357,-0.032357,0.35363
mist,0.703056,-0.022403,-0.022403,0.508239


In [11]:
# Format every (method, metric) cell as "$mean \pm conf$". `means` and `confs`
# are paired positionally, so both must be in the same row order.
new_col = [fr"${mean:0.3f} \pm {conf:0.3f}$" for mean, conf in zip(means["val"], confs["val"])]
latex_df = means.copy()
latex_df['val'] = new_col


# Reshape to one row per method and one column per metric. This is a pure
# reshape of already-formatted strings, so `pivot` is used: it moves the values
# unchanged and raises if a (method, metric) pair occurs more than once.
latex_table = latex_df.pivot(index="method", columns="metric", values="val")
latex_table.columns.name = None
latex_table.index.name = "Method"


# Rows keep the order produced by the pivot; only display names are applied.
# `method_rename`, `metric_order` and `metric_rename` are not defined in this
# notebook and are presumably supplied by the wildcard plot_utils import.
latex_table.index = [method_rename.get(i, i) for i in latex_table.index]

# Keep only the metrics listed in `metric_order`, in that order, then apply
# their display names.
new_cols = [i for i in metric_order if i in latex_table.columns]
latex_table = latex_table.reindex(columns=new_cols)
latex_table.columns = [metric_rename.get(i, i) for i in latex_table.columns]

# Convert to latex; escape=False keeps the "$...$" math markup verbatim.
latex_out = latex_table.to_latex(escape=False)
display(latex_table)
print(latex_out)

with open(out_folder / "latex_table.tex", "w") as f:
    f.write(latex_out)

  latex_out = latex_table.to_latex(escape=False)


Unnamed: 0,Tanimoto,Cosine sim.,Log likelihood (spectra),Log likelihood (bits)
FFN,$0.354 \pm 0.013$,$0.558 \pm 0.014$,$-0.032 \pm 0.001$,$-0.032 \pm 0.002$
MIST,$0.508 \pm 0.015$,$0.703 \pm 0.012$,$-0.022 \pm 0.001$,$-0.022 \pm 0.001$


\begin{tabular}{lllll}
\toprule
{} &           Tanimoto &        Cosine sim. & Log likelihood (spectra) & Log likelihood (bits) \\
\midrule
FFN  &  $0.354 \pm 0.013$ &  $0.558 \pm 0.014$ &       $-0.032 \pm 0.001$ &    $-0.032 \pm 0.002$ \\
MIST &  $0.508 \pm 0.015$ &  $0.703 \pm 0.012$ &       $-0.022 \pm 0.001$ &    $-0.022 \pm 0.001$ \\
\bottomrule
\end{tabular}

