In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import mist_cf.common as common
import matplotlib.pyplot as plt

In [3]:
bile_acid_label = "../data/bile_acid/bile_acid_refined_processed_labels.tsv"
bile_acid_label_df = pd.read_csv(bile_acid_label, sep="\t")

nist_label = "../data/nist_canopus/labels.tsv"
nist_df = pd.read_csv(nist_label, sep="\t")
nist_spec_to_formula = dict(zip(nist_df.spec.values, nist_df.formula.values))

nist_split_file = "../data/nist_canopus/splits/split_1_with_nist.tsv"
nist_split_df = pd.read_csv(nist_split_file, sep="\t")
nist_split_df = nist_split_df[nist_split_df["Fold_0"]=="train"]

stat_dir = Path("../results/bile_acid_stat/")
stat_dir.mkdir(exist_ok=True)

## Dataset Statistics

In [4]:
bile_acid_specs = bile_acid_label_df.name.values
bile_acid_formulae = bile_acid_label_df.formula.values

In [5]:
# unique chemical formula out of 1k bile acid spec
len(np.unique(bile_acid_formulae))

47

In [6]:
# unique specs out of 1k bile acid spec
len(np.unique(bile_acid_specs))

213

In [7]:
# form, cnt = np.unique(bile_acid_formulae, return_counts=True)
# pd.DataFrame.from_dict({'Formula': form, 
#                         'Count': cnt}).to_csv(stat_dir / 'bile_acid_unique_form.tsv',
#                                                                sep='\t',
#                                                                index=False)

# spec, cnt = np.unique(bile_acid_specs, return_counts=True)
# pd.DataFrame.from_dict({'Spec': spec, 
#                         'Count': cnt}).to_csv(stat_dir / 'bile_acid_unique_spec.tsv',
#                                                                sep='\t',
#                                                                index=False)

## Overlapped formula and spec found in our training set

In [8]:
train_specs = nist_split_df.spec.values
overlap_specs = []

for b_spec in bile_acid_specs:
    if b_spec in train_specs:
        overlap_specs.append(b_spec)

# The total number of overlapped specs
len(overlap_specs)

39

In [9]:
specs, cnt = np.unique(overlap_specs, return_counts=True)
spec_forms = []
for spec in specs:
    spec_forms.append(nist_spec_to_formula[spec])
pd.DataFrame.from_dict({'Specrum ID': specs, 
                        'Count': cnt,
                        'Precursor Formula': spec_forms
                        }).to_csv(stat_dir / 'bile_acid_unique_overlap_spec_annotated.tsv',
                                                            sep='\t',
                                                            index=False)
# The number of unique overlapped specs
len(specs)

10

In [10]:
nist_split_formulae = [nist_spec_to_formula[spec] for spec in train_specs]
overlap_formulae = []

for b_form in bile_acid_formulae:
    if b_form in nist_split_formulae:
        overlap_formulae.append(b_form)

# The total number of overlapped forms
len(overlap_formulae)

210

In [11]:
unique_overlap_form, cnt = np.unique(overlap_formulae, return_counts=True)
# pd.DataFrame.from_dict({'Precursor Formula': unique_overlap_form,
#                          'Count': cnt}).to_csv(stat_dir / 'overlap_form.tsv',
#                                                                sep='\t',
#                                                                index=False)

# The number of unique forms
len(unique_overlap_form)

24

In [12]:
form, cnt = np.unique(bile_acid_formulae, return_counts=True)
overlap = []
for f in form:
    if f in unique_overlap_form:
        overlap.append('Yes')
    else:
        overlap.append('No')
pd.DataFrame.from_dict({'Formula': form, 
                        'Count': cnt,
                        'Overlap': overlap
                        }).to_csv(stat_dir / 'bile_acid_unique_form_annotated.tsv',
                                                               sep='\t',
                                                               index=False)

## Create MIST-CF output sheet

In [13]:
infile = "../results/mist_cf_predict_bile_acid/formatted_output.tsv"
outfile = "../results/mist_cf_predict_bile_acid/mist_cf_top_5_annotation.csv"

k = 5
df = pd.read_csv(infile, sep="\t")
counted = df.sort_values("scores",
                         ascending=False).groupby('spec').head(k).sort_values(["spec",
                                                                               "scores"],
                                                                              ascending=False)
spec_counts = counted.groupby("spec").count()['scores'].values
inds = [j for i in spec_counts for j in np.arange(1, i+1)]
counted['rank'] = inds
counted = counted.sort_values(by=["spec", "rank"]).reset_index(drop=True)

out_df = counted.set_index('spec').join(bile_acid_label_df.set_index('spec'),
                       how='inner')
out_df=out_df.reset_index()
out_df=out_df.rename(columns={'spec':'ScanNumber',
                              'scores': 'MIST_CF_score',
                              'parentmasses': 'MS1',
                              'name': 'SpectrumID',
                              'formula': 'true_form',
                              'ionization': 'true_ion'
                              })

# Reorder columns
cols = out_df.columns.tolist()
reordered_cols = ["ScanNumber"]+cols[6:]+['MS1',
                                           'cand_form',
                                           'cand_ion',
                                           'MIST_CF_score',
                                           'rank'
                                           ]
out_df = out_df[reordered_cols]
out_df.to_csv(outfile, index=None)

## Compute accuracy not including adducts

In [25]:
res_files = ["../results/sirius_predict_bile_acid/sirius_1/evaluation_results/full_out.tsv",
             "../results/mist_cf_predict_bile_acid/evaluation_results/full_out.tsv",]
model_names = ["Sirius", "MIST-CF"]

all_dfs = []
for res, model in zip(res_files, model_names):
    df = pd.read_csv(res, sep="\t")
    df['model'] = model
    df['split'] = "bile_acid"
    all_dfs.append(df)

all_df = pd.concat(all_dfs, axis=0)

In [28]:
# Group by method, split, and compute top k accuracy through 20

denom = len(bile_acid_label_df)
def top_k_accuracy(x, top_k, key_str="ind_found"):
    k_dict = []
    for k in top_k: 
        acc = np.sum(x[key_str] <= k)  / denom
        k_dict.append({"k": k, "acc": acc})
    return k_dict

top_k = [1, 2, 3]
#top_k_acc = all_df.groupby(["model", "split",]).apply(lambda x: top_k_accuracy(x, top_k))

keys = ["model", "split"]#, "mass_bin"]
key_strs = ["ind_found", "ind_found_adduct", "ind_found_full_form"]
out_df = []
for names, sub_df in all_df.groupby(keys):
    update_dict = dict(zip(keys, names))
    for key_str in key_strs:
        sub_dicts = top_k_accuracy(sub_df, top_k, key_str=key_str)

        out_df.extend([dict(**i, **update_dict, val=key_str) for i in sub_dicts ])
    num_submitted = int(len(sub_df))
    out_df.append(dict(k="num_submitted", val="num_submitted", acc=num_submitted,  **update_dict))
print(out_df)
out_df = pd.DataFrame(out_df)

[{'k': 1, 'acc': 0.7015873015873015, 'model': 'MIST-CF', 'split': 'bile_acid', 'val': 'ind_found'}, {'k': 2, 'acc': 0.8137566137566138, 'model': 'MIST-CF', 'split': 'bile_acid', 'val': 'ind_found'}, {'k': 3, 'acc': 0.9502645502645503, 'model': 'MIST-CF', 'split': 'bile_acid', 'val': 'ind_found'}, {'k': 1, 'acc': 0.726984126984127, 'model': 'MIST-CF', 'split': 'bile_acid', 'val': 'ind_found_adduct'}, {'k': 2, 'acc': 0.9693121693121693, 'model': 'MIST-CF', 'split': 'bile_acid', 'val': 'ind_found_adduct'}, {'k': 3, 'acc': 0.9925925925925926, 'model': 'MIST-CF', 'split': 'bile_acid', 'val': 'ind_found_adduct'}, {'k': 1, 'acc': 0.7788359788359789, 'model': 'MIST-CF', 'split': 'bile_acid', 'val': 'ind_found_full_form'}, {'k': 2, 'acc': 0.8476190476190476, 'model': 'MIST-CF', 'split': 'bile_acid', 'val': 'ind_found_full_form'}, {'k': 3, 'acc': 0.9915343915343915, 'model': 'MIST-CF', 'split': 'bile_acid', 'val': 'ind_found_full_form'}, {'k': 'num_submitted', 'val': 'num_submitted', 'acc': 945,

In [29]:
out_df

Unnamed: 0,k,acc,model,split,val
0,1,0.701587,MIST-CF,bile_acid,ind_found
1,2,0.813757,MIST-CF,bile_acid,ind_found
2,3,0.950265,MIST-CF,bile_acid,ind_found
3,1,0.726984,MIST-CF,bile_acid,ind_found_adduct
4,2,0.969312,MIST-CF,bile_acid,ind_found_adduct
5,3,0.992593,MIST-CF,bile_acid,ind_found_adduct
6,1,0.778836,MIST-CF,bile_acid,ind_found_full_form
7,2,0.847619,MIST-CF,bile_acid,ind_found_full_form
8,3,0.991534,MIST-CF,bile_acid,ind_found_full_form
9,num_submitted,945.0,MIST-CF,bile_acid,num_submitted
