In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from mist_cf.common.plot_utils import *
from mist_cf import common, decomp
from collections import Counter
import json
# Apply the plotting style for this notebook.
# NOTE(review): `set_style` has no explicit import here; presumably it comes from the
# wildcard import of mist_cf.common.plot_utils above — confirm.
set_style()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset selection: the data folder and label file below are derived from this name.
dataset_name = "nist_canopus"
# Folder holding the dataset, relative to the notebook's working directory.
data_folder = Path("..") / "data" / dataset_name
# Tab-separated label table for the dataset.
labels = data_folder.joinpath("labels.tsv")

In [3]:
# Read the tab-separated label table.
df = pd.read_csv(labels, sep="\t")
# Subset whose `spec` entry does not contain "nist". `na=True` makes a missing
# `spec` count as a match, so such rows are dropped from the subset.
df_no_nist = df[~df["spec"].str.contains("nist", na=True)]

In [4]:
# Row counts: (all labels, labels without "nist" spectra).
tuple(len(frame) for frame in (df, df_no_nist))

(45838, 10709)

In [5]:
# Shape of the per-inchikey count table for each frame; the first entry of each
# shape is the number of distinct inchikey groups.
tuple(frame.groupby("inchikey").count().shape for frame in (df, df_no_nist))

((30950, 7), (8553, 7))

In [6]:
# Shape of the per-formula count table for each frame; the first entry of each
# shape is the number of distinct formula groups.
tuple(frame.groupby("formula").count().shape for frame in (df, df_no_nist))

((15315, 7), (5433, 7))

In [7]:
# Non-null counts per ionization type: full table first, then the subset
# without "nist" spectra. A single display call renders both in order.
display(
    df.groupby("ionization").count(),
    df_no_nist.groupby("ionization").count(),
)

Unnamed: 0_level_0,dataset,spec,name,formula,smiles,inchikey,instrument
ionization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
[M+H3N+H]+,1412,1412,681,1412,1412,1412,1412
[M+H]+,29642,29642,21439,29642,29642,29642,29642
[M+K]+,158,158,138,158,158,158,158
[M+Na]+,5968,5968,4888,5968,5968,5968,5968
[M-H2O+H]+,7104,7104,6276,7104,7104,7104,7104
[M-H4O2+H]+,1507,1507,1276,1507,1507,1507,1507
[M]+,47,47,0,47,47,47,47


Unnamed: 0_level_0,dataset,spec,name,formula,smiles,inchikey,instrument
ionization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
[M+H3N+H]+,718,718,0,718,718,718,718
[M+H]+,8030,8030,0,8030,8030,8030,8030
[M+K]+,20,20,0,20,20,20,20
[M+Na]+,998,998,0,998,998,998,998
[M-H2O+H]+,707,707,0,707,707,707,707
[M-H4O2+H]+,189,189,0,189,189,189,189
[M]+,47,47,0,47,47,47,47


In [8]:
# Fold assignment file for split 1.
split_1 = data_folder / "splits" / "split_1.tsv"
split_df = pd.read_csv(split_1, sep="\t")
# Count of entries assigned to each value of the Fold_0 column.
split_df.groupby(by="Fold_0").count()

Unnamed: 0_level_0,spec
Fold_0,Unnamed: 1_level_1
test,2205
train,7727
val,777


In [9]:
# Fold assignment file for split 1 including the NIST entries.
# Note: this rebinds `split_1` and `split_df` from the previous cell.
split_1 = data_folder / "splits" / "split_1_with_nist.tsv"
split_df = pd.read_csv(split_1, sep="\t")
# Count of entries assigned to each value of the Fold_0 column.
split_df.groupby(by="Fold_0").count()

Unnamed: 0_level_0,spec
Fold_0,Unnamed: 1_level_1
exclude,4078
test,2205
train,38778
val,777


## Investigating element counts

In [18]:
# Formulae analysed in the cells below, taken from the full label table.
# Despite the name, `.values` keeps one entry per label row (duplicates
# included), so the printed length equals the number of rows.
# Alternative inputs (swap in for the assignment below):
#   df_no_nist['formula'].values   -> only rows without "nist" spectra
#   pd.read_csv("../data/casmi22/CASMI_labels.tsv", sep="\t")['formula'].unique()
uniq_forms = df['formula'].values
print(len(uniq_forms))

45838


In [19]:
# Turn every formula into its dense element-count vector (via
# common.formula_to_dense) and stack the results into a single array,
# one row per formula.
vecs = np.array([common.formula_to_dense(form) for form in uniq_forms])

In [20]:
# Element symbol -> column index, as defined by the `common` module.
el_to_ind = common.element_to_ind
# Inverse lookup: column index -> element symbol.
ind_to_el = dict(zip(el_to_ind.values(), el_to_ind.keys()))

In [21]:
# Column-wise maximum: the largest count seen for each element.
max_el = vecs.max(axis=0)
# Pair every column's element symbol with its maximum count.
el_max_pairs = [(ind_to_el[col], col_max) for col, col_max in enumerate(max_el)]
# Largest maxima first; ties keep their column order (stable sort).
el_max_pairs.sort(key=lambda pair: pair[1], reverse=True)
# One "symbol: max" line per element.
tuples = "\n".join(f"{symbol}: {col_max}" for symbol, col_max in el_max_pairs)

print(tuples)

H: 125.0
C: 72.0
O: 36.0
N: 17.0
F: 17.0
Cl: 10.0
P: 6.0
S: 6.0
Si: 6.0
I: 4.0
Br: 3.0
Se: 2.0
As: 2.0
B: 1.0
Fe: 1.0
Co: 0.0
K: 0.0
Na: 0.0


In [22]:
# Histogram per element: maps each element symbol to a Counter of
# {atom count: number of formulae with that count} over the whole array.
el_to_counts = {
    ind_to_el[col]: Counter(vecs[:, col].astype(int))
    for col in range(max_el.shape[0])
}

In [23]:
# For every element, find the smallest atom count r such that at least
# `thresh` of all formulae contain <= r atoms of that element.
thresh = 0.95
# Candidate atom counts that are scanned (0..255).
el_range = np.arange(256)
el_to_thresh = {}
for el, counts in el_to_counts.items():
    total = sum(counts.values())
    # Fraction of formulae with at most r atoms, for each candidate r.
    frac_at_most = [
        sum(freq for atoms, freq in counts.items() if atoms <= r) / total
        for r in el_range
    ]
    # Index of the first candidate reaching the threshold; raises IndexError
    # if no candidate in el_range reaches it.
    first_hit = np.flatnonzero(np.array(frac_at_most) >= thresh)[0]
    el_to_thresh[el] = float(first_hit)
print(el_to_thresh)
# Same mapping again, pretty-printed as JSON.
print(json.dumps(el_to_thresh, indent=4))


{'C': 34.0, 'N': 4.0, 'P': 0.0, 'O': 13.0, 'S': 1.0, 'Si': 0.0, 'I': 0.0, 'H': 49.0, 'Cl': 1.0, 'F': 1.0, 'Br': 0.0, 'B': 0.0, 'Se': 0.0, 'Fe': 0.0, 'Co': 0.0, 'As': 0.0, 'K': 0.0, 'Na': 0.0}
{
    "C": 34.0,
    "N": 4.0,
    "P": 0.0,
    "O": 13.0,
    "S": 1.0,
    "Si": 0.0,
    "I": 0.0,
    "H": 49.0,
    "Cl": 1.0,
    "F": 1.0,
    "Br": 0.0,
    "B": 0.0,
    "Se": 0.0,
    "Fe": 0.0,
    "Co": 0.0,
    "As": 0.0,
    "K": 0.0,
    "Na": 0.0
}


In [24]:
# Check what fraction of formula vectors fit under the default element bounds.
EL_STR_DEFAULT = 'C[0-]N[0-]O[0-]H[0-]S[0-5]P[0-3]I[0-1]Cl[0-1]F[0-1]Br[0-1]'
# Parse the bound string once and reuse the result (previously it was parsed
# twice and the first result was discarded).
el_bounds = decomp.sirius_decomp.parse_element_str(EL_STR_DEFAULT)
# Build the "maximal" formula by repeating each element symbol `max` times.
# NOTE(review): this relies on el_dict['max'] being an int; open-ended ranges
# such as `C[0-]` presumably parse to a large sentinel — confirm in sirius_decomp.
max_form = ""
for el, el_dict in el_bounds.items():
    max_form += el * el_dict['max']
dense_max = common.formula_to_dense(max_form)
print(dense_max)

# Fraction of rows in `vecs` whose counts are <= the bound for every element.
print((vecs <= dense_max).all(axis=1).mean())

[999. 999.   3. 999.   5.   0.   1. 999.   1.   1.   1.   0.   0.   0.
   0.   0.   0.   0.]
0.9316505955757232


In [25]:
# Inspect the count histogram for bromine. The element is named explicitly
# rather than read from a loop variable left over from an earlier cell, so the
# cell gives the same result regardless of which cells ran before it.
el_to_counts["Br"]

Counter({0: 43850, 1: 1838, 2: 144, 3: 6})