## Dataset analysis

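Compare the paired-spectra datasets: for each dataset, report the number of unique SMILES and spectra, then export a few example spectra (binned spectrum plot plus molecule drawing) to the figures folder.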

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

from mist.utils.plot_utils import *
import mist.utils as utils

In [5]:
# Root directory that receives one sub-folder of figures per dataset.
# Created up front (including parents) so later cells can write into it.
output_fig_folder = Path("../results/figures/datasets/")
output_fig_folder.mkdir(parents=True, exist_ok=True)

# Label tables (TSV) of the datasets to compare; the name of each file's
# parent directory is used as the dataset name downstream.
labels_files = [
    "../data/paired_spectra/csi2022/labels.tsv",
    "../data/paired_spectra/canopus_train/labels.tsv",
]

In [10]:
# For every labelled dataset: print summary statistics, then export a few
# example spectra (molecule drawing + binned spectrum plot) into
# <output_fig_folder>/<dataset_name>/.

NUM_EXAMPLES = 5  # number of example spectra exported per dataset
MIN_SMI_LEN, MAX_SMI_LEN = 15, 40  # exclusive SMILES-length bounds for examples

# Seeded generator so "Restart Kernel -> Run All" exports the same examples.
rng = np.random.default_rng(0)

# Binning grid; loop-invariant, so built once.
# NOTE(review): assumes utils.bin_spectra uses the same linspace grid - confirm.
num_bins, upper_lim = 10000, 1000
bins = np.linspace(0, upper_lim, num_bins)

for label_file in labels_files:
    label_path = Path(label_file)
    dataset_name = label_path.parent.name
    save_name = Path(output_fig_folder) / dataset_name
    save_name.mkdir(exist_ok=True, parents=True)

    # Step 1: Get stats
    df = pd.read_csv(label_path, sep="\t")
    all_smis = set(df["smiles"].values)
    all_specs = set(df["spec"].values)
    print(f"Dataset {dataset_name}")
    print(f"Num unique smiles: {len(all_smis)}")
    print(f"Num unique specs: {len(all_specs)}")

    # Step 2: export example spec
    # Candidates are rows whose SMILES string length is strictly inside the
    # (MIN_SMI_LEN, MAX_SMI_LEN) window.
    smi_lens = df["smiles"].apply(len).values
    candidates = np.flatnonzero((smi_lens > MIN_SMI_LEN) & (smi_lens < MAX_SMI_LEN))
    if candidates.size == 0:
        # Nothing to sample from; sampling an empty array would raise.
        print(f"No SMILES with length in ({MIN_SMI_LEN}, {MAX_SMI_LEN}); skipping examples")
        continue

    # Sample without replacement so the same row is never exported twice.
    examples = rng.choice(
        candidates, size=min(NUM_EXAMPLES, candidates.size), replace=False
    )
    # `examples` holds row positions, so index positionally.
    df_sub = df.iloc[examples]

    # Export df sub
    df_sub.to_csv(save_name / "sub_df.tsv", sep="\t")

    # Keep only the rows whose spectrum file exists *before* building the
    # per-row lists, so names / formulae / SMILES stay aligned with the parsed
    # spectra in the zip below.
    spec_folder = label_path.parent / "spec_files"
    file_found = np.array(
        [(spec_folder / f"{spec_name}.ms").exists() for spec_name in df_sub["spec"]],
        dtype=bool,
    )
    df_found = df_sub[file_found]

    spectra_names = df_found["spec"]
    spectra_formula = df_found["formula"]
    spectra_smiles = df_found["smiles"]
    spec_files = [spec_folder / f"{i}.ms" for i in spectra_names]

    # Parse spec files
    # NOTE(review): parse_spectra(...)[1] is presumably a sequence of
    # (label, array) pairs; the transpose keeps only the arrays - confirm.
    parsed_spec_ars = [list(zip(*utils.parse_spectra(i)[1]))[1] for i in spec_files]

    # Bin each file's spectra, average over the first axis, then normalise.
    # The [None, :] re-adds a leading axis for norm_spectrum; squeeze drops it.
    binned = [
        utils.norm_spectrum(
            utils.bin_spectra(
                parsed_spec_ar,
                num_bins=num_bins,
                upper_limit=upper_lim,
            ).mean(0)[None, :]
        ).squeeze()
        for parsed_spec_ar in parsed_spec_ars
    ]

    for binned_spec, smiles, formula, name in zip(
        binned, spectra_smiles, spectra_formula, spectra_names
    ):
        # Draw molecule
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:  # skip the drawing if the SMILES cannot be parsed
            # SMILES may contain "/" and "\" (bond stereo), which would be
            # read as path separators; replace them in the file name only.
            safe_smiles = smiles.replace("/", "_").replace("\\", "_")
            Draw.MolToImageFile(mol, save_name / f"{name}_{safe_smiles}.png")

        # Plot spectrum
        fig, ax = plt.subplots(figsize=(10, 5))
        peak_inds = np.flatnonzero(binned_spec)
        if peak_inds.size:
            # vlines works in data coordinates. axvline's ymin/ymax are axes
            # fractions, which drew every peak too tall with ylim 1.08.
            ax.vlines(bins[peak_inds], 0, binned_spec[peak_inds])

        ax.set_xlabel("M/Z")
        ax.set_ylabel("I")
        ax.set_ylim([0, 1.08])
        ax.set_xlim([0, upper_lim])
        ax.set_title(f"{name}\n{formula}\n{smiles}")

        fig.savefig(save_name / f"{name}.pdf", bbox_inches="tight")
        # Close rather than clear: releases the figure and avoids empty
        # "Figure ... with 0 Axes" outputs in the notebook.
        plt.close(fig)

Dataset csi2022
Num unique smiles: 27797
Num unique specs: 31145


Dataset canopus_train
Num unique smiles: 8553
Num unique specs: 10709


<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>

<Figure size 1000x500 with 0 Axes>