# Exploring the clusters with Nomic Atlas

In [1]:
import re

import numpy as np
import pandas as pd
import torch
from nomic import atlas
from tqdm.notebook import tqdm

In [2]:
import pickle

# Read the pickled segment metadata; later cells iterate over `sg_db` and
# index each entry by string keys such as "sgm_cls".
with open("../data/processed/sg_db.bin", "rb") as db_file:
    sg_db = pickle.load(db_file)

In [3]:
%%time
# Tabulate the segment classes and keep only the frequent ones

# Class label of every entry in sg_db, in the same order as sg_db
sgm_cls = np.array([record["sgm_cls"] for record in sg_db])

# Number of segments carrying each label, most frequent first
class_counts = pd.Series(sgm_cls).value_counts()
print(class_counts)

# Labels that occur more than 50 times
classes = class_counts[class_counts > 50].index
print(classes)

reaction set-up              1743928
work-up                      1492285
purification                  848536
analysis                      783855
purification and analysis      12490
Reaction set-up                   19
Conversion                         4
purification and work-up           2
Reaction Set-up                    2
Reportation                        1
Reporting                          1
dtype: int64
Index(['reaction set-up', 'work-up', 'purification', 'analysis',
       'purification and analysis'],
      dtype='object')
CPU times: user 1.07 s, sys: 164 ms, total: 1.23 s
Wall time: 1.23 s


In [4]:
import urllib.parse

import requests


def cdk(smiles):
    """
    Build an HTML snippet whose <img> tag points at a CDK Depict rendering of `smiles`.

    The SMILES string is percent-encoded and sent as the `smi` query parameter of the
    depiction URL. Nothing is downloaded here: the returned markup only references the URL.
    """
    encoded = urllib.parse.quote(smiles)
    src = f"https://www.simolecule.com/cdkdepict/depict/bot/svg?smi={encoded}&w=-1&h=-1&abbr=on&hdisp=bridgehead&zoom=1.3&annotate=colmap&r=0"

    # The snippet starts with a newline, indents every line by four spaces and
    # ends with a whitespace-only line.
    pad = "    "
    pieces = [
        "",
        pad + "<?xml version='1.0' encoding='UTF-8'?>",
        pad + '<!DOCTYPE img PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">',
        pad + '<img src="' + src + '">',
        pad,
    ]
    return "\n".join(pieces)


from IPython.display import HTML, display

# Try the depiction helper on a single atom-mapped reaction SMILES and
# render the returned markup inline.
d = cdk(
    "[Al+3].[Cl-].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][CH2:8][C:9](Cl)=[O:10].[C:12]1([CH3:18])[CH:17]=[CH:16][CH:15]=[CH:14][CH:13]=1>>[Cl:5][CH2:6][CH2:7][CH2:8][C:9]([C:15]1[CH:16]=[CH:17][C:12]([CH3:18])=[CH:13][CH:14]=1)=[O:10] |f:0.1.2.3|"
)

display(HTML(d))

In [None]:
# Reaction-class table; later cells use its `orig_smi` and `cls_1` columns
rxn_cls = pd.read_csv(filepath_or_buffer="../data/processed/rxn_classes_namerxn.csv")

In [38]:
# Collapse repeated (`orig_smi`, `cls_1`) pairs, keeping the first occurrence of each
rcls_dd = rxn_cls.drop_duplicates(subset=["orig_smi", "cls_1"], keep="first")

In [48]:
# Upper bound on the number of points uploaded per map, and the seed that
# makes the random subsample reproducible across runs.
MAX_POINTS = 100000
SAMPLE_SEED = 0


def pp(b, cls):
    """
    Build the record uploaded to Atlas for one segment.

    Returns a copy of the metadata dict `b` in which `prd_str` is converted to
    a string, `rxn_img` holds the depiction markup for `b["rxn_smi"]`, and
    `rxn_cls` holds `cls`. The input dict is left untouched, so the entries
    stored in `sg_db` are not modified by building a map.
    """
    return {
        **b,
        "prd_str": str(b["prd_str"]),
        "rxn_img": cdk(b["rxn_smi"]),
        "rxn_cls": cls,
    }


# One class per reaction SMILES. `rcls_dd` is deduplicated on the
# (`orig_smi`, `cls_1`) pair, so the same SMILES may still appear with several
# classes; joining on it would return more rows than sampled segments and
# shift every following class onto the wrong record. Keeping the first class
# per SMILES and looking values up with `map` always yields exactly one value
# (or NaN when the SMILES is unknown) per sampled segment, in order.
cls_by_smi = rcls_dd.drop_duplicates(subset="orig_smi").set_index("orig_smi")["cls_1"]

rng = np.random.default_rng(SAMPLE_SEED)

for cl in tqdm(classes, total=len(classes)):
    # Boolean mask over sg_db selecting the segments of this class
    idx = np.asarray(sgm_cls) == cl

    print(f"There are {idx.sum()} {cl}s")

    # Load subset and make map. From here on `cl` is the file-name form of the
    # label (spaces and hyphens replaced by underscores).
    cl = re.sub(" |-", "_", cl)
    embs = torch.load(f"../data/processed/embeds/segm_embs_{cl}.pt")

    # NOTE(review): assumes row k of `embs` belongs to the k-th segment of this
    # class in sg_db order -- confirm against the code that wrote the file.
    meta = [b for b, keep in zip(sg_db, idx) if keep]

    # Select subsample and plot
    sampl_idx = rng.permutation(embs.shape[0])[:MAX_POINTS]

    embs_ss = embs[sampl_idx]
    meta_ss = [meta[k] for k in sampl_idx]

    smis = pd.Series([b["rxn_smi"] for b in meta_ss], name="rxn_smi")
    rxn_clss = smis.map(cls_by_smi)

    meta_ss = [pp(b, cls) for b, cls in tqdm(zip(meta_ss, rxn_clss), total=len(meta_ss))]

    response = atlas.map_embeddings(
        name=f"Semantic synthesis: {cl}",
        embeddings=embs_ss.numpy(),
        data=meta_ss,
        colorable_fields=["rxn_cls", "stp_ord", "src_prg"],
        reset_project_if_exists=True,
    )
    print(response)

    # Only the first class is mapped; remove this break to build one map per class.
    break

  0%|          | 0/5 [00:00<?, ?it/s]

There are 1743928.0 reaction set-ups


  0%|          | 0/100000 [00:00<?, ?it/s]

[32m2023-09-30 05:03:09.351[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m790[0m - [1mCreating project `Semantic synthesis: reaction_set_up` in organization `doncamilom`[0m
[32m2023-09-30 05:03:10.480[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m108[0m - [1mUploading embeddings to Atlas.[0m

  0%|                                                                                                                          | 0/99 [00:00<?, ?it/s][A
  1%|█▏                                                                                                                | 1/99 [00:01<03:12,  1.96s/it][A
  3%|███▍                                                                                                              | 3/99 [00:02<00:56,  1.69it/s][A
 11%|████████████▌                                                                                                    | 11/99 [00:02<00:15,  5.58it/s][A
 12%|█████████████▋  

Semantic synthesis: reaction_set_up: https://atlas.nomic.ai/map/d9350369-60bd-4eee-84ec-1f2637687b96/f371e178-2e49-4a25-bad5-fac75a0884c3
