# Creation of the translation dictionary

In [None]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

## Create list of unique compounds and their occurrences

### Reading raw data

In [None]:
# Read the raw CJHIF file: tab-separated, with no header row, so the columns
# start out labelled by integer position.
raw_data_path = "../data/raw/data_from_CJHIF_utf8"
cjhif = pd.read_csv(raw_data_path, sep="\t", header=None)

# Missing values are replaced by the literal placeholder string "empty"
# (not by ""), which later cells filter on.
cjhif = cjhif.fillna("empty")

# Columns 1 and 2 are not used in this notebook.
cjhif = cjhif.drop(columns=[1, 2])

# Give the remaining positional columns descriptive names.
column_names = {
    0: "rxn_smiles",
    3: "reagent",
    4: "solvent",
    5: "catalyst",
    6: "yield",
}
cjhif = cjhif.rename(columns=column_names)

In [None]:
# Build one "|"-separated string per reaction, in the order
# reagent | catalyst | solvent.
reagent_text = cjhif["reagent"].astype(str)
catalyst_text = cjhif["catalyst"].astype(str)
solvent_text = cjhif["solvent"].astype(str)
cjhif["all_compounds"] = reagent_text.str.cat(
    [catalyst_text, solvent_text], sep="|"
)

In [None]:
# One entry per individual compound name: split every merged string on "|",
# flatten the resulting lists, and discard the original row labels.
all_compounds = cjhif["all_compounds"].str.split("|").explode()
all_compounds = all_compounds.reset_index(drop=True)

# Remove the "empty" placeholder entries.
all_compounds = all_compounds.loc[all_compounds != "empty"]

# Count how often each name occurs, then order the table by compound name.
counts_table = all_compounds.value_counts().reset_index()
counts_table.columns = ["Compound", "Occurrences"]
counts_table = counts_table.sort_values(by="Compound")
compound_counts = counts_table.reset_index(drop=True)

In [None]:
# Occurrence table: one row per compound name with its count, ordered by name.
# NOTE(review): the preceding cell already builds this same table from
# `all_compounds`; this cell looks redundant - confirm before removing.
occurrences = all_compounds.value_counts()
compound_counts = occurrences.reset_index()
compound_counts.columns = ["Compound", "Occurrences"]
compound_counts = compound_counts.sort_values(by="Compound")
compound_counts = compound_counts.reset_index(drop=True)

In [None]:
# Report the number of rows in the occurrence table (one row per distinct
# compound name).
print(f"There are {len(compound_counts)} unique compounds in the dataset.")

In [None]:
# Write the name/occurrence table as tab-separated text, without the row index.
counts_output_path = "../data/helper/cjhif_translation_table.tsv"
compound_counts.to_csv(counts_output_path, index=False, sep="\t")

## Translate each compound name to SMILES using PubChem API and py2opsin

In [None]:
import pubchempy as pcp  # pip install pubchempy
from py2opsin import py2opsin  # pip install py2opsin

In [None]:
# Functions for PubChem and py2opsin translation


def query_opsin_pubchem(name: str) -> str:
    """Translate a compound name to SMILES with OPSIN, falling back to PubChem.

    Args:
        name (str): name of the compound to look up

    Returns:
        str: the py2opsin result when it is non-empty; otherwise the
            ``isomeric_smiles`` of the first PubChem name match, or the
            placeholder string "empty" when PubChem returns no match
    """

    opsin_result = py2opsin(name)

    # Treat every falsy result as a miss, not only "": py2opsin's documentation
    # describes a False return when an error occurs, and that value must not
    # be handed back to the caller as if it were a SMILES string.
    if opsin_result:
        return opsin_result

    matches = pcp.get_compounds(name, "name")
    if not matches:
        return "empty"

    # NOTE(review): this attribute may be None for some PubChem records -
    # confirm against pubchempy; callers should be prepared for that.
    return matches[0].isomeric_smiles


def get_smiles_opsin_pubchem(name: str, format_bond=True) -> str:
    """Translate one or more "|"-separated compound names to a SMILES string.

    Each name is looked up with ``query_opsin_pubchem``. Duplicate results are
    collapsed, results equal to "None" or "" are dropped, and the remaining
    structures are joined with ".". Any other lookup result, including a
    placeholder for "not found", is kept as-is.

    Args:
        name (str): compound name, or several names separated by "|"
        format_bond (bool): if True, replace '.' with '~' inside each
            individual structure before joining, so that the '.' separators
            in the result only mark boundaries between looked-up names

    Returns:
        str: the structures joined by ".", in the order the names first
            appear in ``name``; "" if nothing remains
    """

    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so
    # the joined string is the same on every run (iteration order of a set of
    # strings can change between interpreter runs).
    structures = dict.fromkeys(
        str(query_opsin_pubchem(part)) for part in name.split("|")
    )

    # str() turns a missing (None) lookup into "None"; drop it and empty hits.
    structures.pop("None", None)
    structures.pop("", None)

    if format_bond:
        structures = [structure.replace(".", "~") for structure in structures]

    return ".".join(structures)

In [None]:
# Look up a SMILES string for every compound name in the table.
# [WARNING]: This takes a long time to run; the translation dictionary is
# already provided in the dataset folder.


smiles_column = compound_counts["Compound"].map(get_smiles_opsin_pubchem)
compound_counts["SMILES"] = smiles_column

In [None]:
# Keep only rows whose compound name is not the "empty" placeholder, then
# write the table as tab-separated text without the row index.
has_real_name = compound_counts["Compound"] != "empty"
compound_counts = compound_counts[has_real_name]

translation_table_path = "../data/helper/cjhif_translation_table.tsv"
compound_counts.to_csv(translation_table_path, index=False, sep="\t")