In [None]:
import pandas as pd

from rxn.chemutils.reaction_smiles import parse_any_reaction_smiles, ReactionEquation
from rxn.chemutils.tokenization import tokenize_smiles
from rxn.chemutils.multicomponent_smiles import canonicalize_multicomponent_smiles
from rxn.chemutils.reaction_equation import canonicalize_compounds, sort_compounds

from rxn.chemutils.utils import remove_atom_mapping

from dar.tagging import get_tagged_products

In [None]:
def cast_agents_as_reactants(reaction_smiles: str) -> str:
    """
    Merge the agents of a reaction into its reactants.

    The reaction SMILES is parsed, every agent is appended after the
    existing reactants, the agent list is emptied, and the reaction is
    serialised back to a string.

    Args:
        reaction_smiles: Reaction SMILES string
    Returns:
        Reaction SMILES in which the former agents are listed as reactants.
    """
    equation = parse_any_reaction_smiles(reaction_smiles)

    # Snapshot the agents first so the agent slot can be emptied before the
    # reactant list is extended; the final state matches extend-then-clear.
    agents_to_move = list(equation.agents)
    equation.agents = []
    equation.reactants.extend(agents_to_move)

    return equation.to_string()


In [None]:
def standardise_reaction(reaction_smiles: str, remove_atom_maps: bool = True) -> str:
    """
    Canonicalises and sorts the compounds of a reaction SMILES. Optionally removes atom mapping.

    Args:
        reaction_smiles: Reaction SMILES string
        remove_atom_maps: If True (default), strip the atom mapping before
            the reaction is parsed and canonicalised
    Returns:
        Standardised reaction SMILES, or an empty string if any step of the
        standardisation raised an exception.
    """
    try:
        if remove_atom_maps:
            reaction_smiles = remove_atom_mapping(reaction_smiles)

        # Parse the (possibly unmapped) input string; parsing the input
        # directly keeps the remove_atom_maps=False path working.
        reaction = parse_any_reaction_smiles(reaction_smiles)

        reaction = canonicalize_compounds(reaction)
        reaction = sort_compounds(reaction)

        return reaction.to_string()
    except Exception:
        # Best-effort: unprocessable reactions are flagged with "" so callers
        # can filter them out. KeyboardInterrupt/SystemExit still propagate.
        return ""

# Tagging Products from Reaction SMILES
As an example, we demonstrate a basic pre-processing workflow for tagging products from reaction SMILES. We leave it to the user to carry out their own pre-processing and cleaning steps as suits their needs. We start from the USPTO dataset, which contains atom-mapped reaction SMILES; we recommend remapping the reaction SMILES with RXNMapper or an atom-mapping tool of your choice. Here we use the file '1976_Sep2016_USPTOgrants_smiles.rsmi', obtained by unzipping '1976_Sep2016_USPTOgrants_smiles.7z', which can be found at:

[US Patent Office extracts (USPTO) by Lowe](https://figshare.com/articles/dataset/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873)



In [None]:
# Read the tab-separated USPTO reaction file into a DataFrame.
uspto_rsmi_file = '1976_Sep2016_USPTOgrants_smiles.rsmi'
df = pd.read_csv(uspto_rsmi_file, sep='\t')

In [None]:
# Work on a random subset of at most 1000 reactions to keep the example fast.
# The fixed seed makes Restart & Run All select the same rows every time, and
# capping n avoids a ValueError if the file holds fewer than 1000 rows.
df = df.sample(n=min(1000, len(df)), random_state=42)

### Basic pre-processing

Note: the pre-processing shown here is a simplified version of that used in the manuscript. Please refer to the SI for a full list of pre-processing steps or adapt as needed to your problem. **Only exact duplicates of the standardised reaction SMILES are dropped below; any other duplicates will remain.**

In [None]:
# Rewrite every reaction so that its agents are listed among the reactants.
df["ReactionSmiles"] = df["ReactionSmiles"].map(cast_agents_as_reactants)

Split reactions into reactants and products

In [None]:
# Split each reaction SMILES at ">>" into its two sides and store them as
# separate columns.
reaction_sides = df["ReactionSmiles"].str.split(">>", expand=True)
df[["reactants", "products"]] = reaction_sides

In [None]:
# A product string holding k '.' separators lists k + 1 dot-separated entries.
df["number_of_products"] = df["products"].map(lambda smiles: smiles.count('.') + 1)

In [None]:
# A reactant string holding k '.' separators lists k + 1 dot-separated entries.
df["number_of_reactants"] = df["reactants"].map(lambda smiles: smiles.count('.') + 1)

In [None]:
# Keep reactions that have at least two reactants and exactly one product.
has_multiple_reactants = df["number_of_reactants"] >= 2
has_single_product = df["number_of_products"] == 1
filtered_df = df.loc[has_multiple_reactants & has_single_product].copy()

In [None]:
# Standardise every reaction SMILES with the default settings; reactions that
# fail to standardise come back as an empty string.
filtered_df["ReactionSmiles"] = filtered_df["ReactionSmiles"].map(standardise_reaction)

In [None]:
# Discard the rows whose reaction SMILES is the empty string.
has_reaction = filtered_df["ReactionSmiles"].ne("")
filtered_df = filtered_df.loc[has_reaction].copy()

In [None]:
# Keep only the first row for each distinct reaction SMILES. The explicit
# copy keeps later column assignments from warning about writing to a slice.
filtered_df = filtered_df.drop_duplicates(subset=["ReactionSmiles"]).copy()

Tag products 

In [None]:
# Tag the product of every reaction. The "reactants" and "products" columns
# read here still come from the split made before standardisation, so they
# have not been canonicalised or stripped of atom maps.
filtered_df["tagged_products"] = filtered_df.apply(
    lambda row: get_tagged_products(row["reactants"], row["products"]),
    axis=1,
)

In [None]:
# Re-derive both sides from the standardised reaction SMILES, overwriting the
# columns that were split before standardisation.
standardised_sides = filtered_df["ReactionSmiles"].str.split(">>", expand=True)
filtered_df[["reactants", "products"]] = standardised_sides

Tokenise samples for training the disconnection-aware retrosynthesis model and save

In [None]:
# Tokenise the reactant and tagged-product columns.
for column_name in ("reactants", "tagged_products"):
    filtered_df[column_name] = filtered_df[column_name].apply(tokenize_smiles)

# Uncomment to write each tokenised column to a text file (no index, no header):
#filtered_df['reactants'].to_csv('precursors_tokens.txt', index=False, header=False)
#filtered_df['tagged_products'].to_csv('tagged_products_tokens.txt', index=False, header=False)

Prepare data for training the model for automatic identification of disconnection sites

In [None]:
# Tokenise the product column.
filtered_df["products"] = filtered_df["products"].map(tokenize_smiles)

In [None]:
#filtered_df['products'].to_csv('products_tokens.txt', index=False, header=False)
#filtered_df['tagged_products'].to_csv('tagged_products_tokens.txt', index=False, header=False)