In [1]:
import re
from collections import defaultdict
from hashlib import md5
from itertools import islice

from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem
from tqdm import tqdm

In [2]:
# Registry of reagents seen so far: canonical SMILES -> integer id.
# process() fills it in place and emits "[A<id>]" tokens from it.
reagents = dict()

In [3]:
def process(smarts, reagent_ids=None):
    """Convert one reaction string into source and target token lists.

    The string is parsed with ``AllChem.ReactionFromSmarts``. Every reactant
    for which ``GetReactingAtoms`` reports no atoms is treated as a reagent
    and replaced by a symbolic token ``[A<id>]``; all other reactants and the
    single product are emitted as SMILES with their atom-map numbers removed.

    Args:
        smarts: Reaction string, e.g. ``reactants>agents>product``.
        reagent_ids: Optional dict mapping reagent SMILES to its integer id.
            Unseen reagents are added to it in place. Defaults to the
            module-level ``reagents`` registry.

    Returns:
        ``(source, target)`` as two lists of strings, where ``source`` holds
        the reactant SMILES followed by the reagent symbols and ``target``
        holds the product SMILES. ``None`` is returned when the string cannot
        be parsed, the reaction cannot be initialised, or it does not have
        exactly one product.
    """
    if reagent_ids is None:
        reagent_ids = reagents

    # A malformed line must not abort a whole batch run: skip it like any
    # other unusable reaction.
    try:
        rxn = AllChem.ReactionFromSmarts(smarts)
    except ValueError:
        return None
    if rxn is None:
        return None

    prods = rxn.GetProducts()
    if len(prods) != 1:
        # Only single-product reactions are kept (this also rejects
        # reactions that have no product at all).
        return None

    rxn.Initialize()
    try:
        # Must happen before the atom maps are cleared below, since the
        # reacting atoms are derived from the mapped templates.
        # NOTE(review): GetReactingAtoms() is called with its defaults; the
        # notebook output shows an unmapped solvent ('CCOCC') kept among the
        # reactants -- confirm whether mappedAtomsOnly=True is intended.
        reactants = list(zip(rxn.GetReactants(), rxn.GetReactingAtoms()))
    except ValueError:
        # Likely that initialization failed
        return None

    target = [_unmapped_smiles(mol) for mol in prods]

    source = []
    reagent_syms = []
    for mol, atoms in reactants:
        smi = _unmapped_smiles(mol)
        if atoms:
            source.append(smi)
        else:
            # No reacting atoms: represent the molecule by a symbol, handing
            # out ids in order of first appearance.
            idx = reagent_ids.setdefault(smi, len(reagent_ids))
            reagent_syms.append("[A{}]".format(idx))

    # Reagent symbols always come after the real reactants.
    source.extend(reagent_syms)
    return source, target


def _unmapped_smiles(mol):
    """Clear the atom-map numbers of ``mol`` in place and return its SMILES."""
    for atom in mol.GetAtoms():
        atom.ClearProp("molAtomMapNumber")
    return Chem.MolToSmiles(mol)


def clean(line):
    """Return the first whitespace-separated field of ``line``.

    Surrounding whitespace is ignored and any further columns on the line
    are dropped. A blank line has no first field and raises ``IndexError``.
    """
    fields = line.split(None, 1)
    return fields[0]

In [4]:
# Inspect the raw input: print the first two lines of the reaction file.
# islice streams them, so the (potentially large) file is not read in full.
with open("../data/reactions.rsmi", "r") as f:
    for line in islice(f, 2):
        print(line.strip())

[Br:1][CH2:2][CH2:3][OH:4].[CH2:5]([S:7](Cl)(=[O:9])=[O:8])[CH3:6].CCOCC>C(N(CC)CC)C>[CH2:5]([S:7]([O:4][CH2:3][CH2:2][Br:1])(=[O:9])=[O:8])[CH3:6]
[Br:1][CH2:2][CH2:3][CH2:4][OH:5].[CH3:6][S:7](Cl)(=[O:9])=[O:8].CCOCC>C(N(CC)CC)C>[CH3:6][S:7]([O:5][CH2:4][CH2:3][CH2:2][Br:1])(=[O:9])=[O:8]


In [12]:
limit = 100           # number of lines taken from the top of the input file
SAVE_RESULTS = False  # set to True to write the .dat files below

# Start every run from a clean slate so that reagent ids and the duplicate
# filter do not depend on what earlier executions of this cell left behind.
reagents = {}
seen = set()
reactions = []

with open("../data/reactions.rsmi", "r") as f:
    # Stream only the first `limit` lines instead of loading the whole file;
    # blank lines are dropped because clean() cannot handle them.
    lines = [line for line in islice(f, limit) if line.strip()]

# `total` lets tqdm show real progress for the lazy map() iterator.
it = tqdm(map(process, map(clean, lines)), total=len(lines))
for toks in it:
    if toks is None:
        continue

    # Hash the processed reaction to check for duplicates. Tokens are joined
    # with separators that cannot occur inside SMILES, so different token
    # lists can never produce the same key (["CC", "O"] vs ["C", "CO"]).
    key = "\t".join(" ".join(ts) for ts in toks)
    h = md5(key.encode("utf8")).hexdigest()
    if h in seen:
        continue
    seen.add(h)

    reactions.append(toks)
    it.set_postfix(reactions=len(reactions), reagents=len(reagents))

# Results
print("Reagents:", len(reagents))
print("Reactions:", len(reactions))

# Save
if SAVE_RESULTS:
    # One "<smiles>\t<id>" line per reagent, ordered by id.
    with open("../data/reagents.dat", "w") as f:
        f.write("\n".join(
            "{}\t{}".format(reagent, idx)
            for reagent, idx in sorted(reagents.items(), key=lambda kv: kv[1])
        ))

    # One "<source tokens>\t<target tokens>" line per unique reaction.
    with open("../data/reactions.dat", "w") as f:
        f.write("\n".join(
            "{}\t{}".format(" ".join(source_toks), " ".join(target_toks))
            for source_toks, target_toks in reactions
        ))

0it [00:00, ?it/s][23:16:47] reactant 2 has no mapped atoms.
0it [00:00, ?it/s, reactions=1, reagents=1][23:16:47] reactant 2 has no mapped atoms.
0it [00:00, ?it/s, reactions=2, reagents=1][23:16:47] reactant 1 has no mapped atoms.
0it [00:00, ?it/s, reactions=3, reagents=1][23:16:47] reactant 2 has no mapped atoms.
[23:16:48] reactant 2 has no mapped atoms.
[23:16:48] reactant 1 has no mapped atoms.
[23:16:48] reactant 1 has no mapped atoms.
[23:16:48] reactant 2 has no mapped atoms.
[23:16:48] reactant 3 has no mapped atoms.
0it [00:00, ?it/s, reactions=5, reagents=1][23:16:48] reactant 0 has no mapped atoms.
0it [00:00, ?it/s, reactions=7, reagents=1][23:16:48] reactant 1 has no mapped atoms.
[23:16:48] reactant 2 has no mapped atoms.
[23:16:48] reactant 3 has no mapped atoms.
[23:16:48] reactant 4 has no mapped atoms.
[23:16:48] reactant 5 has no mapped atoms.
[23:16:48] reactant 6 has no mapped atoms.
0it [00:00, ?it/s, reactions=9, reagents=1][23:16:48] reactant 1 has no mapped 

Reagents: 1
Reactions: 84





In [16]:
reagents, product = reactions[2]

In [17]:
reagents

['[OH][CH2][CH2]Cl', 'CCOCC', '[CH3][CH]([CH3])[CH2]S(=O)(=O)Cl']

In [18]:
# Target side (product SMILES list) of the example reaction.
product

['[CH3][CH]([CH3])[CH2]S(=O)(=O)O[CH2][CH2]Cl']

In [None]:
### DATA FORMAT FOR FINETUNING
"""
<product></product>
<reagent></reagent>
...
<reagent></reagent>
<reagent>
"""

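# The prompt ends with an open <reagent> tag; the expected completion is the
# SMILES of that reagent followed by the closing tag, i.e. "SMILES</reagent>".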