In [1]:
from __future__ import print_function

from rdkit import Chem
from rdkit.Chem import AllChem

import gzip, cPickle
import copy
import progressbar
import random

In [2]:
## Aldehyde & Ketone

METAL = AllChem.ReactionFromSmarts('[C:1]=[O:2].[C,c:3][Mg+,Li:4]>>[*:3][C:1][O:2]')

RED = AllChem.ReactionFromSmarts('[C:1]=[O:2]>>[C:1][O:2]')
RRED = AllChem.ReactionFromSmarts('[C:1]=[O:2]>>[C:1]') # Clemmensen and Wolff-Kishner

WITTIG = AllChem.ReactionFromSmarts('[C:1]=[O:2].[C:3][P+:4]>>[C:1]=[C:3]')
_YLIDE = AllChem.ReactionFromSmarts('[C:1]=[O:2]>>[C:1][P+](c1ccccc1)(c1ccccc1)(c1ccccc1)')

HYD = AllChem.ReactionFromSmarts('[C:1]=[O:2].[O:3]>>[C:1]([O:2])[O:3]')
CYAN = AllChem.ReactionFromSmarts('[C:1]=[O:2].[C-:3]#[N:4]>>[C:1]([O:2])[C-0:3]#[N:4]')
IMINE = AllChem.ReactionFromSmarts('[C:1]=[O:2].[NH2:3]>>[C:1]=[N:3]')
OXIME = AllChem.ReactionFromSmarts('[C:1]=[O:2].[NH2:3][O:4]>>[C:1]=[N:3][O:4]')
HYDRAZONE = AllChem.ReactionFromSmarts('[C:1]=[O:2].[NH2:3][N:4]>>[C:1]=[N:3][N:4]')

ACETAL = AllChem.ReactionFromSmarts('[C:1]=[O:2].[O:3]>>[C:1]([O:3])[O:3]')
ACETAL_ALT = AllChem.ReactionFromSmarts('[C:1]=[O:2].([O:3].[O:4])>>[C:1]([O:3])[O:4]')

OXI = AllChem.ReactionFromSmarts('[C:1](=[O:2])[H]>>[C:1](=[O:2])O')

carbonyl_reactions_list = [
    
    ('METAL', METAL, ['[Li]C', '[Li]CC', '[Li]CCC', '[Li]CCCC', '[Li]c1ccccc1', \
                '[Br-].[Mg+]C', '[Br-].[Mg+]CC', '[Br-].[Mg+]CCC', '[Br-].[Mg+]CCCC', '[Br-].[Mg+]c1ccccc1'], ['']),
    
    ('RED', RED, [''], ['[H-].[H-].[H-].[H-].[Al+3].[Li+]', '[Na+].[BH4-]', '[H].[H]']),
    ('RRED', RRED, [''], ['[Hg].[Zn].Cl', 'NN.[K+].[OH-]']),
    
    ('WITTIG', WITTIG, [''], ['']),
    
    ('HYD', HYD, ['O'], ['']),
    ('CYAN', CYAN, ['[C-]#N'], ['C#N']),
    ('IMINE', IMINE, [''], ['Cl']),
    ('OXIME', OXIME, ['NO'], ['Cl']),
    ('HYDRAZONE', HYDRAZONE, ['NN', 'NNc1ccccc1', 'NNC(N)=O'], ['Cl']),
    
    ('ACETAL', ACETAL, ['CO.CO', 'CCO.CCO', 'CCCO.CCCO', 'CCCCO.CCCCO'], ['Cl']),
    ('ACETAL_ALT', ACETAL_ALT, ['OCCO'], ['Cl']),
    
    ('OXI', OXI, [''], ['[O-][Cr](=O)(=O)O[Cr](=O)(=O)[O-].[Na+].[Na+].OS(O)(=O)=O', \
                             '[O-][Cr](=O)(=O)O[Cr](=O)(=O)[O-].[K+].[K+].OS(O)(=O)=O', \
                             '[K+].[O-][Mn](=O)(=O)=O']),
    
]

In [3]:
def cano(smiles): # canonicalize smiles by MolToSmiles function
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles))

def block(ch, smiles):
    return (ch + cano(smiles)) if (smiles != '') else ''

In [4]:
with gzip.open('data/subst/ald_ket.pkl.gz', 'rb') as f:
    ald_ket_list = cPickle.load(f)
    
with gzip.open('data/subst/amine_1.pkl.gz', 'rb') as f:
    amine_1_list = cPickle.load(f)

length = len(ald_ket_list)
print(length)

3398


In [5]:
rxns = []

bar = progressbar.ProgressBar(max_value=length)

for i, carbonyl_smi in enumerate(ald_ket_list):
    carbonyl = Chem.MolFromSmiles(carbonyl_smi)
    
    for reaction in carbonyl_reactions_list:
        if reaction[0] == 'IMINE':
            reagent_list = random.sample(amine_1_list,10)
        elif reaction[0] == 'WITTIG':
            ylide_smi = random.sample(ald_ket_list,10)
            ylide_mol = [Chem.MolFromSmiles(smi) for smi in ylide_smi]
            reagent_list = [Chem.MolToSmiles(_YLIDE.RunReactants((mol,))[0][0]) for mol in ylide_mol]
        else: reagent_list = reaction[2]
            
        for reagent in reagent_list:
            if reagent == '': products = reaction[1].RunReactants((carbonyl,))
            else: products = reaction[1].RunReactants((carbonyl, Chem.MolFromSmiles(reagent)))
            if len(products) == 0: continue
            product_smi = [Chem.MolToSmiles(product) for product in products[0]]
            if reaction[0] == 'WITTIG':
                for halogen in ['[Cl-]', '[Br-]', '[I-]']:
                    reagent_m = reagent + '.' + halogen
                    rxns.append(cano(carbonyl_smi) + block('.', reagent_m) + '>>' + '.'.join(product_smi))
            else:
                for sub_reagent in reaction[3]:
                    rxns.append(cano(carbonyl_smi) + block('.', reagent) + '>' + cano(sub_reagent) + '>' + '.'.join(product_smi))
                    
    bar.update(i)
    
bar.finish()

100% (3398 of 3398) |######################| Elapsed Time: 0:01:43 Time: 0:01:43


In [6]:
print(len(rxns))

224268


In [7]:
with gzip.open('data/rxns/ald_ket.pkl.gz', 'wb') as f:
    cPickle.dump(rxns, f, 2)