In [15]:
from __future__ import print_function

from rdkit import Chem
from rdkit.Chem import AllChem

import gzip, cPickle
import copy
import progressbar
import random

In [38]:
## Alcohols
#
# Reaction SMARTS templates for alcohol transformations, followed by the table
# that pairs every template with its co-reactants and agents.
#
# NOTE(review): the oxygen atoms in the reactant patterns (`O` / `[O:2]`) carry
# no hydrogen-count constraint, so they match any aliphatic oxygen bonded to
# the carbon (ether/ester oxygens included), not only hydroxyl groups.
# Confirm that the input molecules make this distinction irrelevant.

# --- Oxidations -------------------------------------------------------------
KETONE_2 = AllChem.ReactionFromSmarts('[CH:1][O:2]>>[C:1]=[O:2]')
CARBOXY_1 = AllChem.ReactionFromSmarts('[CH2:1]O>>[C:1](=O)O')
ALDEHYDE_1 = AllChem.ReactionFromSmarts('[CH2:1][O:2]>>[C:1](=[O:2])')

# --- Replacement of the oxygen by a halogen ---------------------------------
# "_1_2" templates require one or two hydrogens on the carbon, "_3" none.
CHLORO_1_2 = AllChem.ReactionFromSmarts('[CH,CH2:1]O>>[C:1]Cl')
CHLORO_3 = AllChem.ReactionFromSmarts('[CH0:1]O>>[C:1]Cl')
BROMO_1_2 = AllChem.ReactionFromSmarts('[CH,CH2:1]O>>[C:1]Br')
BROMO_3 = AllChem.ReactionFromSmarts('[CH0:1]O>>[C:1]Br')
IODO_3 = AllChem.ReactionFromSmarts('[CH0:1]O>>[C:1]I')

# --- Two-component templates (alcohol + second reactant) --------------------
TOSYL = AllChem.ReactionFromSmarts('[C!H0:1][O:2].[Cl:3][S:4]>>[C:1][O:2][S:4]')

ETHER = AllChem.ReactionFromSmarts('[C!H0:1][O:2].[Cl,Br,I:3][CH2,CH3:4]>>[C:1][O:2][C:4]')

# Agent SMILES shared by both dichromate oxidations.
_NA_DICHROMATE_H2SO4 = '[O-][Cr](=O)(=O)O[Cr](=O)(=O)[O-].[Na+].[Na+].OS(O)(=O)=O'
_K_DICHROMATE_H2SO4 = '[O-][Cr](=O)(=O)O[Cr](=O)(=O)[O-].[K+].[K+].OS(O)(=O)=O'

# Each row is (name, template, reagents, agents):
#   reagents - SMILES of the second reactant; '' means the template is run on
#              the alcohol alone. The 'ETHER' row's entry is a placeholder:
#              the enumeration loop samples its partners from halide_1_list.
#   agents   - alternative agent SMILES; the enumeration loop emits one
#              reaction string per agent, placed between the '>' separators.
alcohol_reactions_list = [
    ('KETONE_2', KETONE_2, [''], [_NA_DICHROMATE_H2SO4, _K_DICHROMATE_H2SO4]),
    ('CARBOXY_1', CARBOXY_1, [''], [_NA_DICHROMATE_H2SO4, _K_DICHROMATE_H2SO4]),
    ('ALDEHYDE_1', ALDEHYDE_1, [''], ['Cl.O=[Cr](=O)=O.c1ccncc1']),

    ('CHLORO_1_2', CHLORO_1_2, [''], ['O=S(Cl)Cl.c1ccncc1', 'ClP(Cl)Cl', 'ClP(Cl)(Cl)(Cl)Cl']),
    ('CHLORO_3', CHLORO_3, [''], ['Cl']),
    ('BROMO_1_2', BROMO_1_2, [''], ['BrP(Br)Br']),
    ('BROMO_3', BROMO_3, [''], ['Br']),
    ('IODO_3', IODO_3, [''], ['I']),

    ('TOSYL', TOSYL, ['Cc1ccc(cc1)S(Cl)(=O)=O'], ['c1ccncc1']),

    ('ETHER', ETHER, [''], ['[Na]', '[K]', '[H-].[Na+]']),
]

In [39]:
def cano(smiles):
    """Return the canonical SMILES that RDKit writes for `smiles`.

    Args:
        smiles: SMILES string to canonicalize.

    Returns:
        The output of `Chem.MolToSmiles` for the parsed molecule.

    Raises:
        ValueError: if RDKit cannot parse `smiles`. `Chem.MolFromSmiles`
            signals that by returning None, which would otherwise surface as
            an opaque argument error inside `Chem.MolToSmiles`.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError('Could not parse SMILES: %r' % (smiles,))
    return Chem.MolToSmiles(mol)

def block(ch, smiles):
    """Return `ch` followed by the canonical form of `smiles`.

    An empty `smiles` yields an empty string, so an absent component adds
    neither a separator nor a molecule to the reaction string.
    """
    if smiles == '':
        return ''
    return ch + cano(smiles)

In [40]:
def _read_gz_pickle(path):
    """Return the object stored in the gzip-compressed pickle at `path`."""
    # Unpickling can execute arbitrary code: only point this at trusted files.
    with gzip.open(path, 'rb') as handle:
        return cPickle.load(handle)


# Substrate collections consumed by the enumeration loop.
alcohol_list = _read_gz_pickle('data/subst/alcohol.pkl.gz')
halide_1_list = _read_gz_pickle('data/subst/halide_1.pkl.gz')

# Number of alcohols; also used to size the progress bar.
length = len(alcohol_list)
print(length)

6097


In [41]:
# Enumerate reaction SMILES ("reactants>agents>products") for every
# alcohol/template combination that RDKit can apply.

ETHER_PARTNERS_PER_ALCOHOL = 10  # halides sampled per alcohol for the 'ETHER' template
SAMPLING_SEED = 0

rxns = []
skipped_smiles = []  # inputs RDKit could not parse; reported after the loop

# A private, seeded generator makes the sampled halide partners (and therefore
# the saved dataset) reproducible without touching the global `random` state.
rng = random.Random(SAMPLING_SEED)
# random.sample raises ValueError when asked for more items than are available.
n_partners = min(ETHER_PARTNERS_PER_ALCOHOL, len(halide_1_list))

# Agents do not depend on the alcohol, so canonicalize them once per template
# instead of once per emitted reaction.
agents_by_name = {
    name: [cano(agent) for agent in agents]
    for name, _rxn, _reagents, agents in alcohol_reactions_list
}

bar = progressbar.ProgressBar(max_value=len(alcohol_list))

for i, alcohol_smi in enumerate(alcohol_list):
    alcohol = Chem.MolFromSmiles(alcohol_smi)
    if alcohol is None:
        # MolFromSmiles returns None for unparsable input.
        skipped_smiles.append(alcohol_smi)
        bar.update(i + 1)
        continue
    alcohol_cano = Chem.MolToSmiles(alcohol)

    for name, rxn, reagents, _agents in alcohol_reactions_list:
        # Ether formation pairs the alcohol with randomly drawn halides; every
        # other template uses the reagents listed in its table row.
        if name == 'ETHER':
            reagent_list = rng.sample(halide_1_list, n_partners)
        else:
            reagent_list = reagents

        for reagent in reagent_list:
            if reagent == '':
                # Single-component template: the alcohol is the only reactant.
                reactants = (alcohol,)
                reagent_block = ''
            else:
                reagent_mol = Chem.MolFromSmiles(reagent)
                if reagent_mol is None:
                    skipped_smiles.append(reagent)
                    continue
                reactants = (alcohol, reagent_mol)
                reagent_block = '.' + Chem.MolToSmiles(reagent_mol)

            products = rxn.RunReactants(reactants)
            if len(products) == 0:
                continue  # template does not match this alcohol

            # Only the first product set is kept, so a template that matches
            # several sites still contributes a single outcome.
            product_smi = '.'.join(Chem.MolToSmiles(product) for product in products[0])
            reactant_side = alcohol_cano + reagent_block
            for agent in agents_by_name[name]:
                rxns.append(reactant_side + '>' + agent + '>' + product_smi)

    # i is zero-based; report the number of alcohols completed so the bar can
    # reach max_value.
    bar.update(i + 1)

bar.finish()

if skipped_smiles:
    print('Skipped {} unparsable SMILES'.format(len(skipped_smiles)))

 99% (6092 of 6097) |###################### | Elapsed Time: 0:00:47 ETA: 0:00:00

In [36]:
# Sanity check: number of reaction SMILES collected in `rxns`.
# NOTE(review): this cell's execution count (36) is lower than the enumeration
# cell's (41), so the recorded output may come from an earlier run. Re-run
# after the loop to confirm the figure.
print(len(rxns))

159813


In [42]:
# Persist the generated reaction SMILES as a gzip-compressed pickle.
# Pickle protocol 2 is passed explicitly as the third argument.
with gzip.open('data/rxns/alcohol.pkl.gz', 'wb') as out_file:
    cPickle.dump(rxns, out_file, 2)