In [6]:
from __future__ import print_function

import glob

from rdkit import Chem
from rdkit.Chem import Lipinski

In [33]:
alkene = []
alkyne = []
alcohol_1 = []
alcohol = []
ald_ket = []
amine_1 = []
acid = []
ester = []
amide_1 = []
amide = []
nitrile = []
halide = []

# excluded sterically hindered structures (e.g. neopentyl, tertiary carbon)
s_alkene = Chem.MolFromSmarts('[CX3]=[CX3]')
s_alkyne = Chem.MolFromSmarts('[CX2]#[CX2]')
s_alcohol = Chem.MolFromSmarts('[OX2H1][CX4]')
s_alcohol_1 = Chem.MolFromSmarts('[OX2H1][CX4H2][c,C&!H0]')
s_ketone = Chem.MolFromSmarts('[#6][CX3](=O)[#6]')
s_aldehyde = Chem.MolFromSmarts('[CX3H1](=O)[#6]')
s_amine_1 = Chem.MolFromSmarts('[NX3H2][CX4H2][c,C&!H0]') # primary amine, N linked to primary carbon
s_acid = Chem.MolFromSmarts('[CX3](=O)[OX2H1]')
s_ester = Chem.MolFromSmarts('[c,C&!H0][CX3](=O)[OX2H0][c,C&!H0]')
s_amide = Chem.MolFromSmarts('[NX3][CX3](=[OX1])[c,C&!H0]')
s_amide_1 = Chem.MolFromSmarts('[NX3H2][CX3](=[OX1])[c,C&!H0]')
s_nitrile = Chem.MolFromSmarts('[NX1]#[CX2][c,C&!H0]')
s_halide = Chem.MolFromSmarts('[CX4][F,Cl,Br,I]') # only fluoride in GDB


for file in glob.iglob("data/gdb11/*.smi"): # excluded size 11
    print(file)
    with open(file) as f:
        for line in f:
            mol = Chem.MolFromSmiles(line.split()[0])
            smi = Chem.MolToSmiles(mol)
            cnt_hetatm = Lipinski.NumHeteroatoms(mol)
            
            double = len(mol.GetSubstructMatches(s_alkene))
            triple = len(mol.GetSubstructMatches(s_alkyne))
            
            if double == 0 and triple == 0:
                if cnt_hetatm == 2:
                    if mol.HasSubstructMatch(s_acid):
                        acid.append(smi)
                    elif mol.HasSubstructMatch(s_ester):
                        ester.append(smi)
                    elif mol.HasSubstructMatch(s_amide):
                        if mol.HasSubstructMatch(s_amide_1):
                            amide_1.append(smi)
                        amide.append(smi)

                elif cnt_hetatm == 1:
                    if mol.HasSubstructMatch(s_alcohol):
                        if mol.HasSubstructMatch(s_alcohol_1):
                            alcohol_1.append(smi)
                        alcohol.append(smi)
                    elif mol.HasSubstructMatch(s_aldehyde):
                        ald_ket.append(smi)
                    elif mol.HasSubstructMatch(s_ketone):
                        ald_ket.append(smi)
                    elif mol.HasSubstructMatch(s_amine_1):
                        amine_1.append(smi)
                    elif mol.HasSubstructMatch(s_nitrile):
                        nitrile.append(smi)
                    elif mol.HasSubstructMatch(s_halide):
                        halide.append(smi)
            
            elif cnt_hetatm == 0:
                if double == 1 and triple == 0: alkene.append(smi)
                if double == 0 and triple == 1: alkyne.append(smi)

data/gdb11/gdb11_size05.smi
data/gdb11/gdb11_size02.smi
data/gdb11/gdb11_size07.smi
data/gdb11/gdb11_size03.smi
data/gdb11/gdb11_size04.smi
data/gdb11/gdb11_size08.smi
data/gdb11/gdb11_size10.smi
data/gdb11/gdb11_size06.smi
data/gdb11/gdb11_size01.smi
data/gdb11/gdb11_size09.smi


In [34]:
subst_dict = {
    'alkene': alkene,
    'alkyne': alkyne,
    'alcohol_1': alcohol_1,
    'alcohol': alcohol,
    'ald_ket': ald_ket,
    'amine_1': amine_1,
    'acid': acid,
    'ester': ester,
    'amide_1': amide_1,
    'amide': amide,
    'nitrile': nitrile, 
    'halide': halide
}

for name, subst in subst_dict.iteritems():
    print(name, len(subst), sep='     \t')

alkene     	7781
ald_ket     	3398
alcohol_1     	1026
nitrile     	997
acid     	357
amide_1     	274
alcohol     	6097
amide     	2479
alkyne     	1862
halide     	6097
ester     	1036
amine_1     	1026


In [35]:
import cPickle, gzip

file_handles = []

for name, subst in subst_dict.iteritems():
    with gzip.open('data/subst/'+name+'.pkl.gz', 'wb') as f:
        cPickle.dump(subst, f, 2)

In [37]:
print(ester[:100])

['COC(C)=O', 'O=C1CCO1', 'CC(=O)OC(C)C', 'CCCOC(C)=O', 'CCCC(=O)OC', 'CCOC(=O)CC', 'COC(=O)C(C)C', 'CC1COC(=O)C1', 'CC1CCC(=O)O1', 'CCC1CC(=O)O1', 'CC(=O)OC1CC1', 'O=C1CCCCO1', 'CC1CCOC1=O', 'CC1OC(=O)C1C', 'O=C1CC2CC2O1', 'O=C1OC2CC1C2', 'CCC1COC1=O', 'COC(=O)C1CC1', 'O=C1OCC2CC12', 'CC(=O)OCC(C)C', 'CC1(C)COC(=O)C1', 'CCC(C)OC(C)=O', 'COC(=O)CC(C)C', 'CCC(=O)OC(C)C', 'CCCCOC(C)=O', 'CCCC(=O)OCC', 'CCOC(=O)C(C)C', 'CC(C)C1CC(=O)O1', 'CC1CCOC(=O)C1', 'CC1CCCC(=O)O1', 'CC(=O)OC1CC1C', 'CC1CC(=O)OC1C', 'CC1CC(C)C(=O)O1', 'CC12CC(=O)OC1C2', 'CC(=O)OC1CCC1', 'CCCC1CC(=O)O1', 'O=C1CC2CC(C2)O1', 'CCC(C)C(=O)OC', 'CCCCC(=O)OC', 'CCCOC(=O)CC', 'CC1CCC(=O)OC1', 'CC12COC(=O)C1C2', 'CCC1COC(=O)C1', 'CCC1CCC(=O)O1', 'COC(=O)CC1CC1', 'CCC(=O)OC1CC1', 'CC(=O)OCC1CC1', 'O=C1CC2(CC2)CO1', 'CCC1OC(=O)C1C', 'CC1CCCOC1=O', 'O=C1CCCCCO1', 'CC1COC(=O)C1C', 'CCCC1COC1=O', 'COC(=O)C1CCC1', 'CC(C)C1COC1=O', 'COC(=O)C1CC1C', 'CC1C2CC(=O)OC12', 'O=C1CC2CCC2O1', 'CCC1CCOC1=O', 'CCOC(=O)C1CC1', 'CCC1C(=O)OC1C', '