In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import pickle,gzip
from collections import defaultdict
import createFingerprintsReaction
import random
import time

# Wall-clock reference point: the "[ ...s]" progress messages printed by the
# cells below report the seconds elapsed since this line was executed.
starttime = time.time() # start time

Combine AP3 fingerprint with agent feature and Morgan2 FPs

In [2]:
# Build the reaction fingerprints for every record of the input set and stream
# them, one pickled tuple per reaction, into a new gzip file:
#   (lbl, klass, fp_AP3, fp_featureAgent, fp_MG2_agents)
# Both files are opened in a `with` block so that the gzip output is flushed and
# finalised before it is read back; an unclosed gzip writer can leave a
# truncated stream that a reader would silently stop on at EOFError.
# NOTE: pickle.load can execute arbitrary code - only use trusted input files.
with gzip.open('training_test_set_patent_data.pkl.gz', 'rb') as infile, \
     gzip.open('transformationFPs_MG2_agentFPs_test_set_patent_data.pkl.gz','wb+') as pklfile:
    lineNo=0
    while True:
        lineNo+=1
        try:
            # input file pickled in python 2
            smi,lbl,klass = pickle.load(infile, encoding='latin1')
        except EOFError:
            # end of the pickle stream: every record has been consumed
            break
        try:
            rxn = AllChem.ReactionFromSmarts(smi,useSmiles=True)
            fp_AP3 = createFingerprintsReaction.create_transformation_FP(rxn,AllChem.FingerprintType.AtomPairFP)
            fp_MG2_agents = createFingerprintsReaction.create_agent_morgan2_FP(rxn)
            if fp_MG2_agents is None:
                # the helper returned nothing: substitute a freshly constructed
                # 4096-length sparse vector so every record has an agent FP
                fp_MG2_agents = DataStructs.UIntSparseIntVect(4096)
            fp_featureAgent = createFingerprintsReaction.create_agent_feature_FP(rxn)
        except Exception:
            # Best effort: report and skip reactions that cannot be processed.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print ("Cannot build fingerprint/reaction of: %s\n"%smi)
            continue
        pickle.dump((lbl,klass,fp_AP3,fp_featureAgent,fp_MG2_agents),pklfile,2)
        if not lineNo%5000:
            print ("[%6.1fs] creating transformation FP - %d"%(time.time()-starttime, lineNo))

[  18.8s] creating transformation FP - 5000
[  35.5s] creating transformation FP - 10000
[  51.4s] creating transformation FP - 15000
[  67.7s] creating transformation FP - 20000
[  84.3s] creating transformation FP - 25000
[ 101.3s] creating transformation FP - 30000
[ 118.4s] creating transformation FP - 35000
[ 134.0s] creating transformation FP - 40000
[ 149.4s] creating transformation FP - 45000
[ 166.1s] creating transformation FP - 50000


Load the AP3 fingerprint, agent feature and MG2 fingerprints

In [3]:
# Read every pickled (lbl, cls, fp_AP3, fp_agentFeature, fp_agentMG2) tuple from
# the fingerprint file into the in-memory list `fps`.
# Each entry of `fps` is [idx, lbl, cls, fp_AP3, fp_agentFeature, fp_agentMG2],
# where idx is the 0-based position of the record in the file.
# The file is opened in a `with` block so the handle is closed once loading ends.
# NOTE: pickle.load can execute arbitrary code - only read files you trust.
lineNo=0
fps=[]
idx=0
with gzip.open('transformationFPs_MG2_agentFPs_test_set_patent_data.pkl.gz', 'rb') as infile:
    while True:
        lineNo+=1
        try:
            lbl,cls,fp_AP3,fp_agentFeature,fp_agentMG2 = pickle.load(infile)
        except EOFError:
            # end of the pickle stream
            break
        fps.append([idx,lbl,cls,fp_AP3,fp_agentFeature,fp_agentMG2])
        idx+=1
        if not lineNo%10000:
            print ("[%6.1fs] loading pickle file - %d"%(time.time()-starttime, lineNo))

[ 169.2s] loading pickle file - 10000
[ 170.9s] loading pickle file - 20000
[ 172.2s] loading pickle file - 30000
[ 173.2s] loading pickle file - 40000


Split the FPs into training (70 %) and test (30 %) sets

In [4]:
import numpy as np
import utilsFunctions

# Shuffle the record indices with a fixed seed so the split is reproducible.
random.seed(0xd00f)
indices=list(range(len(fps))) # python3 range = python2 xrange
random.shuffle(indices)

nActive=700   # maximum number of training reactions taken from each reaction class
fpsz=512      # size argument passed to utilsFunctions.fpToNP for each fingerprint
trainFps_AP3_agentMG2=[]
testFps_AP3_agentMG2=[]
trainActs=[]
testActs=[]

def _ap3_agentMG2_vector(entry):
    """Return the concatenated AP3 + agent-Morgan2 array for one entry of `fps`.

    entry[3] is the AP3 transformation fingerprint and entry[5] the agent
    Morgan2 fingerprint; each is converted with utilsFunctions.fpToNP(fp, fpsz)
    and the two results are joined with np.concatenate.
    """
    ap3_part = utilsFunctions.fpToNP(entry[3],fpsz)
    agent_part = utilsFunctions.fpToNP(entry[5],fpsz)
    return np.concatenate([ap3_part, agent_part])

# input file pickled in python 2
# NOTE: pickle.load can execute arbitrary code - only read files you trust.
with open('reactionTypes_training_test_set_patent_data.pkl','rb') as rtypesFile:
    reaction_types = pickle.load(rtypesFile, encoding='latin1')
with open('names_rTypes_classes_superclasses_training_test_set_patent_data.pkl','rb') as namesFile:
    names_rTypes = pickle.load(namesFile, encoding='latin1')
rtypes=sorted(reaction_types)

# Group the shuffled indices by reaction class in a single pass (instead of
# rescanning all indices for every class); the shuffled order is preserved
# inside each group.
idsByClass = defaultdict(list)
for x in indices:
    idsByClass[fps[x][2]].append(x)

for i,klass in enumerate(rtypes):
    actIds = idsByClass.get(klass, [])
    trainIds = actIds[:nActive]   # first nActive shuffled members -> training set
    testIds = actIds[nActive:]    # everything after that -> test set
    trainFps_AP3_agentMG2 += [_ap3_agentMG2_vector(fps[x]) for x in trainIds]
    # One label per fingerprint actually added: a class with fewer than nActive
    # members must not receive nActive labels, or labels and features go out of step.
    trainActs += [i]*len(trainIds)
    testFps_AP3_agentMG2 += [_ap3_agentMG2_vector(fps[x]) for x in testIds]
    testActs += [i]*len(testIds)

print ("[%6.1fs] split FP collection to training and test set"%(time.time()-starttime))

[ 208.2s] split FP collection to training and test set


Train LR Model

In [5]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (constructor defaults) on the training
# vectors; the targets in trainActs are the integer class positions assigned
# when the data was split.
lr_cls_AP3_MG2 = LogisticRegression()
result_lr_fp_AP3_MG2 = lr_cls_AP3_MG2.fit(
    X=trainFps_AP3_agentMG2,
    y=trainActs,
)
print (
    "[%6.1fs] LR model training finished"
    % (time.time() - starttime)
)

[ 340.3s] LR model training finished


Evaluate Model

In [6]:
# Score the fitted classifier on the held-out vectors and labels. The class ids
# (rtypes) and their names (names_rTypes) are handed to the helper as well.
# NOTE(review): the return value is presumably a confusion matrix, judging by
# the variable name - confirm in utilsFunctions.evaluateModel.
cmat_fp_AP3_MG2 = utilsFunctions.evaluateModel(
    result_lr_fp_AP3_MG2,
    testFps_AP3_agentMG2,
    testActs,
    rtypes,
    names_rTypes,
)
print (
    "[%6.1fs] evaluation finished!"
    % (time.time() - starttime)
)

ID  recall    prec F-score      reaction class
 0  0.9100  0.9161  0.9130     1.2.1 Aldehyde reductive amination
 1  0.7900  0.8943  0.8389     1.2.4 Eschweiler-Clarke methylation
 2  0.9033  0.9443  0.9233     1.2.5 Ketone reductive amination
 3  0.9233  0.9295  0.9264     1.3.6 Bromo N-arylation
 4  0.8867  0.8444  0.8650     1.3.7 Chloro N-arylation
 5  0.9067  0.8947  0.9007     1.3.8 Fluoro N-arylation
 6  0.8800  0.8889  0.8844     1.6.2 Bromo N-alkylation
 7  0.8167  0.8140  0.8153     1.6.4 Chloro N-alkylation
 8  0.8533  0.8339  0.8435     1.6.8 Iodo N-alkylation
 9  0.8667  0.8497  0.8581     1.7.4 Hydroxy to methoxy
10  0.8800  0.8381  0.8585     1.7.6 Methyl esterification
11  0.8700  0.9094  0.8893     1.7.7 Mitsunobu aryl ether synthesis
12  0.7433  0.8139  0.7770     1.7.9 Williamson ether synthesis
13  0.9600  0.9351  0.9474     1.8.5 Thioether synthesis
14  0.8733  0.8942  0.8836    10.1.1 Bromination
15  0.8767  0.9038  0.8900    10.1.2 Chlorination
16  0.9538  0.9725

In [7]:
# Classify one hand-written reaction with the trained model.
# NOTE(review): this reaction SMILES has nothing after the second '>', i.e. an
# empty product side - confirm this is intended for the transformation FP.
smi = '[H]C(=O)C1=CC=CC=C1.NC1=CC=CC=C1>[Na+].CCO.[BH4-]>'
rxn = AllChem.ReactionFromSmarts(smi,useSmiles=True)
fp_AP3 = createFingerprintsReaction.create_transformation_FP(rxn,AllChem.FingerprintType.AtomPairFP)
fp_MG2_agents = createFingerprintsReaction.create_agent_morgan2_FP(rxn)
if fp_MG2_agents is None:
    # the helper returned nothing: substitute a freshly constructed 4096-length sparse vector
    fp_MG2_agents = DataStructs.UIntSparseIntVect(4096)
# Same featurisation as used for the training data: AP3 part followed by agent Morgan2 part.
np1_morgan = utilsFunctions.fpToNP(fp_AP3,fpsz)
np2_morgan = utilsFunctions.fpToNP(fp_MG2_agents,fpsz)
# Keep the single query in its own variable: rebinding testFps_AP3_agentMG2 here
# would discard the held-out test set, so re-running the evaluation cell would
# score one sample against the full list of test labels.
queryFps_AP3_agentMG2 = [np.concatenate([np1_morgan, np2_morgan])]
predict = result_lr_fp_AP3_MG2.predict(queryFps_AP3_agentMG2)
# predict[0] is a position in rtypes; print the class id and its readable name.
print ("%s %s"%(rtypes[predict[0]], names_rTypes[rtypes[predict[0]]]))

1.2.5 Ketone reductive amination


Save predictor to pickle file (for later use)

In [8]:
# Write the fitted classifier, the sorted class ids and the id -> name mapping
# as a single tuple into a gzip-compressed pickle (protocol 2).
with gzip.open('result_lr_fp_AP3_MG2.pkl.gz','wb+') as pklfile:
    pickle.dump(
        (result_lr_fp_AP3_MG2, rtypes, names_rTypes),
        pklfile,
        protocol=2,
    )