In [8]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import cPickle,gzip
from collections import defaultdict
import createFingerprintsReaction
import random
import time

starttime = time.time() # start time


###########################################################
# Combine AP3 fingerprint with agent feature and Morgan2 FPs
###########################################################

infile = gzip.open('training_test_set_patent_data.pkl.gz', 'rb')
pklfile = gzip.open('transformationFPs_MG2_agentFPs_test_set_patent_data_2.pkl.gz','wb+')

lineNo=0
while 1:
    lineNo+=1
    try:
        smi,lbl,klass = cPickle.load(infile) 
    except EOFError:
        break
    try:
        rxn = AllChem.ReactionFromSmarts(smi,useSmiles=True)
        fp_AP3 = createFingerprintsReaction.create_transformation_FP(rxn,AllChem.FingerprintType.AtomPairFP)
        fp_MG2_agents = createFingerprintsReaction.create_agent_morgan2_FP(rxn)
        if fp_MG2_agents is None:
            fp_MG2_agents = DataStructs.UIntSparseIntVect(4096)
        fp_featureAgent = createFingerprintsReaction.create_agent_feature_FP(rxn)
    except:
        print "Cannot build fingerprint/reaction of: %s\n"%smi
        continue;
    cPickle.dump((lbl,klass,fp_AP3,fp_featureAgent,fp_MG2_agents),pklfile,2)
    if not lineNo%5000:
        print "[%6.1fs] creating transformation FP - %d"%(time.time()-starttime, lineNo)

[  28.7s] creating transformation FP - 5000
[  57.4s] creating transformation FP - 10000
[  85.6s] creating transformation FP - 15000
[ 114.0s] creating transformation FP - 20000
[ 143.1s] creating transformation FP - 25000
[ 174.5s] creating transformation FP - 30000
[ 204.5s] creating transformation FP - 35000
[ 232.6s] creating transformation FP - 40000
[ 260.7s] creating transformation FP - 45000
[ 291.2s] creating transformation FP - 50000


In [9]:
###########################################################
# Load the AP3 fingerprint, agent feature and MG2 fingerprints
###########################################################

from sklearn.linear_model import LogisticRegression
import utilsFunctions

infile = gzip.open("transformationFPs_MG2_agentFPs_test_set_patent_data_2.pkl.gz", 'rb')

lineNo=0
fps=[]
idx=0
while 1:
    lineNo+=1
    try:
        lbl,cls,fp_AP3,fp_agentFeature,fp_agentMG2 = cPickle.load(infile)        
    except EOFError:
        break
    fps.append([idx,lbl,cls,fp_AP3,fp_agentFeature,fp_agentMG2])
    idx+=1
    if not lineNo%10000:
        print "[%6.1fs] loading pickle file - %d"%(time.time()-starttime, lineNo)

[ 316.9s] loading pickle file - 10000
[ 319.2s] loading pickle file - 20000
[ 321.4s] loading pickle file - 30000
[ 323.6s] loading pickle file - 40000
[ 325.8s] loading pickle file - 50000


In [10]:
###########################################################
# Split the FPs into training (20 %) and test data (80 %)
###########################################################

import numpy as np

random.seed(0xd00f)
indices=range(len(fps))
random.shuffle(indices)

nActive=200
fpsz=256
trainFps_AP3_agentMG2=[]
testFps_AP3_agentMG2=[]
trainActs=[]
testActs=[]

reaction_types = cPickle.load(file("reactionTypes_training_test_set_patent_data.pkl"))
names_rTypes = cPickle.load(file("names_rTypes_classes_superclasses_training_test_set_patent_data.pkl"))

rtypes=sorted(list(reaction_types))
for i,klass in enumerate(rtypes):
    actIds = [x for x in indices if fps[x][2]==klass]
    for x in actIds[:nActive]:
        # np1_feature = utilsFunctions.fpToNPfloat(fps[x][3],fpsz)
        # np2_feature = np.asarray(fps[x][4], dtype=float)
        # trainFps_AP3_agentFeature += [np.concatenate([np1_feature, np2_feature])]
        np1_morgan = utilsFunctions.fpToNP(fps[x][3],fpsz)
        # trainFps_AP3 += [np1_morgan]
        np2_morgan = utilsFunctions.fpToNP(fps[x][5],fpsz)
        trainFps_AP3_agentMG2 += [np.concatenate([np1_morgan, np2_morgan])]
    trainActs += [i]*nActive
    nTest=len(actIds)-nActive
    for x in actIds[nActive:]:
        # np1_feature = utilsFunctions.fpToNPfloat(fps[x][3],fpsz)
        # np2_feature = np.asarray(fps[x][4], dtype=float)
        # testFps_AP3_agentFeature += [np.concatenate([np1_feature, np2_feature])]
        np1_morgan = utilsFunctions.fpToNP(fps[x][3],fpsz)
        # testFps_AP3 += [np1_morgan]
        np2_morgan = utilsFunctions.fpToNP(fps[x][5],fpsz)
        testFps_AP3_agentMG2 += [np.concatenate([np1_morgan, np2_morgan])]
    testActs += [i]*nTest
    
print "[%6.1fs] splited FP collection to training and test set"%(time.time()-starttime)

[ 338.2s] splited FP collection to training and test set


In [11]:
###########################################################
# Train LR Model
###########################################################

lr_cls_AP3_MG2 = LogisticRegression()
result_lr_fp_AP3_MG2 = lr_cls_AP3_MG2.fit(trainFps_AP3_agentMG2,trainActs)
print "[%6.1fs] LR model training finished"%(time.time()-starttime)

[ 359.0s] LR model training finished


In [13]:
###########################################################
# Evaluate Model
###########################################################

cmat_fp_AP3_MG2 = utilsFunctions.evaluateModel(result_lr_fp_AP3_MG2, testFps_AP3_agentMG2, testActs, rtypes, names_rTypes)
print "[%6.1fs] evaluation finished!"%(time.time()-starttime)

ID  recall    prec F-score      reaction class
 0  0.9825  0.9692  0.9758     1.2.1 Aldehyde reductive amination
 1  0.9750  0.9559  0.9653     1.2.4 Eschweiler-Clarke methylation
 2  0.9800  0.9739  0.9769     1.2.5 Ketone reductive amination
 3  0.9700  0.9688  0.9694     1.3.6 Bromo N-arylation
 4  0.9788  0.9584  0.9685     1.3.7 Chloro N-arylation
 5  0.9900  0.9730  0.9814     1.3.8 Fluoro N-arylation
 6  0.9450  0.9618  0.9533     1.6.2 Bromo N-alkylation
 7  0.9137  0.9469  0.9300     1.6.4 Chloro N-alkylation
 8  0.9187  0.9199  0.9193     1.6.8 Iodo N-alkylation
 9  0.9663  0.8968  0.9302     1.7.4 Hydroxy to methoxy
10  0.9900  0.9647  0.9772     1.7.6 Methyl esterification
11  0.9413  0.9484  0.9448     1.7.7 Mitsunobu aryl ether synthesis
12  0.9413  0.9448  0.9430     1.7.9 Williamson ether synthesis
13  0.9700  0.9936  0.9817     1.8.5 Thioether synthesis
14  0.9425  0.9654  0.9538    10.1.1 Bromination
15  0.9337  0.9676  0.9504    10.1.2 Chlorination
16  0.9925  0.9742