In [1]:
# generate feature subsets
from CustomFeatures import MyFeatures
from NERmodel import NERmodel
import subprocess
import traceback, sys
import pickle
import yaml
import os

def initConf(filepath=None):
    """
    Load a YAML configuration file and return its parsed content.

    Every plugin uses its own config file.

    Parameters
    ----------
    filepath : str, optional
        Path of the configuration file. When omitted (or otherwise falsy),
        ``models.conf`` in the current working directory is read.

    Returns
    -------
    object
        Whatever ``yaml.load`` builds from the document.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    if not filepath:
        filepath = os.getcwd()+'/models.conf'
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed the file it opened.
    with open(filepath, 'r') as f:
        # PyYAML >= 6 requires an explicit Loader. yaml.Loader keeps the
        # behaviour of the old implicit default.
        # NOTE(security): yaml.Loader can construct arbitrary Python objects.
        # Only load trusted configuration files, or switch to yaml.safe_load
        # once it is confirmed the configs contain plain YAML types only.
        return yaml.load(f, Loader=yaml.Loader)
    
def writeConfBack(conf, filepath=None):
    """
    Serialize ``conf`` as YAML and write it to disk.

    Parameters
    ----------
    conf : object
        Configuration to persist; passed unchanged to ``yaml.dump``.
    filepath : str, optional
        Destination file. When omitted (or otherwise falsy), ``models.conf``
        in the current working directory is overwritten.

    Raises
    ------
    OSError
        If the destination cannot be opened for writing.
    """
    if not filepath:
        filepath = os.getcwd()+'/models.conf'
    # Serialize BEFORE opening the file: mode 'w' truncates immediately, so a
    # failure inside yaml.dump would otherwise leave an empty config behind.
    serialized = yaml.dump(conf)
    with open(filepath,'w') as f:
        f.write(serialized)
        
     
def writeConf(conf, modelname, key, msg):
    """
    Set ``key`` to ``msg`` on the entry named ``modelname`` under
    ``conf["models"]`` and persist the whole configuration through
    ``writeConfBack``.

    Entries with a different name are skipped and cause no write. Any
    exception raised while updating or persisting an entry is printed to
    stdout (traceback framed by dashed rules) and is not re-raised.
    """
    for entry_name, entry in conf["models"].items():
        try:
            if modelname != entry_name:
                continue
            entry[key] = msg
            writeConfBack(conf)
        except Exception:
            rule = "-"*60
            print("Exception in user code:")
            print(rule)
            traceback.print_exc(file=sys.stdout)
            print(rule)
        

conf = initConf()


# name the batch training
training_set_name = conf["session"]
print(training_set_name)

# Build one MyFeatures instance per configured model. Every entry of `fs` is
# (feature set, key of the model in conf["models"], model type).
fs = []
for modelname, params in conf["models"].items():
    try:
        mf = MyFeatures(params=params)
        fs.append((mf, modelname, params["modeltype"]))
        writeConf(conf, modelname, "status", "doing")
    except Exception:
        print("Exception in user code:")
        print("-"*60)
        traceback.print_exc(file=sys.stdout)
        print("-"*60)
        writeConf(conf, modelname, "status", "error")

print("feature instances:")
print(fs)
print()

# initialize
# `models` and `fsold` are filled in lockstep so that index i always refers to
# the same model in both lists. Previously a failed construction made `models`
# shorter than `fs`, and every later index pointed at the wrong feature set.
models = []
fsold = []
for mf in fs:
    fset, modelname, modeltype = mf
    try:
        model = NERmodel(featureset=fset, name=modelname, modeltype=modeltype)
        model.setName(training_set_name)
    except Exception:
        # Was a bare `except:` that hid the error (and also caught
        # KeyboardInterrupt); report it like the other stages do.
        print("Exception in user code:")
        print("-"*60)
        traceback.print_exc(file=sys.stdout)
        print("-"*60)
        writeConf(conf, modelname, "status", "error")
        continue
    models.append(model)
    fsold.append(mf)

fs = [mf[0] for mf in fsold]
print("models:")
print(models)
print()


# train all models
models_pickles = [None]*len(models)
print(len(models_pickles))
print(models_pickles)
for i, model in enumerate(models):
    # Look up the conf key of THIS model. The loop used to reuse whatever
    # `modelname` the init loop left behind, so failures were always recorded
    # against the last initialized model.
    modelname = fsold[i][1]
    try:
        model.trainFeatureExtraction(
            conf["trainingFolder"],
            limit=50)
        model.saveTrainingFeatures(conf["savingFolder"] +"/" +conf["session"]+"-"+model.name+".json")
        model.newModelPipeline(ertype=model.modeltype)
        models_pickles[i] = model.saveModelPipeline()
    except Exception:
        print("Exception in user code:",i)
        print("-"*60)
        traceback.print_exc(file=sys.stdout)
        print("-"*60)
        print("model "+str(i)+" has failed... ")
        models_pickles[i] = None

        writeConf(conf, modelname, "status", "error")


print("model pickles:")
print(models_pickles)
print()

# test all models
# PROBLEM: the same feature set is needed at test time, so maybe the
# pipeline should be pickled together with the feature set -- or the whole
# NERmodel should be pickled instead.
results_files = [None]*len(models)
evaluation_files = [None]*len(models)
for i, modelfile in enumerate(models_pickles):
    fset, modelname, modeltype = fsold[i]
    try:
        print(modeltype)
        # NOTE(review): modelfile is None when training failed for this
        # model; loadModelPipeline is still attempted as before -- confirm
        # whether such models should be skipped instead.
        model = NERmodel(featureset=fset, name=modelname, modeltype=modeltype)
        model.loadModelPipeline(filepath=modelfile)
        model.testFeatureExtraction(
            conf["testFolder"],
            limit=40)
        model.predict()
        model.parsePredictionOutput(conf["savingFolder"]+'/task9.1_'+training_set_name+'_'+str(i)+'.txt')
        results_files[i] = conf["savingFolder"]+ "/" + conf["session"]+"-"+model.name+'-output.csv'
        print("model "+model.name+":")
        evaluation_files[i] = model.autoEvaluation()

        writeConf(conf, modelname, "status", "done")
    except Exception:
        print("Exception in user code:",i)
        print("-"*60)
        traceback.print_exc(file=sys.stdout)
        print("-"*60)

        writeConf(conf, modelname, "status", "error")






testyaml20180516
feature instances:
[(<CustomFeatures.MyFeatures object at 0x7ff692c2aeb8>, 'mod001', 'NER'), (<CustomFeatures.MyFeatures object at 0x7ff692b95128>, 'mod002', 'NER')]

models:
[<NERmodel.NERmodel object at 0x7ff692b98080>, <NERmodel.NERmodel object at 0x7ff692b982b0>]

2
[None, None]
model pickles:
['../data/models/svm-pipeline_20180517002106.pkl', '../data/models/svm-pipeline_20180517002113.pkl']

NER
NER
the model is NER
predict ertype=NER
model mod001:
None 
NER
NER
the model is NER
predict ertype=NER
model mod002:
None 


In [1]:
# generate feature subsets
from CustomFeatures import MyFeatures
from NERmodel import NERmodel
import subprocess
import traceback, sys
import pickle
import yaml
import os

def initConf(filepath=None):
    """
    Load a YAML configuration file and return its parsed content.

    Every plugin uses its own config file.

    Parameters
    ----------
    filepath : str, optional
        Path of the configuration file. When omitted (or otherwise falsy),
        ``models.conf`` in the current working directory is read.

    Returns
    -------
    object
        Whatever ``yaml.load`` builds from the document.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    if not filepath:
        filepath = os.getcwd()+'/models.conf'
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed the file it opened.
    with open(filepath, 'r') as f:
        # PyYAML >= 6 requires an explicit Loader. yaml.Loader keeps the
        # behaviour of the old implicit default.
        # NOTE(security): yaml.Loader can construct arbitrary Python objects.
        # Only load trusted configuration files, or switch to yaml.safe_load
        # once it is confirmed the configs contain plain YAML types only.
        return yaml.load(f, Loader=yaml.Loader)
    
def writeConfBack(conf, filepath=None):
    """
    Serialize ``conf`` as YAML and write it to disk.

    Parameters
    ----------
    conf : object
        Configuration to persist; passed unchanged to ``yaml.dump``.
    filepath : str, optional
        Destination file. When omitted (or otherwise falsy), ``models.conf``
        in the current working directory is overwritten.

    Raises
    ------
    OSError
        If the destination cannot be opened for writing.
    """
    if not filepath:
        filepath = os.getcwd()+'/models.conf'
    # Serialize BEFORE opening the file: mode 'w' truncates immediately, so a
    # failure inside yaml.dump would otherwise leave an empty config behind.
    serialized = yaml.dump(conf)
    with open(filepath,'w') as f:
        f.write(serialized)
        
     
def writeConf(conf, modelname, key, msg):
    """
    Set ``key`` to ``msg`` on the entry named ``modelname`` under
    ``conf["models"]`` and persist the whole configuration through
    ``writeConfBack``.

    Entries with a different name are skipped and cause no write. Any
    exception raised while updating or persisting an entry is printed to
    stdout (traceback framed by dashed rules) and is not re-raised.
    """
    for entry_name, entry in conf["models"].items():
        try:
            if modelname != entry_name:
                continue
            entry[key] = msg
            writeConfBack(conf)
        except Exception:
            rule = "-"*60
            print("Exception in user code:")
            print(rule)
            traceback.print_exc(file=sys.stdout)
            print(rule)
        

conf = initConf('debug_batch3.yaml')


# The session name labels this batch run.
training_set_name = conf["session"]
print(training_set_name)

# One (feature set, model key, model type, raw params) tuple per configured
# model; models whose feature set cannot be built are flagged "error".
fs = []
for modelname, params in conf["models"].items():
    try:
        mf = MyFeatures(params=params)
        entry = (mf, modelname, params["modeltype"], params)
        fs.append(entry)
        writeConf(conf, modelname, "status", "doing")
    except Exception:
        print("Exception in user code:")
        print("-"*60)
        traceback.print_exc(file=sys.stdout)
        print("-"*60)
        writeConf(conf, modelname, "status", "error")

print([entry[1] for entry in fs])

# Train, predict and evaluate each model in a single pass; the three result
# lists are indexed like `fs`.
models = []
models_pickles = [None]*len(fs)
results_files = [None]*len(fs)
evaluation_files = [None]*len(fs)
for i, mf in enumerate(fs):
    try:
        fset, modelname, modeltype, settings = mf
        # Optional per-model caps; None when the key is absent.
        limitTrain = settings.get("limitTraining")
        limitTest = settings.get("limitTest")

        model = NERmodel(featureset=fset, name=modelname, modeltype=modeltype)
        model.setName(training_set_name)
        models.append(model)

        # --- training ---
        model.trainFeatureExtraction(conf["trainingFolder"], limit=limitTrain)
        features_path = conf["savingFolder"] +"/" +conf["session"]+"-"+model.name+".json"
        model.saveTrainingFeatures(features_path)
        model.newModelPipeline(ertype=model.modeltype)
        modelfile = models_pickles[i] = model.saveModelPipeline()

        # --- prediction ---
        model.testFeatureExtraction(conf["testFolder"], limit=limitTest)
        model.predict()

        # debug
        print(model.predictionResults[:10])

        predictions_path = conf["savingFolder"]+'/task9.1_'+training_set_name+model.name+'_'+str(i)+'.txt'
        model.parsePredictionOutput(predictions_path)
        results_files[i] = conf["savingFolder"]+ "/" + conf["session"]+"-"+model.name+'-output.csv'

        # --- evaluation ---
        print("model "+model.name+":")
        evaluation_file, accuracy = model.autoEvaluation()
        evaluation_files[i] = evaluation_file
        writeConf(conf, modelname, "accuracy", accuracy)

        writeConf(conf, modelname, "status", "done")

    except Exception:
        print("Exception in user code:",i)
        print("-"*60)
        traceback.print_exc(file=sys.stdout)
        print("-"*60)
        print("model "+str(i)+" has failed... ")
        models_pickles[i] = None
        writeConf(conf, modelname, "status", "error")
         








bm3
['mod000', 'mod001', 'mod002']
target and models
['O', 'O', 'O']
[{'word': '[', 'lemma': '[', 'pos': 'Fca', 'sentenceid': 'DDI-MedLine.d134.s0', 'offsetstart': 0, 'offsetend': 1, 'drugtype': '', 'chunk': '', 'chunkGroup': '', 'isTitleCase': False, 'isUpperCase': False, 'isLowerCase': True, 'hasDigits': False, 'hasStrangeChars': '', 'moreThan10chars': '', 'lensuffix': 0, 'lenword': 9, 'wordStructure': '', 'prefix': '', 'suffix': '', 'lenprefix': 0}, {'word': 'Dose-time', 'lemma': 'dose-time', 'pos': 'NP', 'sentenceid': 'DDI-MedLine.d134.s0', 'offsetstart': 1, 'offsetend': 10, 'drugtype': '', 'chunk': '', 'chunkGroup': '', 'isTitleCase': False, 'isUpperCase': False, 'isLowerCase': True, 'hasDigits': False, 'hasStrangeChars': '', 'moreThan10chars': '', 'lensuffix': 1, 'lenword': 7, 'wordStructure': '', 'prefix': '', 'suffix': 's', 'lenprefix': 0}]

Exception in user code: 0
------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython

0.0 


In [1]:
# generate feature subsets
from CustomFeatures import MyFeatures
from NERmodel import NERmodel
import subprocess
import traceback, sys
import pickle
import yaml
import os

# Quick smoke test: runs fast, although it yields no matches.
batch_file = '../data/batches/debug_batch4.yaml'
NERmodel.batchTraining(batch_file)



bm2debug
['mod001', 'mod002', 'mod003', 'mod004', 'mod005']
[{'biotag': 'O',
  'chunk_word': '',
  'drugtype': '',
  'hasDigits_before1': '',
  'hasStrangeChars_before1': '',
  'isLowerCase_before1': '',
  'isTitleCase_before1': '',
  'lookup_word': False,
  'offsetend': 1,
  'offsetstart': 0,
  'pos_after1': 'NP',
  'pos_after2': 'NNS',
  'pos_after3': 'IN',
  'pos_before1': '',
  'pos_before2': '',
  'pos_before3': '',
  'sentenceid': 'DDI-MedLine.d134.s0',
  'word': '[',
  'wordStructure_word': '_'}]
[{'biotag': 'O',
  'chunk_word': 'NE',
  'drugtype': '',
  'hasDigits_before1': '',
  'hasStrangeChars_before1': '',
  'isLowerCase_before1': '',
  'isTitleCase_before1': '',
  'lookup_word': False,
  'offsetend': 4,
  'offsetstart': 0,
  'pos_after1': 'NN',
  'pos_after2': 'NNS',
  'pos_after3': 'VBP',
  'pos_before1': '',
  'pos_before2': '',
  'pos_before3': '',
  'sentenceid': 'DDI-DrugBank.d749.s0',
  'word': 'Drug',
  'wordStructure_word': 'Xx'}]
 non O tags:
0
matchdict
0
predict

0.0 


Exception in user code: 2
------------------------------------------------------------
Traceback (most recent call last):
  File "/media/disk/home/pau/Projectes/AHLT-mai/src/NERmodel.py", line 410, in batchTraining
    NERmodel.writeConf(conf,modelname,"status", "done" ).predictionResultFile+'_result'
AttributeError: 'NoneType' object has no attribute 'predictionResultFile'
------------------------------------------------------------
model 2 has failed... 
[{'biotag': 'O',
  'drugtype': '',
  'hasDigits_after1': False,
  'hasDigits_after2': False,
  'hasDigits_after3': False,
  'hasDigits_before1': '',
  'hasDigits_before3': '',
  'hasDigits_word': False,
  'hasStrangeChars_after1': True,
  'hasStrangeChars_before1': '',
  'hasStrangeChars_before2': '',
  'hasStrangeChars_word': True,
  'isLowerCase_after2': True,
  'isLowerCase_after3': True,
  'isLowerCase_before1': '',
  'isLowerCase_before2': '',
  'isLowerCase_word': False,
  'isTitleCase_after1': True,
  'isTitleCase_after

In [1]:
# generate feature subsets
from CustomFeatures import MyFeatures
from NERmodel import NERmodel
import subprocess
import traceback, sys
import pickle
import yaml
import os

# Quick check of the result parsing (fastest batch in which matches are found).
batch_file = '../data/batches/debug_batch3.yaml'
NERmodel.batchTraining(batch_file)



bm3
['mod000', 'mod001', 'mod002']
[{'biotag': 'O',
  'drugtype': '',
  'hasDigits_after1': False,
  'hasDigits_after2': False,
  'hasDigits_after3': False,
  'hasDigits_before1': '',
  'hasDigits_before3': '',
  'hasDigits_word': False,
  'hasStrangeChars_after1': True,
  'hasStrangeChars_before1': '',
  'hasStrangeChars_before2': '',
  'hasStrangeChars_word': True,
  'isLowerCase_after2': True,
  'isLowerCase_after3': True,
  'isLowerCase_before1': '',
  'isLowerCase_before2': '',
  'isLowerCase_word': False,
  'isTitleCase_after1': True,
  'isTitleCase_after2': False,
  'isTitleCase_after3': False,
  'isTitleCase_before1': '',
  'isTitleCase_before3': '',
  'isTitleCase_word': False,
  'isUpperCase_after1': False,
  'isUpperCase_after2': False,
  'isUpperCase_after3': False,
  'isUpperCase_before1': '',
  'isUpperCase_before2': '',
  'isUpperCase_word': False,
  'lemma_after1': 'dose-time',
  'lenprefix_after1': 0,
  'lenprefix_before1': '',
  'lensuffix_after1': 0,
  'lensuffix_wor

 non O tags:
121
matchdict
64
[[43, 52, 'estradiol', 'null', 'B'], [62, 74, 'progesterone', 'null', 'B']]
predictions length 121
[['DDI-DrugBank.d772.s1', 43, 52, 'estradiol', 'null'], ['DDI-DrugBank.d772.s1', 62, 74, 'progesterone', 'null']]
DDI-DrugBank.d772.s1|43-52|estradiol|null
DDI-DrugBank.d772.s1|62-74|progesterone|null
DDI-DrugBank.d646.s0|79-88|rifaximin|null
...
model bm3:
0.3980263157894737 




In [1]:
# generate feature subsets
from CustomFeatures import MyFeatures
from NERmodel import NERmodel
import subprocess
import traceback, sys
import pickle
import yaml
import os

# Run the manual-accuracy batch test against a previously saved pipeline.
batch_file = '../data/batches/debug_batch5.yaml'
saved_pipeline = '../data/models/svm-pipeline_20180518151600.pkl'
NERmodel.batchTestManualAccuracy(batch_file, modelfilepath=saved_pipeline)



bm3
['mod002']
model bm3:
accuracy 0.29008746355685133 precision 0.88 recall 0.29 F1 0.44
accuracy 0.3046757164404223 total 663 cor 202 par 14 mis 447 
accuracy 0.2880116959064328 total 684 cor 197 par 18 mis 469 




In [1]:
# generate feature subsets
from CustomFeatures import MyFeatures
from NERmodel import NERmodel
import subprocess
import traceback, sys
import pickle
import yaml
import os

# Best models: run the two "bestmodels" batch files one after the other.
best_models_batch = '../data/batches/debug_bestmodels20180517.yaml'
best_models_ner_batch = '../data/batches/debug_bestmodels20180517NER.yaml'
NERmodel.batchTraining(best_models_batch)
NERmodel.batchTraining(best_models_ner_batch)




bm
['mod001', 'mod002', 'mod003']
 non O tags:
196
matchdict
85
[[0, 10, 'Pegaptanib', 'null', 'B']]
predictions length 196
[['DDI-DrugBank.d749.s1', 0, 10, 'Pegaptanib', 'null'], ['DDI-DrugBank.d749.s2', 170, 180, 'pegaptanib', 'null']]
DDI-DrugBank.d749.s1|0-10|Pegaptanib|null
DDI-DrugBank.d749.s2|170-180|pegaptanib|null
DDI-DrugBank.d772.s1|43-52|estradiol|null
...
model bm:
0.0 


 non O tags:
195
matchdict
85
[[0, 10, 'Pegaptanib', 'null', 'B']]
predictions length 195
[['DDI-DrugBank.d749.s1', 0, 10, 'Pegaptanib', 'null'], ['DDI-DrugBank.d749.s2', 170, 180, 'pegaptanib', 'null']]
DDI-DrugBank.d749.s1|0-10|Pegaptanib|null
DDI-DrugBank.d749.s2|170-180|pegaptanib|null
DDI-DrugBank.d772.s1|43-52|estradiol|null
...
model bm:
0.0 


 non O tags:
194
matchdict
85
[[0, 10, 'Pegaptanib', 'null', 'B']]
predictions length 194
[['DDI-DrugBank.d749.s1', 0, 10, 'Pegaptanib', 'null'], ['DDI-DrugBank.d749.s2', 170, 180, 'pegaptanib', 'null']]
DDI-DrugBank.d749.s1|0-10|Pegaptanib|null
DDI-DrugBank

In [None]:
# generate feature subsets
from CustomFeatures import MyFeatures
from NERmodel import NERmodel
import subprocess
import traceback, sys
import pickle
import yaml
import os

# Feature combinations, ER batch first.
combinations_er_batch = '../data/batches/combinationsER20180517.yaml'
NERmodel.batchTraining(combinations_er_batch)
# NER run: currently the best-models-vs-NER batch; the full NER combinations
# batch ('../data/batches/combinationsNER20180517.yaml') is disabled.
best_vs_ner_batch = '../data/batches/debug_bestmodels_vs_NER.yaml'
NERmodel.batchTraining(best_vs_ner_batch)



combER0517
['mod000', 'mod001', 'mod002', 'mod003', 'mod004', 'mod005', 'mod006', 'mod007', 'mod008', 'mod009', 'mod010', 'mod011', 'mod012', 'mod013', 'mod014', 'mod015', 'mod016', 'mod017', 'mod019', 'mod020', 'mod021', 'mod022', 'mod023', 'mod024', 'mod025', 'mod026', 'mod027', 'mod028', 'mod029', 'mod030', 'mod031', 'mod032', 'mod033', 'mod034', 'mod035', 'mod036', 'mod037', 'mod038', 'mod039', 'mod040', 'mod041', 'mod042', 'mod043', 'mod044', 'mod045', 'mod046', 'mod047', 'mod048', 'mod049', 'mod050', 'mod051', 'mod052', 'mod053', 'mod054', 'mod055', 'mod056', 'mod057', 'mod058', 'mod059', 'mod060', 'mod061', 'mod062']
[{'biotag': 'O',
  'drugtype': '',
  'offsetend': 1,
  'offsetstart': 0,
  'pos_after1': 'NP',
  'pos_before1': '',
  'pos_word': 'Fca',
  'sentenceid': 'DDI-MedLine.d134.s0',
  'word': '['}]
[{'biotag': 'O',
  'drugtype': '',
  'offsetend': 4,
  'offsetstart': 0,
  'pos_after1': 'NN',
  'pos_before1': '',
  'pos_word': 'NN',
  'sentenceid': 'DDI-DrugBank.d749.s0',


In [1]:
# generate feature subsets
from CustomFeatures import MyFeatures
from NERmodel import NERmodel
import subprocess
import traceback, sys
import pickle
import yaml
import os


# Run the best-models-vs-NER batch on its own.
batch_file = '../data/batches/debug_bestmodels_vs_NER.yaml'
NERmodel.batchTraining(batch_file)









bm


KeyboardInterrupt: 