# hansard_gold_standard_evaluation

Extract POL entities from Hansard data (9 documents sample).

Compare to gold standard and calculate Coverage and Confidence measures.

V1.1 - remove leading "the" / "The" from Spacy predictions, to put predictions on a par with how NLTK and Stanford extract entities, and how the gold standard is created (i.e. without leading "the"'s).

V1.4 - Adding bespoke trained model, training on Motor Neurone document (therefore need to change evaluation to be based on only the other 9 documents).

V1.5 - Removed Event entities from the comparison (as some compared methods don't really handle it). Added run time metrics.



In [17]:
##### Imports #####

import csv, re, itertools, os
import pandas as pd
import nltk, time

# Import spacy and English models
import spacy, numpy

# Load English Spacy module
nlp = spacy.load('en')

# Stanford imports
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer

# NLTK imports
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree





##### Stanford set up #####

#Set core path for Stanford NLP packages
main_path = os.path.join("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\StanfordNLP\\", "stanford-corenlp-full-2016-10-31\\")
# Set paths where the Standford NLP .jar files are located
pathlist = [os.path.join(main_path,"stanford-corenlp-3.7.0"),
            os.path.join(main_path,"ner\\stanford-ner.jar"),
            os.path.join(main_path,"postagger\\stanford-postagger.jar")]
    
# Set path to Stanford models
mpath = [os.path.join(main_path,"postagger\\models"), os.path.join(main_path,"ner\\classifiers")]

# Set path to java.exe
javapath = "C:\\Program Files\\Java\\jre1.8.0_121\\bin\\java.exe"

# Add paths to the CLASSPATH environmental variable (as instructed by NLTK)
os.environ['CLASSPATH'] = os.pathsep.join(pathlist)
os.environ['STANFORD_MODELS'] = os.pathsep.join(mpath)
os.environ['JAVAHOME'] = javapath



### Define function to tag NER sentence with BIO tags
def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
    # Return BIO tagged sentence
    return bio_tagged_sent



# Combo tagger
class NERComboTagger(StanfordNERTagger):

    def __init__(self, *args, **kwargs):
        self.stanford_ner_models = kwargs['stanford_ner_models']
        kwargs.pop("stanford_ner_models")
        super(NERComboTagger,self).__init__(*args, **kwargs)

    @property
    def _cmd(self):
        return ['edu.stanford.nlp.ie.NERClassifierCombiner',
            '-ner.model',
            self.stanford_ner_models,
            '-textFile',
            self._input_file_path,
            '-outputFormat',
            self._FORMAT,
            '-tokenizerFactory',
            'edu.stanford.nlp.process.WhitespaceTokenizer',
            '-tokenizerOptions',
            '\"tokenizeNLs=false\"']


    
# Run the Stanford NLP for the Combo tagger
def stanford_routine(data_in, model):
    
    data_out = []

    tkn_sent = StanfordTokenizer().tokenize(data_in)
    tag_sent = model.tag(tkn_sent) 
    bio_tagged_sent = stanfordNE2BIO(tag_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]
    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)

    # Get entities from the trees
    for subtree in ne_tree:
        if type(subtree) == Tree:
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            if ne_label in ('PERSON', 'ORGANISATION', 'ORGANIZATION', 'LOCATION', 'GSP', 'GPE'):
                data_out.append([ne_string, ne_label])
        
    return data_out



##### Functions used later on #####

def get_ents(ne_tree):
    ne_in_sent = []
    for subtree in ne_tree:
        if type(subtree) == nltk.tree.Tree: # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    # Return list of entity tuples
    return ne_in_sent



def accuracy_measures(data_in, label):
    
    output = []
    
    true_positive = data_in['Entity'].loc[(data_in['Predicted']=='Y') & (data_in['Actual']=='Y')].count()
    false_negative = data_in['Entity'].loc[(data_in['Predicted']=='N') & (data_in['Actual']=='Y')].count()
    false_positive = data_in['Entity'].loc[(data_in['Predicted']=='Y') & (data_in['Actual']=='N')].count()

    coverage = int(true_positive / (true_positive + false_negative) * 100) / 100
    confidence = int(true_positive / (true_positive + false_positive) * 100) / 100

    balanced_f1 = 2 * coverage * confidence / (coverage + confidence)
    
    output.append((label, round(coverage,3), round(confidence,3), round(balanced_f1,3)))

    print("\nAccuracy measures for ", label, ":")
    print(" Coverage =  {:.1%}".format(coverage), "\n Confidence = {:.1%}".format(confidence), "\n Balanced F1 = {:.1%}".format(balanced_f1))
    print(output)


##### Dictionary for mappings ######

dict_gold = {'PERSON':'Person', 'ORG':'Organisation', 'LOC':'Location', 'EVENT':'Event'}
dict_spacy = {'PERSON':'Person', 'NORP':'Organisation', 'ORG':'Organisation', 'FACILITY':'Location',\
              'GPE':'Location', 'LOC':'Location', 'EVENT':'Event'}
dict_nltk = {'PERSON':'Person', 'ORGANIZATION':'Organisation', 'GSP':'Organisation', 'GPE':'Location',\
             'FACILITY':'Location', 'LOCATION':'Location'}
dict_stanford = {'PERSON':'Person', 'ORGANIZATION':'Organisation', 'LOCATION':'Location', 'GSP':'Organisation', 'GPE':'Location'}
dict_bespoke = {'PERSON':'Person', 'ORGANISATION':'Organisation', 'LOCATION':'Location', 'EVENT':'Event'}
dict_combo = {'PERSON':'Person', 'ORGANISATION':'Organisation', 'ORGANIZATION':'Organisation', 'GSP':'Organisation', \
              'LOCATION':'Location', 'GPE':'Location', 'EVENT':'Event'}


## Main function

In [24]:
def hansard_gold_standard_eval(short_name, sample_name):

    ##### Read gold standard file into DataFrame - n.b. need to use tab separated, as some entities contain commas #####
    goldstandard_df =  pd.read_csv("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\gold_standard_%s.txt"\
                                  %short_name, sep='\t')
    goldstandard_df.rename(columns={'Value':'Entity'}, inplace=True)
    goldstandard_df['Entity'] = goldstandard_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x)
    goldstandard_df = goldstandard_df.drop_duplicates().sort_values("Entity")
    goldstandard_df.drop(goldstandard_df[goldstandard_df[' Type'] == 'EVENT'].index, inplace=True)


    #####Read in sample file #####
    samplefilepath = "C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\%s.txt" %sample_name
    sample = open(samplefilepath, 'r', encoding='utf-8').read()


    ##### NLP Method 1: Spacy #####
    start_time = time.time()
    nlpd = nlp(sample)
    nlpd.ents = [ent for ent in nlpd.ents   if ent.label_ in ('PERSON', 'NORP', 'ORG', 'FACILITY', 'GPE', 'LOC')]

    extracted_spacy = []

    for row in nlpd.ents:
        joined = ""
        for item in row:
            spaced = str(item) + " "
            joined += spaced
        if joined not in extracted_spacy:
            extracted_spacy.append((joined, row.label_))
    print(short_name, ": Spacy: %0.1f seconds " % (time.time() - start_time))
        
    # Convert to DataFrame
    extracted_spacy_df = pd.DataFrame(extracted_spacy)
    extracted_spacy_df.columns = ['Entity', 'Type']
    extracted_spacy_df['Entity'] = extracted_spacy_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x)
    extracted_spacy_df = extracted_spacy_df.drop_duplicates().sort_values("Entity")


    ##### NLP Method 2: NLTK #####
    start_time = time.time()
    tokenized_sentences = nltk.sent_tokenize(sample)
    tokenized_words = [nltk.word_tokenize(sent) for sent in tokenized_sentences]
    postagged_words = [nltk.pos_tag(sent) for sent in tokenized_words]

    chunked=[]

    for word in postagged_words:
        chunk = nltk.ne_chunk(word, binary=False)
        chunked.append(chunk)
        
    ents_nltk = [get_ents(tree) for tree in chunked]
    ents_list_nltk = list(itertools.chain.from_iterable(ents_nltk))
    extracted_list_nltk = []

    for row in ents_list_nltk:
        if (row[1] not in ('DATE', 'TIME', 'MONEY', 'PERCENT') and row not in extracted_list_nltk):
            extracted_list_nltk.append(row)
    print(short_name, ": NLTK: %0.1f seconds " % (time.time() - start_time))
              
    extracted_nltk_df = pd.DataFrame(extracted_list_nltk)
    extracted_nltk_df.columns = ['Entity', 'Type']
    extracted_nltk_df['Entity'] = extracted_nltk_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x)
    extracted_nltk_df = extracted_nltk_df.drop_duplicates().sort_values("Entity")
        

    
    ##### NLP Method 3: Stanford #####
    extracted_stanford = []
    start_time = time.time()
    tkn_sent = StanfordTokenizer().tokenize(sample)
    tag_sent = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz').tag(tkn_sent) 
    bio_tagged_sent = stanfordNE2BIO(tag_sent)
    
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]
    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)

    # Get entities from the trees
    for subtree in ne_tree:
        if type(subtree) == Tree:
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            if ne_label in ('PERSON', 'ORGANISATION', 'ORGANIZATION', 'LOCATION', 'GSP', 'GPE'):
                extracted_stanford.append([ne_string, ne_label])
    print(short_name, ": Stanford CoreNLP: %0.1f seconds " % (time.time() - start_time))

    # Convert to DataFrame
    extracted_stanford_df = pd.DataFrame(extracted_stanford)
    extracted_stanford_df.columns = ['Entity', 'Type']
    extracted_stanford_df['Entity'] = extracted_stanford_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x)
    extracted_stanford_df = extracted_stanford_df.drop_duplicates().sort_values("Entity")            
            
            
                
    ##### NLP method 4: bespoke Stanford model, trained on Hansard Motor Neurone debate #####
    
    # Modify classpaths to include new model file
    pathlist = [os.path.join(main_path,"stanford-corenlp-3.7.0"),
                os.path.join(main_path,"ner\\stanford-ner.jar"),
                os.path.join(main_path,"postagger\\stanford-postagger.jar"),
                os.path.join(main_path,"ner\\ner-model-hansard-mn.ser.gz")]

    mpath = [os.path.join(main_path,"postagger\\models"), os.path.join(main_path,"ner\\classifiers"), os.path.join(main_path,"ner")]

    os.environ['CLASSPATH'] = os.pathsep.join(pathlist)
    os.environ['STANFORD_MODELS'] = os.pathsep.join(mpath)
    
    extracted_bespoke = []
    start_time = time.time()
    tkn_sent = StanfordTokenizer().tokenize(sample)
    tag_sent = StanfordNERTagger('ner-model-hansard-mn.ser.gz').tag(tkn_sent) 
    bio_tagged_sent = stanfordNE2BIO(tag_sent)
    
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)

    # Get entities from the trees
    for subtree in ne_tree:
        if type(subtree) == Tree:
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            if ne_label in ('PERSON', 'ORGANISATION', 'LOCATION'):
                extracted_bespoke.append([ne_string, ne_label])
    print(short_name, ": Stanford Bespoke: %0.1f seconds " % (time.time() - start_time))

    extracted_bespoke_df = pd.DataFrame(extracted_bespoke)
    extracted_bespoke_df.columns = ['Entity', 'Type']
    extracted_bespoke_df = extracted_bespoke_df.drop_duplicates().sort_values("Entity")

    
    
    ##### NLP method 5: combining bespoke Stanford model trained on Hansard Motor Neurone debate with regular Stanford model #####
        
    classifier_path1 = "C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\StanfordNLP\\stanford-corenlp-full-2016-10-31\\ner\\ner-model-hansard-mn.ser.gz"
    classifier_path2 = "C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\StanfordNLP\\stanford-corenlp-full-2016-10-31\\ner\\classifiers\\english.conll.4class.distsim.crf.ser.gz"
    ner_jar_path = "C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\StanfordNLP\\stanford-corenlp-full-2016-10-31\\ner\\stanford-ner.jar"
    start_time = time.time()
    st12 = NERComboTagger(classifier_path1, ner_jar_path, stanford_ner_models = classifier_path1+","+classifier_path2)
    extracted_combo12 = stanford_routine(sample, st12)
    print(short_name, ": Stanford Combo 12: %0.1f seconds " % (time.time() - start_time))
    start_time = time.time()
    st21 = NERComboTagger(classifier_path1, ner_jar_path, stanford_ner_models = classifier_path2+","+classifier_path1)    
    extracted_combo21 = stanford_routine(sample, st21)
    print(short_name, ": Stanford Combo 21: %0.1f seconds " % (time.time() - start_time))
                
    extracted_combo12_df = pd.DataFrame(extracted_combo12)
    extracted_combo12_df.columns = ['Entity', 'Type']
    extracted_combo12_df = extracted_combo12_df.drop_duplicates().sort_values("Entity")
    extracted_combo21_df = pd.DataFrame(extracted_combo21)
    extracted_combo21_df.columns = ['Entity', 'Type']
    extracted_combo21_df = extracted_combo21_df.drop_duplicates().sort_values("Entity")
    
    
    ##### Compare extracted to gold standard #####

    # Prep columns
    goldstandard_df['Ent_type'] = goldstandard_df[' Type'].map(dict_gold)
    extracted_spacy_df['Ent_type'] = extracted_spacy_df['Type'].map(dict_spacy)
    extracted_nltk_df['Ent_type'] = extracted_nltk_df['Type'].map(dict_nltk)
    extracted_stanford_df['Ent_type'] = extracted_stanford_df['Type'].map(dict_stanford)
    extracted_bespoke_df['Ent_type'] = extracted_bespoke_df['Type'].map(dict_bespoke)
    extracted_combo12_df['Ent_type'] = extracted_combo12_df['Type'].map(dict_combo)
    extracted_combo21_df['Ent_type'] = extracted_combo21_df['Type'].map(dict_combo)

    
    goldstandard_df['Actual'] = 'Y'
    extracted_spacy_df['Predicted'] = 'Y'
    extracted_nltk_df['Predicted'] = 'Y'
    extracted_stanford_df['Predicted'] = 'Y'
    extracted_bespoke_df['Predicted'] = 'Y'
    extracted_combo12_df['Predicted'] = 'Y'
    extracted_combo21_df['Predicted'] = 'Y'
    
    goldstandard_df['Entity'] = goldstandard_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x).drop_duplicates()
    extracted_spacy_df['Entity'] = extracted_spacy_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x).drop_duplicates()
    extracted_nltk_df['Entity'] = extracted_nltk_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x).drop_duplicates()
    extracted_stanford_df['Entity'] = extracted_stanford_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x).drop_duplicates()
    extracted_bespoke_df['Entity'] = extracted_bespoke_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x).drop_duplicates()   
    extracted_combo12_df['Entity'] = extracted_combo12_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x).drop_duplicates()   
    extracted_combo21_df['Entity'] = extracted_combo21_df['Entity'].str.strip().\
                            apply(lambda x: x[4:] if x.startswith("the ") or x.startswith("The ") else x).drop_duplicates()   

    
    # Merge to compare to gold standard
    merged_spacy_df = pd.merge(goldstandard_df, extracted_spacy_df, on=['Entity', 'Ent_type'], how='outer').fillna('N')
    merged_nltk_df = pd.merge(goldstandard_df, extracted_nltk_df, on=['Entity', 'Ent_type'], how='outer').fillna('N')
    merged_stanford_df = pd.merge(goldstandard_df, extracted_stanford_df, on=['Entity', 'Ent_type'], how='outer').fillna('N')
    merged_bespoke_df = pd.merge(goldstandard_df, extracted_bespoke_df, on=['Entity', 'Ent_type'], how='outer').fillna('N')
    merged_combo12_df = pd.merge(goldstandard_df, extracted_combo12_df, on=['Entity', 'Ent_type'], how='outer').fillna('N')
    merged_combo21_df = pd.merge(goldstandard_df, extracted_combo21_df, on=['Entity', 'Ent_type'], how='outer').fillna('N')   

    # Do actual vs. predicted crosstab matrices
    print(pd.crosstab(merged_spacy_df['Predicted'], merged_spacy_df['Actual']))
    print(pd.crosstab(merged_nltk_df['Predicted'], merged_nltk_df['Actual']))
    print(pd.crosstab(merged_stanford_df['Predicted'], merged_stanford_df['Actual']))
    print(pd.crosstab(merged_bespoke_df['Predicted'], merged_bespoke_df['Actual']))
    print(pd.crosstab(merged_combo12_df['Predicted'], merged_combo12_df['Actual']))
    print(pd.crosstab(merged_combo21_df['Predicted'], merged_combo21_df['Actual']))
    
    ##### Calculate Coverage, Confidence and Balanced F1 measures #####
    accuracy_measures(merged_spacy_df, "Spacy")
    accuracy_measures(merged_nltk_df, "NLTK")
    accuracy_measures(merged_stanford_df, "Stanford")
    accuracy_measures(merged_bespoke_df, "Bespoke")
    accuracy_measures(merged_combo12_df, "Combo12")
    accuracy_measures(merged_combo21_df, "Combo21")

    ##### Output to CSV #####
    merged_spacy_df['Method'] = 'Spacy'
    merged_nltk_df['Method'] = 'NLTK'
    merged_stanford_df['Method'] = 'Stanford'
    merged_bespoke_df['Method'] = 'Bespoke'
    merged_combo12_df['Method'] = 'Combo12'
    merged_combo21_df['Method'] = 'Combo21'
    merged_all_df = pd.concat([merged_spacy_df, merged_nltk_df, merged_stanford_df, merged_bespoke_df, merged_combo12_df, merged_combo21_df])
    merged_all_df['Sample'] = sample_name
    merged_all_df.to_csv("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\gold_standard_eval_1_5_%s.txt"\
                           %short_name, sep='\t', index=False)
    
    ##### Save DataFrame to disk #####
    merged_all_df.to_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_%s"%short_name)



## Call main function

In [3]:
### Motor Neurone not needed, as used to create bespoke trained NER model
# hansard_gold_standard_eval('motor_neurone_disease', 'Motor Neurone Disease-Gordon Aikman 2017-02-20')

In [23]:
hansard_gold_standard_eval('armed_forces_historical_cases', 'Armed Forces- Historical Cases 2017-02-23')

armed_forces_historical_cases : Spacy 5.5 seconds 
armed_forces_historical_cases : NLTK --- 38.754995584487915 seconds ---
armed_forces_historical_cases : Stanford CoreNLP --- 20.4059476852417 seconds ---
armed_forces_historical_cases : Stanford Bespoke --- 17.079110383987427 seconds ---
armed_forces_historical_cases : Stanford Combo 12 --- 69.9566650390625 seconds ---
armed_forces_historical_cases : Stanford Combo 21 --- 63.22291970252991 seconds ---
Actual       N    Y
Predicted          
N            0  150
Y          190  174
Actual       N    Y
Predicted          
N            0  180
Y          242  146
Actual       N    Y
Predicted          
N            0  129
Y          132  193
Actual       N    Y
Predicted          
N            0  186
Y          215  136
Actual       N    Y
Predicted          
N            0  176
Y          229  147
Actual       N    Y
Predicted          
N            0  128
Y          147  193

Accuracy measures for  Spacy :
 Coverage =  53.0% 
 Confidence 

In [25]:
hansard_gold_standard_eval('aster_housing_association', 'Aster Group Housing Association 2017-02-22')

aster_housing_association : Spacy: 1.2 seconds 
aster_housing_association : NLTK: 6.7 seconds 
aster_housing_association : Stanford CoreNLP: 9.6 seconds 
aster_housing_association : Stanford Bespoke: 6.5 seconds 
aster_housing_association : Stanford Combo 12: 22.2 seconds 
aster_housing_association : Stanford Combo 21: 25.3 seconds 
Actual      N   Y
Predicted        
N           0  25
Y          36  21
Actual      N   Y
Predicted        
N           0  31
Y          50  15
Actual      N   Y
Predicted        
N           0  25
Y          21  21
Actual      N   Y
Predicted        
N           0  26
Y          33  20
Actual      N   Y
Predicted        
N           0  26
Y          35  20
Actual      N   Y
Predicted        
N           0  23
Y          23  23

Accuracy measures for  Spacy :
 Coverage =  45.0% 
 Confidence = 36.0% 
 Balanced F1 = 40.0%
[('Spacy', 0.45, 0.36, 0.4)]

Accuracy measures for  NLTK :
 Coverage =  32.0% 
 Confidence = 23.0% 
 Balanced F1 = 26.8%
[('NLTK', 0.32, 0

In [26]:
hansard_gold_standard_eval('high_speed_rail', 'High Speed Rail (London - West Midlands) Bill 2017-02-20')

high_speed_rail : Spacy: 2.4 seconds 
high_speed_rail : NLTK: 15.9 seconds 
high_speed_rail : Stanford CoreNLP: 10.9 seconds 
high_speed_rail : Stanford Bespoke: 7.7 seconds 
high_speed_rail : Stanford Combo 12: 34.7 seconds 
high_speed_rail : Stanford Combo 21: 31.5 seconds 
Actual       N   Y
Predicted         
N            0  98
Y          115  77
Actual       N    Y
Predicted          
N            0  108
Y          147   68
Actual       N   Y
Predicted         
N            0  94
Y          100  76
Actual       N    Y
Predicted          
N            0  107
Y          129   63
Actual       N   Y
Predicted         
N            0  97
Y          135  77
Actual       N   Y
Predicted         
N            0  91
Y          106  80

Accuracy measures for  Spacy :
 Coverage =  44.0% 
 Confidence = 40.0% 
 Balanced F1 = 41.9%
[('Spacy', 0.44, 0.4, 0.419)]

Accuracy measures for  NLTK :
 Coverage =  38.0% 
 Confidence = 31.0% 
 Balanced F1 = 34.1%
[('NLTK', 0.38, 0.31, 0.341)]

Accuracy me

In [27]:
hansard_gold_standard_eval('jamal_al_harith', 'Jamal al-Harith 2017-02-23')

jamal_al_harith : Spacy: 1.4 seconds 
jamal_al_harith : NLTK: 10.8 seconds 
jamal_al_harith : Stanford CoreNLP: 9.3 seconds 
jamal_al_harith : Stanford Bespoke: 5.7 seconds 
jamal_al_harith : Stanford Combo 12: 22.7 seconds 
jamal_al_harith : Stanford Combo 21: 21.9 seconds 
Actual      N   Y
Predicted        
N           0  68
Y          96  53
Actual       N   Y
Predicted         
N            0  89
Y          123  32
Actual      N   Y
Predicted        
N           0  66
Y          67  55
Actual      N   Y
Predicted        
N           0  85
Y          88  36
Actual      N   Y
Predicted        
N           0  81
Y          94  40
Actual      N   Y
Predicted        
N           0  65
Y          74  56

Accuracy measures for  Spacy :
 Coverage =  43.0% 
 Confidence = 35.0% 
 Balanced F1 = 38.6%
[('Spacy', 0.43, 0.35, 0.386)]

Accuracy measures for  NLTK :
 Coverage =  26.0% 
 Confidence = 20.0% 
 Balanced F1 = 22.6%
[('NLTK', 0.26, 0.2, 0.226)]

Accuracy measures for  Stanford :
 Cover

In [28]:
hansard_gold_standard_eval('local_government_finance', 'Local Government Finance 2017-02-22')

local_government_finance : Spacy: 5.3 seconds 
local_government_finance : NLTK: 46.1 seconds 
local_government_finance : Stanford CoreNLP: 15.8 seconds 
local_government_finance : Stanford Bespoke: 13.4 seconds 
local_government_finance : Stanford Combo 12: 63.2 seconds 
local_government_finance : Stanford Combo 21: 56.8 seconds 
Actual       N    Y
Predicted          
N            0  140
Y          170  146
Actual       N    Y
Predicted          
N            0  175
Y          270  127
Actual       N    Y
Predicted          
N            0  136
Y          161  144
Actual       N    Y
Predicted          
N            0  165
Y          170  108
Actual       N    Y
Predicted          
N            0  157
Y          185  138
Actual       N    Y
Predicted          
N            0  136
Y          168  147

Accuracy measures for  Spacy :
 Coverage =  51.0% 
 Confidence = 46.0% 
 Balanced F1 = 48.4%
[('Spacy', 0.51, 0.46, 0.484)]

Accuracy measures for  NLTK :
 Coverage =  42.0% 
 Confidence 

In [29]:
hansard_gold_standard_eval('police_grant', 'Police Grant 2017-02-22')

police_grant : Spacy: 5.9 seconds 
police_grant : NLTK: 34.3 seconds 
police_grant : Stanford CoreNLP: 13.6 seconds 
police_grant : Stanford Bespoke: 11.2 seconds 
police_grant : Stanford Combo 12: 44.9 seconds 
police_grant : Stanford Combo 21: 45.7 seconds 
Actual       N    Y
Predicted          
N            0  112
Y          149  103
Actual       N    Y
Predicted          
N            0  149
Y          215   66
Actual       N    Y
Predicted          
N            0  118
Y          146   97
Actual       N    Y
Predicted          
N            0  136
Y          161   79
Actual       N    Y
Predicted          
N            0  132
Y          179   83
Actual       N    Y
Predicted          
N            0  120
Y          158   95

Accuracy measures for  Spacy :
 Coverage =  47.0% 
 Confidence = 40.0% 
 Balanced F1 = 43.2%
[('Spacy', 0.47, 0.4, 0.432)]

Accuracy measures for  NLTK :
 Coverage =  30.0% 
 Confidence = 23.0% 
 Balanced F1 = 26.0%
[('NLTK', 0.3, 0.23, 0.26)]

Accuracy measu

In [30]:
hansard_gold_standard_eval('trump_state_visit', 'President Trump- State Visit 2017-02-20')

trump_state_visit : Spacy: 5.7 seconds 
trump_state_visit : NLTK: 48.9 seconds 
trump_state_visit : Stanford CoreNLP: 16.0 seconds 
trump_state_visit : Stanford Bespoke: 13.2 seconds 
trump_state_visit : Stanford Combo 12: 57.4 seconds 
trump_state_visit : Stanford Combo 21: 55.8 seconds 
Actual       N    Y
Predicted          
N            0  155
Y          217  191
Actual       N    Y
Predicted          
N            0  190
Y          281  155
Actual       N    Y
Predicted          
N            0  143
Y          163  199
Actual       N    Y
Predicted          
N            0  213
Y          269  131
Actual       N    Y
Predicted          
N            0  203
Y          279  142
Actual       N    Y
Predicted          
N            0  140
Y          174  200

Accuracy measures for  Spacy :
 Coverage =  55.0% 
 Confidence = 46.0% 
 Balanced F1 = 50.1%
[('Spacy', 0.55, 0.46, 0.501)]

Accuracy measures for  NLTK :
 Coverage =  44.0% 
 Confidence = 35.0% 
 Balanced F1 = 39.0%
[('NLTK', 0.

In [31]:
hansard_gold_standard_eval('unaccompanied_children', 'Unaccompanied Children  (Greece and Italy) 2017-02-23')

unaccompanied_children : Spacy: 5.6 seconds 
unaccompanied_children : NLTK: 48.1 seconds 
unaccompanied_children : Stanford CoreNLP: 16.7 seconds 
unaccompanied_children : Stanford Bespoke: 13.7 seconds 
unaccompanied_children : Stanford Combo 12: 60.6 seconds 
unaccompanied_children : Stanford Combo 21: 61.1 seconds 
Actual       N    Y
Predicted          
N            0  132
Y          229  138
Actual       N    Y
Predicted          
N            0  171
Y          276   99
Actual       N    Y
Predicted          
N            0  125
Y          146  145
Actual       N    Y
Predicted          
N            0  172
Y          230   98
Actual       N    Y
Predicted          
N            0  167
Y          254  103
Actual       N    Y
Predicted          
N            0  122
Y          155  148

Accuracy measures for  Spacy :
 Coverage =  51.0% 
 Confidence = 37.0% 
 Balanced F1 = 42.9%
[('Spacy', 0.51, 0.37, 0.429)]

Accuracy measures for  NLTK :
 Coverage =  36.0% 
 Confidence = 26.0% 
 Ba

In [32]:
hansard_gold_standard_eval('vauxhall_opel_proposed_takeover', 'Vauxhall-Opel- Proposed Takeover 2017-02-20')

vauxhall_opel_proposed_takeover : Spacy: 1.4 seconds 
vauxhall_opel_proposed_takeover : NLTK: 11.3 seconds 
vauxhall_opel_proposed_takeover : Stanford CoreNLP: 9.1 seconds 
vauxhall_opel_proposed_takeover : Stanford Bespoke: 5.7 seconds 
vauxhall_opel_proposed_takeover : Stanford Combo 12: 22.9 seconds 
vauxhall_opel_proposed_takeover : Stanford Combo 21: 21.6 seconds 
Actual      N   Y
Predicted        
N           0  65
Y          87  60
Actual       N   Y
Predicted         
N            0  93
Y          134  32
Actual      N   Y
Predicted        
N           0  57
Y          63  68
Actual      N   Y
Predicted        
N           0  80
Y          77  45
Actual      N   Y
Predicted        
N           0  74
Y          83  51
Actual      N   Y
Predicted        
N           0  55
Y          63  70

Accuracy measures for  Spacy :
 Coverage =  48.0% 
 Confidence = 40.0% 
 Balanced F1 = 43.6%
[('Spacy', 0.48, 0.4, 0.436)]

Accuracy measures for  NLTK :
 Coverage =  25.0% 
 Confidence = 19.

## Consolidate all outputs into 1 file

In [33]:
# Read back on all pickle files and output together as one tab delimited txt file

df1 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_vauxhall_opel_proposed_takeover")
df2 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_unaccompanied_children")
df3 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_trump_state_visit")
df4 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_police_grant")
df5 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_local_government_finance")
df6 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_jamal_al_harith")
df7 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_high_speed_rail")
df8 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_aster_housing_association")
df9 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_armed_forces_historical_cases")
# df10 = pd.read_pickle("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\df_eval_1_5_motor_neurone_disease")

df1.drop(df1.columns[2], axis=1, inplace=True)
df3.drop(df3.columns[2], axis=1, inplace=True)
df4.drop(df4.columns[2], axis=1, inplace=True)
df5.drop(df5.columns[2], axis=1, inplace=True)
df6.drop(df6.columns[2], axis=1, inplace=True)
df7.drop(df7.columns[2], axis=1, inplace=True)
df8.drop(df8.columns[2], axis=1, inplace=True)
df9.drop(df9.columns[2], axis=1, inplace=True)
# df10.drop(df10.columns[2], axis=1, inplace=True)


df_list = [df1, df2, df3, df4, df5, df6, df7, df8, df9]
df_all = pd.concat(df_list, ignore_index=True)

df_all

Unnamed: 0,Entity,Type,Ent_type,Actual,Type.1,Predicted,Method,Sample
0,Aberdeen South,LOC,Location,Y,N,N,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20
1,Alison McGovern,PERSON,Person,Y,PERSON,Y,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20
2,Amber Valley,LOC,Location,Y,N,N,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20
3,Andrew Selous,PERSON,Person,Y,PERSON,Y,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20
4,Aston Martin,ORG,Organisation,Y,ORG,Y,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20
5,Automotive Council,ORG,Organisation,Y,ORG,Y,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20
6,Bishop Auckland,LOC,Location,Y,N,N,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20
7,Bristol East,LOC,Location,Y,N,N,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20
8,Britain,LOC,Location,Y,GPE,Y,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20
9,Calder Valley,LOC,Location,Y,N,N,Spacy,Vauxhall-Opel- Proposed Takeover 2017-02-20


In [34]:
df_all.to_csv("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Hansard data\\gold_standard_eval_1_5_all.txt"\
                    , sep='\t', index=False)

## "Any" Ensemble Method

Combine the extracted entitites from all 3 NLP libraries, and measure their accuracy.
Count a prediction as positive if ANY of the 3 methods predict it as positive.

In [35]:
list(df_all)

['Entity',
 ' Type',
 'Ent_type',
 'Actual',
 'Type',
 'Predicted',
 'Method',
 'Sample']

In [36]:
df_temp = df_all.copy()

del df_temp['Method']
del df_temp[' Type']
del df_temp['Type']

df_dedup = df_temp.sort_values(['Entity', 'Ent_type', 'Sample', 'Predicted'], ascending=[True, True, True, False])\
                    .drop_duplicates(subset=['Entity', 'Ent_type', 'Sample'], keep='first')

df_dedup

Unnamed: 0,Entity,Ent_type,Actual,Predicted,Sample
14196,,Location,N,Y,High Speed Rail (London - West Midlands) Bill ...
7827,,Location,N,Y,Police Grant 2017-02-22
16587,,Organisation,N,Y,Armed Forces- Historical Cases 2017-02-23
1527,,Organisation,N,Y,Unaccompanied Children (Greece and Italy) 201...
125,,Organisation,N,Y,Vauxhall-Opel- Proposed Takeover 2017-02-20
10196,,Person,N,Y,Local Government Finance 2017-02-22
4513,,Person,N,Y,President Trump- State Visit 2017-02-20
2988,'s Independent Anti-slavery Commissioner,Organisation,N,Y,Unaccompanied Children (Greece and Italy) 201...
18121,", Bloody Friday",Organisation,N,Y,Armed Forces- Historical Cases 2017-02-23
13523,", Border Force",Organisation,N,Y,Jamal al-Harith 2017-02-23


In [37]:
# Calculate accuracy measures
print(pd.crosstab(df_dedup['Predicted'], df_dedup['Actual']))

accuracy_measures(df_dedup, "Ensemble - Any")

Actual        N     Y
Predicted            
N             0   496
Y          3870  1371

Accuracy measures for  Ensemble - Any :
 Coverage =  73.0% 
 Confidence = 26.0% 
 Balanced F1 = 38.3%
[('Ensemble - Any', 0.73, 0.26, 0.383)]


## "All" Ensemble Method

Combine the extracted entitites from all 3 NLP libraries, and measure their accuracy.
Count a prediction as positive only if ALL of the 3 methods predict it as positive.

In [38]:
df_count = df_all[['Entity', 'Ent_type', 'Sample', 'Actual', 'Predicted', 'Method']].groupby(['Entity', 'Ent_type', 'Sample', 'Actual', 'Predicted']).count()

df_count.reset_index(inplace=True)

df_count.head(30)

Unnamed: 0,Entity,Ent_type,Sample,Actual,Predicted,Method
0,,Location,High Speed Rail (London - West Midlands) Bill ...,N,Y,1
1,,Location,Police Grant 2017-02-22,N,Y,1
2,,Organisation,Armed Forces- Historical Cases 2017-02-23,N,Y,1
3,,Organisation,Unaccompanied Children (Greece and Italy) 201...,N,Y,1
4,,Organisation,Vauxhall-Opel- Proposed Takeover 2017-02-20,N,Y,1
5,,Person,Local Government Finance 2017-02-22,N,Y,1
6,,Person,President Trump- State Visit 2017-02-20,N,Y,1
7,'s Independent Anti-slavery Commissioner,Organisation,Unaccompanied Children (Greece and Italy) 201...,N,Y,2
8,", Bloody Friday",Organisation,Armed Forces- Historical Cases 2017-02-23,N,Y,3
9,", Border Force",Organisation,Jamal al-Harith 2017-02-23,N,Y,2


In [39]:
df_count['Pred2'] = ((df_count['Predicted']=='Y') & ((df_count['Method']==6)))

dict_pred = {True:'Y', False:'N'}

df_count['Predicted'] = df_count['Pred2'].map(dict_pred)

df_count

Unnamed: 0,Entity,Ent_type,Sample,Actual,Predicted,Method,Pred2
0,,Location,High Speed Rail (London - West Midlands) Bill ...,N,N,1,False
1,,Location,Police Grant 2017-02-22,N,N,1,False
2,,Organisation,Armed Forces- Historical Cases 2017-02-23,N,N,1,False
3,,Organisation,Unaccompanied Children (Greece and Italy) 201...,N,N,1,False
4,,Organisation,Vauxhall-Opel- Proposed Takeover 2017-02-20,N,N,1,False
5,,Person,Local Government Finance 2017-02-22,N,N,1,False
6,,Person,President Trump- State Visit 2017-02-20,N,N,1,False
7,'s Independent Anti-slavery Commissioner,Organisation,Unaccompanied Children (Greece and Italy) 201...,N,N,2,False
8,", Bloody Friday",Organisation,Armed Forces- Historical Cases 2017-02-23,N,N,3,False
9,", Border Force",Organisation,Jamal al-Harith 2017-02-23,N,N,2,False


In [40]:
df_dedup2 = df_count.sort_values(['Entity', 'Ent_type', 'Sample', 'Predicted'], ascending=[True, True, True, False])\
                    .drop_duplicates(subset=['Entity', 'Ent_type', 'Sample'], keep='first')

df_dedup2.head(30)

Unnamed: 0,Entity,Ent_type,Sample,Actual,Predicted,Method,Pred2
0,,Location,High Speed Rail (London - West Midlands) Bill ...,N,N,1,False
1,,Location,Police Grant 2017-02-22,N,N,1,False
2,,Organisation,Armed Forces- Historical Cases 2017-02-23,N,N,1,False
3,,Organisation,Unaccompanied Children (Greece and Italy) 201...,N,N,1,False
4,,Organisation,Vauxhall-Opel- Proposed Takeover 2017-02-20,N,N,1,False
5,,Person,Local Government Finance 2017-02-22,N,N,1,False
6,,Person,President Trump- State Visit 2017-02-20,N,N,1,False
7,'s Independent Anti-slavery Commissioner,Organisation,Unaccompanied Children (Greece and Italy) 201...,N,N,2,False
8,", Bloody Friday",Organisation,Armed Forces- Historical Cases 2017-02-23,N,N,3,False
9,", Border Force",Organisation,Jamal al-Harith 2017-02-23,N,N,2,False


In [41]:
print(pd.crosstab(df_dedup2['Predicted'], df_dedup2['Actual']))

accuracy_measures(df_dedup2, "Ensemble - All")

Actual        N     Y
Predicted            
N          3821  1621
Y            49   246

Accuracy measures for  Ensemble - All :
 Coverage =  13.0% 
 Confidence = 83.0% 
 Balanced F1 = 22.5%
[('Ensemble - All', 0.13, 0.83, 0.225)]
