In [635]:
from parse import *
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from itertools import chain
import string
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

table = str.maketrans({key: None for key in string.punctuation+'’'+'“'+'”'})
%matplotlib inline

In [636]:
def remove_punctuation(text):
    return text.translate(table)

### build the graph

In [2]:
G = parseText('data/electionActLabeled.txt', verbose=False)

node: Alberta_Election_Act:{}
node: Alberta_Election_Act--None--0--None--0--Interpretation:{'text': 'Interpretation', 'level': 6}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1:{'text': 'In this Act,', 'level': 8}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1--a:{'text': '“advance poll” means a poll taken in advance of polling day;', 'level': 9}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1--b:{'text': '“by‑election” means an election other than a general election;', 'level': 9}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1--b.1:{'text': '“campaign period” means', 'level': 9}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1--b.1--i:{'text': 'in the case of a general election held in accordance with section 38.1(2), the period commencing on February 1 in the year in which the election is held and ending 2 months after polling day,', 'level': 10}
node: Alberta_Election_Act--None--0--None-

### create the initial training and test splits

In [4]:
interp = set(nx.dfs_tree(G, source='Alberta_Election_Act--None--0--None--0--Interpretation'))
rem = set(nx.dfs_tree(G, source='Alberta_Election_Act').nodes)
print(len(interp))
print(len(rem))
candidates = rem-interp

83
1981


In [5]:
x_train, x_test = tts(list(candidates), test_size=0.10, random_state=42)

In [6]:
print(len(x_train))

1708


In [14]:
df = pd.DataFrame({'text':[G.nodes[x]['text'] for  x in x_train if len(G.nodes[x])>0]})
df['level'] = [G.nodes[x]['level'] for  x in x_train if len(G.nodes[x])>0]
df['node'] = [x for  x in x_train if len(G.nodes[x])>0]

df.to_csv('train.csv')
df.head()


Unnamed: 0,text,level,node
0,The Chief Electoral Officer shall from time to...,8,Alberta_Election_Act--Appointments--1--None--0...
1,Identification documents,6,Alberta_Election_Act--Election_Lists--2--None-...
2,The returning officer shall,8,Alberta_Election_Act--Post‑Polling‑Day_Procedu...
3,if no candidate can be declared elected becaus...,9,Alberta_Election_Act--Post‑Polling‑Day_Procedu...
4,number each objection to a ballot in the poll ...,9,Alberta_Election_Act--Post‑Polling‑Day_Procedu...


In [15]:
len(df)

1707

# evaluation functions

In [236]:
def entities_from_tags(text, tags):
    entities = []
    in_ent =False
    for te, ta in zip(text.split(), tags):
        #print(te, ta)
        try:
            if ta == 'U':
                entities.append(te)
            if ta == 'B':
                entities.append([te])
            if ta == 'I' or ta == 'L':
                entities[-1].append(te)
            if ta == 'L':
                entities[-1] = ' '.join(entities[-1])
        except:
            print(text)
            print(tags)
    return entities

def fix_tags(break_index, tags):
    if break_index:
        return tags[1:]
    return tags

def to_set(entities):
    s = set()
    for ent in entities:
        s.update(ent.split())
    return s

In [237]:
eval_data = pd.read_csv('annotated.csv')
eval_data = eval_data.loc[:, ~eval_data.columns.str.contains('^Unnamed')]
eval_data.rename(columns={'0':'text'}, inplace=True)
#df.Tags.apply(lambda x: x[1:-1].split(','))
eval_data['tags'] = eval_data['tags'].apply(lambda x: list(map(lambda y: str(y).strip().strip("'"), x[1:-1].split(','))))
eval_data['tags'] = eval_data.apply(lambda x: fix_tags(getBreak(x['text']), x['tags']), axis=1)
eval_data['text'] = eval_data['text'].apply(lambda x: remove_punctuation(x[getBreak(x):].strip()))
eval_data['entities'] = eval_data.apply( lambda x: entities_from_tags(x['text'], x['tags']), axis = 1)
eval_data['entity_indexes'] = eval_data.tags.apply(lambda x: [i for i,y in enumerate(x) if y in "BILU"])
eval_data['entity_token_set'] = eval_data.entities.apply(to_set)
eval_data['entity_set'] = eval_data.entities.apply(lambda x: set(x))
eval_data.to_csv('final_evaluation_data.csv')
eval_data.head()

Unnamed: 0,text,tags,entities,entity_indexes,entity_token_set,entity_set
0,a statement of the availability of barrier‑fre...,"[O, U, O, O, O, O, B, L, O, O, O, O, O, B, L, ...","[statement, barrier‑free accessibility, return...","[1, 6, 7, 13, 14, 18, 19, 20]","{officer, polling, statement, accessibility, b...","{advance polling places, statement, returning ..."
1,The following restrictions apply with respect ...,"[O, O, O, O, O, O, O, O, O, O, B, I, L]",[accessible voting equipment],"[10, 11, 12]","{equipment, accessible, voting}",{accessible voting equipment}
2,On the coming into force of subsection 1 the f...,"[O, O, O, O, O, O, B, L, O, O, O]",[subsection 1],"[6, 7]","{subsection, 1}",{subsection 1}
3,are Canadian citizens,"[O, B, L]",[Canadian citizens],"[1, 2]","{Canadian, citizens}",{Canadian citizens}
4,forfeits the person’s right to vote in the ele...,"[O, O, O, O, O, O, O, O, U, O]",[election],[8],{election},{election}


# prepare the data

In [697]:
import spacy

nlp = spacy.load("en_core_web_lg")
nlp.tokenizer.rules = {key: value for key, value in nlp.tokenizer.rules.items() if "cannot" not in key}
def spacyize(text):
    spac = nlp(text)
    return {'lemmas':[token.lemma_ for token in spac], 'pos':[token.pos_ for token in spac], 'stops':[token.is_stop for token in spac], 'tokens':[token.text for token in spac]}


### Evaluation Dataset

In [709]:
eval_data['text']=eval_data.text.apply(remove_punctuation)
eval_data['text']=eval_data.text.apply(lambda y: ' '.join(y.split()))
spac = eval_data.text.apply(spacyize)
eval_data['lemmas'] = [x['lemmas'] for x in spac]
eval_data['pos'] = [x['pos'] for x in spac]
eval_data['stops'] = [x['stops'] for x in spac]

In [710]:
eval_data.head()

Unnamed: 0,text,tags,entities,entity_indexes,entity_token_set,entity_set,lemmas,pos,stops,fp
0,a statement of the availability of barrier‑fre...,"[O, U, O, O, O, O, B, L, O, O, O, O, O, B, L, ...","[statement, barrier‑free accessibility, return...","[1, 6, 7, 13, 14, 18, 19, 20]","{officer, polling, statement, accessibility, b...","{advance polling places, statement, returning ...","[a, statement, of, the, availability, of, barr...","[DET, NOUN, ADP, DET, NOUN, ADP, NUM, NOUN, AD...","[True, False, True, True, False, True, False, ...",False
1,The following restrictions apply with respect ...,"[O, O, O, O, O, O, O, O, O, O, B, I, L]",[accessible voting equipment],"[10, 11, 12]","{equipment, accessible, voting}",{accessible voting equipment},"[the, follow, restriction, apply, with, respec...","[DET, VERB, NOUN, VERB, ADP, NOUN, ADP, DET, N...","[True, False, False, False, True, False, True,...",False
2,On the coming into force of subsection 1 the f...,"[O, O, O, O, O, O, B, L, O, O, O]",[subsection 1],"[6, 7]","{subsection, 1}",{subsection 1},"[on, the, come, into, force, of, subsection, 1...","[ADP, DET, VERB, ADP, NOUN, ADP, NOUN, NUM, DE...","[True, True, False, True, False, True, False, ...",False
3,are Canadian citizens,"[O, B, L]",[Canadian citizens],"[1, 2]","{Canadian, citizens}",{Canadian citizens},"[be, canadian, citizen]","[AUX, ADJ, NOUN]","[True, False, False]",False
4,forfeits the persons right to vote in the elec...,"[O, O, O, O, O, O, O, O, U, O]",[election],[8],{election},{election},"[forfeit, the, person, right, to, vote, in, th...","[VERB, DET, NOUN, ADJ, PART, VERB, ADP, DET, N...","[False, True, False, False, True, False, True,...",False


### Train Data

In [711]:
df.text = df.text.apply(remove_punctuation)
df['text']=df.text.apply(lambda y: ' '.join(y.split()))
spac = df.text.apply(spacyize)
df['lemmas'] = [x['lemmas'] for x in spac]
df['pos'] = [x['pos'] for x in spac]
df['stops'] = [x['stops'] for x in spac]

In [712]:
df.head()

Unnamed: 0,text,level,node,lemmas,pos,stops
0,The Chief Electoral Officer shall from time to...,8,Alberta_Election_Act--Appointments--1--None--0...,"[the, Chief, Electoral, Officer, shall, from, ...","[DET, PROPN, PROPN, PROPN, AUX, ADP, NOUN, ADP...","[True, False, False, False, False, True, False..."
1,Identification documents,6,Alberta_Election_Act--Election_Lists--2--None-...,"[identification, document]","[NOUN, NOUN]","[False, False]"
2,The returning officer shall,8,Alberta_Election_Act--Post‑Polling‑Day_Procedu...,"[the, return, officer, shall]","[DET, VERB, NOUN, AUX]","[True, False, False, False]"
3,if no candidate can be declared elected becaus...,9,Alberta_Election_Act--Post‑Polling‑Day_Procedu...,"[if, no, candidate, can, be, declare, elect, b...","[SCONJ, DET, NOUN, AUX, AUX, VERB, VERB, SCONJ...","[True, True, False, True, True, False, False, ..."
4,number each objection to a ballot in the poll ...,9,Alberta_Election_Act--Post‑Polling‑Day_Procedu...,"[number, each, objection, to, a, ballot, in, t...","[NOUN, DET, NOUN, ADP, DET, NOUN, ADP, DET, NO...","[False, True, False, True, True, False, True, ..."


### Seeds (interpretations)

In [245]:
interpretations = pd.DataFrame({'text':[G.nodes[x]['text'].strip() for  x in interp if len(G.nodes[x])>0]})
interpretations['level'] = [G.nodes[x]['level'] for  x in interp if len(G.nodes[x])>0]
interpretations['node'] = [x for  x in interp if len(G.nodes[x])>0]
interpretations

Unnamed: 0,text,level,node
0,“Special Ballot” means the Special Ballot prov...,9,Alberta_Election_Act--None--0--None--0--Interp...
1,"any facility not referred to in subclause (i),...",10,Alberta_Election_Act--None--0--None--0--Interp...
2,“poll book” means a poll book referred to in s...,9,Alberta_Election_Act--None--0--None--0--Interp...
3,“register” means the register of electors esta...,9,Alberta_Election_Act--None--0--None--0--Interp...
4,“oath” includes an affirmation;,9,Alberta_Election_Act--None--0--None--0--Interp...
...,...,...,...
78,in the case of an election under the Alberta S...,10,Alberta_Election_Act--None--0--None--0--Interp...
79,his or her residence before being incarcerated;,9,Alberta_Election_Act--None--0--None--0--Interp...
80,when a person leaves Alberta with the intentio...,9,Alberta_Election_Act--None--0--None--0--Interp...
81,“election” means an election of a person as a ...,9,Alberta_Election_Act--None--0--None--0--Interp...


In [358]:
import re

seed_finder = re.compile(r'“.+”') #note that this wont work if punctuation is removed

def get_seeds(text):
    m = seed_finder.match(text)
    if m:
        return m.group(0).strip('“”')
interpretations['seed'] = interpretations.text.apply(get_seeds )

### output

In [713]:
#df.to_csv('dataset/train.csv')
#eval_data.to_csv('dataset/test.csv')
#interpretations.to_csv('dataset/interpretations.csv')
df.to_pickle('dataset/train.pickle')
eval_data.to_pickle('dataset/test.pickle')
interpretations.to_pickle('dataset/interpretations.pickle')
nx.write_gpickle(G, 'dataset/graph.pickle')


### reload...

In [639]:
df = pd.read_pickle('dataset/train.pickle')
eval_data = pd.read_pickle('dataset/test.pickle')
interpretations = pd.read_pickle('dataset/interpretations.pickle')
G = nx.read_gpickle('dataset/graph.pickle')

### OOP code objects

In [681]:


module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [1426]:
from functools import lru_cache

def evaluate_entity_sets(entity_sets):
    results = []
    total_correctly_matched = 0
    total_false_positives = 0
    total_false_negatives = 0
    
    for s, v in zip(entity_sets, eval_data.entity_set):
        correctly_matched = len(s.intersection(v))
        false_positive = len(s-v)
        false_negative = len(v-s)
        total_correctly_matched += correctly_matched
        total_false_positives += false_positive
        total_false_negatives += false_negative
        results.append({'correct':correctly_matched, 'false_positive':false_positive, 'false_negative':false_negative})
    precision = total_correctly_matched/(total_correctly_matched+total_false_positives)
    recall = total_correctly_matched/(total_correctly_matched+total_false_negatives)
    f1 = 2*((precision*recall)/(precision+recall))
    return {'precision':precision, 'recall':recall, 'f1':f1, 'results':results}

def evaluate_entity_token_sets(entity_sets):
    results = []
    total_correctly_matched = 0
    total_false_positives = 0
    total_false_negatives = 0
    
    for s, v in zip(entity_sets, eval_data.entity_token_set):
        correctly_matched = len(s.intersection(v))
        false_positive = len(s-v)
        false_negative = len(v-s)
        total_correctly_matched += correctly_matched
        total_false_positives += false_positive
        total_false_negatives += false_negative
        results.append({'correct':correctly_matched, 'false_positive':false_positive, 'false_negative':false_negative})
    precision = total_correctly_matched/(total_correctly_matched+total_false_positives)
    recall = total_correctly_matched/(total_correctly_matched+total_false_negatives)
    f1 = 2*((precision*recall)/(precision+recall))
    return {'precision':precision, 'recall':recall, 'f1':f1, 'results':results}

class Span():
    def __init__(self, start, end):
        self.start = start
        self.end = end
        self.len = end-start
    
    def __len__(self):
        return self.len
    
    def overlap(self, other):
        return max(self.start, other.start) <= min(self.end, other.end)
    
    def overlaps(self,other):
        return self.overlap(other)
        
    def __repr__(self):
        return f'start: {self.start} end: {self.end}'
    
    def __str__(self):
        return self.__repr__()
    
    def __eq__(self, other):
        return self.start == other.start and self.end == other.end
        
    def __ne__(self, other):
        return (not self.__eq__(other))
    
    def __hash__(self):
        return hash(self.__repr__())

def char_spans_to_token_span(char_spans, text):
        tokens=tokenize(text)
        
        token_spans = []
        #print(char_spans)

        for span in char_spans:
            this_span_start = -1
            this_span_end = -1
            start = 0
            end = 0
            for i, t in enumerate(tokens):
                end += len(t)
                t_span = Span(start, end)
                start = end
                overlaps =span.overlap(t_span)
                
                if overlaps and this_span_start == -1 and t != ' ':
                    this_span_start = i
                elif overlaps:
                    this_span_end = i
                elif this_span_start != -1 and this_span_end != -1: 
                    token_spans.append(Span(this_span_start,this_span_end))
                    break
            else:
                if this_span_start == -1 or this_span_end == -1:
                    return []
                token_spans.append(Span(this_span_start,len(tokens)))

        return token_spans
            
class ReSearch():
    def __init__(self, text_re, pos_re):
        self.text_re = text_re
        self.pos_re = pos_re
        
    def match_text(self, text):
        spans = []
        for m in self.text_re.finditer(text):
            s = Span(m.start(),m.end())
            if any([s.overlap(x) for x in spans]):
                continue
            spans.append(s)
        return char_spans_to_token_span(spans, text)
    
    def match_pos(self, poses):
        if isinstance(poses, list):
            poses = ' '.join(str(x) for x in poses)
        return char_spans_to_token_span([Span(m.start(),m.end()) for m in self.pos_re.finditer(poses)], poses)
    
    def match_both(self, text, poses):
        
        text_spans = set(self.match_text(text))
        pos_spans = set(self.match_pos(poses))
        return set(text_spans).intersection(set(pos_spans))

        
class Seed(ReSearch):
    def __init__(self, text):
        #handle text
        self.text = remove_punctuation(text.lower())
        spac = spacyize(self.text)
        self.lemmas = spac['lemmas']
        self.pos = spac['pos']
        self.stops = spac['stops']
        self.embedding = embed([self.text])
        
        #handle pos tags
        self.tags = str(" ".join(x for x in self.pos))
        super().__init__(
                       re.compile(f"\\b{self.text}\\b", re.I), 
                       re.compile(f"\\b{self.tags}\\b", re.I)
                      )

    def __repr__(self):
        return f'Text: "{self.text}"' + '\n' + f'Pos: "{self.tags}"'
    
    def __str__(self):
        return self.__repr__()
    
    def fullMatch(self, text, tags):
        return self.match_both(text, tags)
@lru_cache()   
def get_pos_tags(text):
    spac = spacyize(text)
    return " ".join(str(x) for x in spac['pos'])
@lru_cache()
def tokenize(text):
    return re.split('(\W)', text)

class SeedList():
    def __init__(self, startList):
        self.seeds = [Seed(text) for text in startList]
        self.sort()
        
    def add(self, text, sort = True):
        self.seeds.append(Seed(text))
        if sort:
            self.sort()
            
    def sort(self):
        self.seeds.sort(key=lambda x: len(x.text), reverse = True)
        
    def __len__(self):
        return len(self.seeds)
    
    def get_spans_from_seeds(self, text):
        spans = set()
        tokens = tokenize(text)
        for i, seed in enumerate(self.seeds):
            matches = seed.match_text(text)
            for match in matches:
                for span in spans:
                    if span.overlaps(match):
                        break
                else:
                    spans.add(match)
        return spans
                    
    def get_entity_sets_from_seeds(self, text, whole=True, both=False):
        entities = set()
        spans = set()
        tags = get_pos_tags(text)
        tokens = tokenize(text)
        for i, seed in enumerate(self.seeds):
            if both:
                matches = seed.fullMatch(text, tags)
            else:
                matches = seed.match_text(text)
            for match in matches:
                for span in spans:
                    if span.overlaps(match):
                        break
                else:
                    spans.add(match)
                    if whole:
                        entities.add(''.join(tokens[match.start:match.end]))
                    else:
                        entities.update(t for t in tokens[match.start:match.end] if t != ' ')
                    
                    
        return entities
    
    def score(self, text):
        emb = embed([text])
        sc = (0.0, '')
        for seed in self.seeds:
            this_score = np.inner(seed.embedding, emb)
            if this_score>sc[0]:
                sc = (float(this_score), seed.text)
        return sc

class Pattern(ReSearch):
    def __init__(self, prior_pos, entity_pos, post_pos, entity_seed):
        self.prior_pos = prior_pos
        self.entity_pos = entity_pos
        self.post_pos = post_pos
        self.pattern = prior_pos + entity_pos + post_pos
        self.prior_len = len(prior_pos)*2-1 #account for the spaces....
        self.post_len = len(post_pos)*2-1 #account for the spaces....
        self.entity_len = len(entity_pos)
        self.len = len(self.pattern)
        self.entity_seed = entity_seed
        self.pattern_text = ' '.join(x for x in self.pattern)
        super().__init__(
                       re.compile(f"\\b{self.entity_seed}\\b", re.I), 
                       re.compile(f"\\b{self.pattern_text}\\b", re.I)
                      )
    
    def __repr__(self):
        return ', '.join(self.pattern)
    
    def __str__(self):
        return self.__repr__()
        
    def __len__(self):
        return self.len
    
    def __eq__(self, other):
        if isinstance(other, Pattern):
            return self.__repr__() == other.__repr__()
        else:
            return False
    
    def __ne__(self, other):
        return (not self.__eq__(other))
    
    def __hash__(self):
        return hash(self.__repr__())
    
    def get_entities(self, text, tag_string, both=False):
        if isinstance(text, str):
            tokens = tokenize(text)
        if isinstance(text, list):
            tokens = text
            text = ''.join(tokens)
            
        if not isinstance(tag_string, str):
            tag_string = ' '.join(tag_string)
        
        entity_matches = set()
        
        if both:
            pos_spans = self.match_both(text, tag_string)
        else:
            pos_spans = self.match_pos(tag_string)
        for span in pos_spans:
            entity_matches.add(''.join(tokens[span.start+self.prior_len:span.end-self.post_len]).strip())
        return entity_matches
        
        
class Patterns():
    def __init__(self, startList, evaluation_data):
        self.seedList = SeedList(startList)
        self.eval_data = eval_data
        self.seed_set = startList
        
    def print_seeds(self):
        for x in self.seedList.seeds:
            print(x)
            
    def __len__(self):
        return len(self.seedList)
    
    def check_seeds(self, text, whole=True):
        return self.seedList.get_entity_sets_from_seeds(text, whole=whole)

    def add_seed(self, text):
        self.seedList.add(text)
        self.seed_set.add(text)
    
    def score(self, text):
        return self.seedList.score(text)

    def evaluate(self):
        initial_seed_set = self.eval_data.text.apply(self.check_seeds, whole=True)
        r = evaluate_entity_sets(initial_seed_set)
        print('Results for whole entitiy objects')
        print(f"Entity Set Precision: {r['precision']}")
        print(f"Entity Set Recall: {r['recall']}")
        print(f"Entity Set F1: {r['f1']}")
        print()

        print('Results for partial entitiy objects')
        initial_token_set = self.eval_data.text.apply(self.check_seeds, whole=False)
        rtok = evaluate_entity_token_sets(initial_token_set)
        print(f"Entity Token Set Precision: {rtok['precision']}")
        print(f"Entity Token Set Recall: {rtok['recall']}")
        print(f"Entity Token Set F1: {rtok['f1']}")
        
        return r, rtok
    
    def get_spans(self, text):
        return self.seedList.get_spans_from_seeds(text)
        
    def build_patterns(self, train_data, pre_window=4, post_window=4):
        self.patterns = set()
        for a, row in train_data.iterrows():
            spans = self.get_spans(row.text)
            tokens = tokenize(row.text)
            pos = row.pos
            for span in spans:
                pos_start = span.start-span.start//2
                pos_end = span.end-span.end//2
                for a in range(2,pre_window + 1):
                    for b in range(2,post_window +1):
                        p = Pattern(pos[max(0, pos_start-a):pos_start],pos[pos_start:pos_end],pos[pos_end:pos_end+b], ''.join(tokens[span.start:span.end]))
                        print('poses', pos)
                        print('tokens', tokens)
                        print(f'a: {a} b: {b} st: {span.start} en: {span.end}')
                        print('input', pos[max(0, pos_start-a):pos_start],pos[pos_start:pos_end],pos[pos_end:pos_end+b], ''.join(tokens[span.start:span.end]))
                        self.patterns.add(p)
                        

   # def evaluate_patterns(self, train_data):
        

                    

In [1427]:
a = Seed('this is a test')
a

Text: "this is a test"
Pos: "PRON AUX DET NOUN"

In [1428]:
t_text = 'something about this is a test for something'
spac = spacyize(t_text)
a.match_both(t_text, spac['pos'])

{start: 4 end: 11}

In [1429]:
seeds = Patterns(set((x for x in interpretations.seed if x)), eval_data)
#seeds.print_seeds()

In [1430]:
get_pos_tags('something about this is a test for something')

'PRON ADP PRON AUX DET NOUN ADP PRON'

In [1431]:
testList = SeedList(['this is a test', 'test', 'candle'])
print(testList.get_entity_sets_from_seeds('something about this is a test for something'))
print(testList.get_entity_sets_from_seeds('something about this a test for something like a candle'))
print(testList.get_entity_sets_from_seeds('candle something about this a test for something like a'))

{'this is a test'}
{'candle', 'test'}
{'candle', 'test'}


In [1432]:
testList = SeedList(['this is a test', 'test', 'candle'])
print(testList.get_entity_sets_from_seeds('something about this is a test for something', both=True))
print(testList.get_entity_sets_from_seeds('something about this a test for something like a candle', both=True))
print(testList.get_entity_sets_from_seeds('candle something about this a test for something like a', both=True))

{'this is a test'}
{'candle', 'test'}
{'test'}


In [1433]:
p = Pattern(['PRON','ADP'], "PRON AUX DET NOUN".split(), ['ADP', 'PRON'],'this is a test' )
p.get_entities( 'something about this is a test for something',get_pos_tags('something about this is a test for something'))

{'this is a test'}

# initial results on seed dict from interpretation entities :)

In [1434]:
e = seeds.evaluate()

Results for whole entitiy objects
Entity Set Precision: 0.7113821138211383
Entity Set Recall: 0.3141831238779174
Entity Set F1: 0.43586550435865506

Results for partial entitiy objects
Entity Token Set Precision: 0.9484536082474226
Entity Token Set Recall: 0.36007827788649704
Entity Token Set F1: 0.5219858156028369


# Spacy Evaluation

In [1435]:
part_results = []
whole_results = []

for text in eval_data.text:
    wholes = set()
    parts = set()
    
    doc = nlp(text)
    for ent in doc.ents:
        wholes.add(ent.text)
        parts.update(ent.text.split())
    part_results.append(parts)
    whole_results.append(wholes)
#print(part_results)
#print()
#print(whole_results)
r = evaluate_entity_sets(whole_results)
print(f"Entity Set Precision: {r['precision']}")
print(f"Entity Set Recall: {r['recall']}")
print(f"Entity Set F1: {r['f1']}")
print()
rtok = evaluate_entity_token_sets(part_results)
print(f"Entity Token Set Precision: {rtok['precision']}")
print(f"Entity Token Set Recall: {rtok['recall']}")
print(f"Entity Token Set F1: {rtok['f1']}")

Entity Set Precision: 0.2824427480916031
Entity Set Recall: 0.06642728904847396
Entity Set F1: 0.1075581395348837

Entity Token Set Precision: 0.7755905511811023
Entity Token Set Recall: 0.19275929549902152
Entity Token Set F1: 0.30877742946708464


# seed expansion

In [1436]:
seeds.score('Election commish')        

(0.7819006443023682, 'election commissioner')

### set up initial patterns on the training data

In [1437]:
seeds.build_patterns(df)

poses ['DET', 'VERB', 'NOUN', 'AUX']
tokens ['The', ' ', 'returning', ' ', 'officer', ' ', 'shall']
a: 2 b: 2 st: 2 en: 5
input ['DET'] ['VERB', 'NOUN'] ['AUX'] returning officer
poses ['DET', 'VERB', 'NOUN', 'AUX']
tokens ['The', ' ', 'returning', ' ', 'officer', ' ', 'shall']
a: 2 b: 3 st: 2 en: 5
input ['DET'] ['VERB', 'NOUN'] ['AUX'] returning officer
poses ['DET', 'VERB', 'NOUN', 'AUX']
tokens ['The', ' ', 'returning', ' ', 'officer', ' ', 'shall']
a: 2 b: 4 st: 2 en: 5
input ['DET'] ['VERB', 'NOUN'] ['AUX'] returning officer
poses ['DET', 'VERB', 'NOUN', 'AUX']
tokens ['The', ' ', 'returning', ' ', 'officer', ' ', 'shall']
a: 3 b: 2 st: 2 en: 5
input ['DET'] ['VERB', 'NOUN'] ['AUX'] returning officer
poses ['DET', 'VERB', 'NOUN', 'AUX']
tokens ['The', ' ', 'returning', ' ', 'officer', ' ', 'shall']
a: 3 b: 3 st: 2 en: 5
input ['DET'] ['VERB', 'NOUN'] ['AUX'] returning officer
poses ['DET', 'VERB', 'NOUN', 'AUX']
tokens ['The', ' ', 'returning', ' ', 'officer', ' ', 'shall']
a: 3 

poses ['ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'DET', 'NOUN', 'VERB', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'DET', 'NOUN', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'NOUN', 'NOUN', 'ADP', 'DET', 'NOUN', 'VERB', 'ADP', 'DET', 'VERB', 'NOUN', 'CCONJ', 'ADV', 'ADP', 'DET', 'NOUN', 'ADP', 'PRON', 'ADP', 'DET', 'NOUN', 'CCONJ', 'PRON', 'ADJ', 'NOUN', 'CCONJ', 'NOUN', 'PRON', 'VERB', 'AUX', 'VERB', 'DET', 'NOUN', 'NOUN', 'CCONJ', 'VERB', 'PART', 'VERB', 'DET', 'NOUN', 'CCONJ', 'NOUN', 'NUM', 'CCONJ', 'NUM', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN']
tokens ['After', ' ', 'the', ' ', 'closing', ' ', 'of', ' ', 'the', ' ', 'polling', ' ', 'places', ' ', 'on', ' ', 'polling', ' ', 'day', ' ', 'the', ' ', 'deputy', ' ', 'returning', ' ', 'officer', ' ', 'of', ' ', 'each', ' ', 'mobile', ' ', 'poll', ' ', 'and', ' ', 'the', ' ', 'deputy', ' ', 'returning', ' ', 'officers', ' ', 'poll', ' ', 'clerk', ' ', 'shall', ' ', '

poses ['DET', 'NOUN', 'NOUN', 'NOUN', 'ADJ', 'NOUN', 'CCONJ', 'NOUN', 'PRON', 'VERB', 'PRON', 'CCONJ', 'PRON', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'ADP', 'PRON', 'DET', 'NOUN', 'ADV', 'VERB', 'AUX', 'AUX', 'VERB', 'ADP', 'DET', 'NOUN', 'ADJ', 'NOUN', 'CCONJ', 'NOUN', 'SCONJ', 'DET', 'NOUN', 'VERB', 'PART', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'VERB', 'NOUN', 'CCONJ', 'NOUN', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'NOUN']
tokens ['An', ' ', 'election', ' ', 'officer', ' ', 'candidate', ' ', 'official', ' ', 'agent', ' ', 'or', ' ', 'scrutineer', ' ', 'who', ' ', 'presents', ' ', 'himself', ' ', 'or', ' ', 'herself', ' ', 'for', ' ', 'the', ' ', 'purpose', ' ', 'of', ' ', 'voting', ' ', 'at', ' ', 'the', ' ', 'polling', ' ', 'subdivision', ' ', 'in', ' ', 'which', ' ', 'that', ' ', 'person', ' ', 'ordinarily', ' ', 'resides', ' ', 'may', ' ', 'be', ' ', 'required', ' ', 'by', ' ', 'a', ' ', 'candidate', ' ', 'official', ' ', 'agent', ' ', 'or', ' ', 

poses ['VERB', 'PART', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'VERB', 'DET', 'NOUN', 'DET', 'NOUN', 'AUX', 'VERB', 'ADP', 'AUX', 'VERB', 'ADP', 'CCONJ', 'AUX', 'ADJ', 'PART', 'VERB', 'ADP', 'CCONJ']
tokens ['attempt', ' ', 'to', ' ', 'obtain', ' ', 'any', ' ', 'information', ' ', 'at', ' ', 'a', ' ', 'polling', ' ', 'place', ' ', 'regarding', ' ', 'which', ' ', 'candidate', ' ', 'a', ' ', 'voter', ' ', 'has', ' ', 'voted', ' ', 'for', ' ', 'is', ' ', 'voting', ' ', 'for', ' ', 'or', ' ', 'is', ' ', 'about', ' ', 'to', ' ', 'vote', ' ', 'for', ' ', 'or']
a: 2 b: 4 st: 22 en: 23
input ['VERB', 'DET'] ['NOUN'] ['DET', 'NOUN', 'AUX', 'VERB'] candidate
poses ['VERB', 'PART', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'VERB', 'DET', 'NOUN', 'DET', 'NOUN', 'AUX', 'VERB', 'ADP', 'AUX', 'VERB', 'ADP', 'CCONJ', 'AUX', 'ADJ', 'PART', 'VERB', 'ADP', 'CCONJ']
tokens ['attempt', ' ', 'to', ' ', 'obtain', ' ', 'any', ' ', 'information', ' ', 'at', ' ', 'a', ' ', 'polling', ' '

input ['DET', 'NOUN', 'ADP', 'DET'] ['ADJ', 'NOUN'] ['VERB', 'ADJ', 'ADP'] general election
poses ['ADP', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'VERB', 'ADJ', 'ADP', 'ADP', 'NOUN', 'ADP', 'NOUN', 'NUM', 'DET', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'ADP', 'NOUN', 'NOUN']
tokens ['in', ' ', 'the', ' ', 'case', ' ', 'of', ' ', 'a', ' ', 'general', ' ', 'election', ' ', 'held', ' ', 'other', ' ', 'than', ' ', 'in', ' ', 'accordance', ' ', 'with', ' ', 'section', ' ', '3812', ' ', 'the', ' ', 'period', ' ', 'commencing', ' ', 'with', ' ', 'the', ' ', 'issue', ' ', 'of', ' ', 'a', ' ', 'writ', ' ', 'for', ' ', 'the', ' ', 'general', ' ', 'election', ' ', 'and', ' ', 'ending', ' ', 'on', ' ', 'nomination', ' ', 'day']
a: 4 b: 4 st: 10 en: 13
input ['DET', 'NOUN', 'ADP', 'DET'] ['ADJ', 'NOUN'] ['VERB', 'ADJ', 'ADP', 'ADP'] general election
poses ['ADP', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'VERB', 'ADJ', 'ADP', 'A

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



a: 2 b: 4 st: 38 en: 41
input ['DET', 'NOUN'] ['VERB', 'NOUN'] ['AUX', 'ADV', 'VERB'] returning officer
poses ['DET', 'ADJ', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'NOUN', 'NOUN', 'PRON', 'DET', 'NOUN', 'VERB', 'NOUN', 'AUX', 'ADV', 'VERB']
tokens ['all', ' ', 'other', ' ', 'documents', ' ', 'relating', ' ', 'to', ' ', 'the', ' ', 'operation', ' ', 'of', ' ', 'the', ' ', 'poll', ' ', 'are', ' ', 'placed', ' ', 'in', ' ', 'the', ' ', 'ballot', ' ', 'box', ' ', 'that', ' ', 'the', ' ', 'deputy', ' ', 'returning', ' ', 'officer', ' ', 'shall', ' ', 'immediately', ' ', 'seal']
a: 3 b: 2 st: 38 en: 41
input ['PRON', 'DET', 'NOUN'] ['VERB', 'NOUN'] ['AUX', 'ADV'] returning officer
poses ['DET', 'ADJ', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'NOUN', 'NOUN', 'PRON', 'DET', 'NOUN', 'VERB', 'NOUN', 'AUX', 'ADV', 'VERB']
tokens ['all', ' ', 'other', ' ', 'documents', ' ', 'relating', ' ', 'to', ' ',

poses ['DET', 'NOUN', 'ADP', 'NOUN', 'PRON', 'DET', 'PROPN', 'PROPN', 'VERB', 'ADJ']
tokens ['any', ' ', 'recommendations', ' ', 'for', ' ', 'improvement', ' ', 'that', ' ', 'the', ' ', 'Election', ' ', 'Commissioner', ' ', 'considers', ' ', 'appropriate']
a: 2 b: 2 st: 12 en: 15
input ['PRON', 'DET'] ['PROPN', 'PROPN'] ['VERB', 'ADJ'] Election Commissioner
poses ['DET', 'NOUN', 'ADP', 'NOUN', 'PRON', 'DET', 'PROPN', 'PROPN', 'VERB', 'ADJ']
tokens ['any', ' ', 'recommendations', ' ', 'for', ' ', 'improvement', ' ', 'that', ' ', 'the', ' ', 'Election', ' ', 'Commissioner', ' ', 'considers', ' ', 'appropriate']
a: 2 b: 3 st: 12 en: 15
input ['PRON', 'DET'] ['PROPN', 'PROPN'] ['VERB', 'ADJ'] Election Commissioner
poses ['DET', 'NOUN', 'ADP', 'NOUN', 'PRON', 'DET', 'PROPN', 'PROPN', 'VERB', 'ADJ']
tokens ['any', ' ', 'recommendations', ' ', 'for', ' ', 'improvement', ' ', 'that', ' ', 'the', ' ', 'Election', ' ', 'Commissioner', ' ', 'considers', ' ', 'appropriate']
a: 2 b: 4 st: 12 en: 15

poses ['AUX', 'AUX', 'AUX', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'SCONJ', 'DET', 'PROPN', 'PROPN', 'AUX', 'PART', 'VERB', 'PART', 'VERB', 'DET', 'NOUN']
tokens ['would', ' ', 'have', ' ', 'been', ' ', 'the', ' ', 'subject', ' ', 'of', ' ', 'an', ' ', 'investigation', ' ', 'if', ' ', 'the', ' ', 'Election', ' ', 'Commissioner', ' ', 'had', ' ', 'not', ' ', 'refused', ' ', 'to', ' ', 'conduct', ' ', 'an', ' ', 'investigation']
a: 2 b: 2 st: 20 en: 23
input ['SCONJ', 'DET'] ['PROPN', 'PROPN'] ['AUX', 'PART'] Election Commissioner
poses ['AUX', 'AUX', 'AUX', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'SCONJ', 'DET', 'PROPN', 'PROPN', 'AUX', 'PART', 'VERB', 'PART', 'VERB', 'DET', 'NOUN']
tokens ['would', ' ', 'have', ' ', 'been', ' ', 'the', ' ', 'subject', ' ', 'of', ' ', 'an', ' ', 'investigation', ' ', 'if', ' ', 'the', ' ', 'Election', ' ', 'Commissioner', ' ', 'had', ' ', 'not', ' ', 'refused', ' ', 'to', ' ', 'conduct', ' ', 'an', ' ', 'investigation']
a: 2 b: 3 st: 20 en: 23
input ['SCONJ'

poses ['DET', 'VERB', 'NOUN', 'AUX', 'SCONJ', 'DET', 'VERB', 'NOUN', 'VERB', 'PRON', 'ADJ', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN']
tokens ['A', ' ', 'returning', ' ', 'officer', ' ', 'may', ' ', 'if', ' ', 'the', ' ', 'returning', ' ', 'officer', ' ', 'considers', ' ', 'it', ' ', 'necessary', ' ', 'appoint', ' ', 'in', ' ', 'the', ' ', 'prescribed', ' ', 'form', ' ', 'a', ' ', 'qualified', ' ', 'person', ' ', 'as', ' ', 'an', ' ', 'information', ' ', 'officer', ' ', 'for', ' ', 'each', ' ', 'polling', ' ', 'place']
a: 2 b: 2 st: 2 en: 5
input ['DET'] ['VERB', 'NOUN'] ['AUX', 'SCONJ'] returning officer
poses ['DET', 'VERB', 'NOUN', 'AUX', 'SCONJ', 'DET', 'VERB', 'NOUN', 'VERB', 'PRON', 'ADJ', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN']
tokens ['A', ' ', 'returning', ' ', 'officer', ' ', 'may', ' ', 'if', ' ', 'the', ' ', 'returning', 

input ['NOUN', 'VERB', 'ADP', 'DET'] ['PROPN', 'PROPN'] ['CCONJ', 'ADP', 'DET', 'NOUN'] Election Commissioner
poses ['DET', 'NOUN', 'VERB', 'ADP', 'DET', 'PROPN', 'PROPN', 'CCONJ', 'ADP', 'DET', 'NOUN', 'VERB', 'ADP', 'CCONJ', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'ADP', 'PRON', 'VERB', 'CCONJ', 'VERB', 'PART', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'CCONJ', 'NOUN', 'CCONJ', 'DET', 'VERB', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'DET', 'PROPN', 'DET', 'PROPN', 'PROPN', 'CCONJ', 'PROPN', 'PROPN', 'PROPN', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'CCONJ', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN']
tokens ['No', ' ', 'proceedings', ' ', 'lie', ' ', 'against', ' ', 'the', ' ', 'Election', ' ', 'Commissioner', ' ', 'or', ' ', 'against', ' ', 'a', ' ', 'person', ' ', 'acting', ' ', 'for', ' ', 'or', ' ', 'under', ' ', 'the', ' ', 'direction', ' ', 'of', ' ', 'the', ' ', 'Election', ' ', 'Commissioner', ' ', 'for', '

poses ['SCONJ', 'DET', 'NOUN', 'DET', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'CCONJ', 'DET', 'NOUN', 'AUX', 'PART', 'AUX', 'VERB', 'ADP', 'DET', 'PROPN', 'SCONJ', 'VERB', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'NUM', 'VERB', 'NOUN', 'AUX', 'AUX', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'PROPN', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'CCONJ', 'NOUN']
tokens ['If', ' ', 'a', ' ', 'by', '‑', 'election', ' ', 'an', ' ', 'election', ' ', 'under', ' ', 'the', ' ', 'Alberta', ' ', 'Senate', ' ', 'Election', ' ', 'Act', ' ', 'or', ' ', 'a', ' ', 'plebiscite', ' ', 'is', ' ', 'to', ' ', 'be', ' ', 'conducted', ' ', 'under', ' ', 'this', ' ', 'Act', ' ', 'before', ' ', 'returning', ' ', 'officers', ' ', 'are', ' ', 'appointed', ' ', 'under', ' ', 'subsection', ' ', '1', ' ', 'returning', ' ', 'officers', ' ', 'may', ' ', 'be', ' ', 'appointed', ' ', 'for', ' ', 'the', ' ', 'purpose', ' ', 'of', ' ', 'the', ' ', 'by', '‑', 'election', ' ', 'election', ' ',

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



poses ['SCONJ', 'DET', 'VERB', 'NOUN', 'AUX', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'VERB', 'ADP', 'ADP', 'NOUN', 'NUM', 'PART', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'CCONJ', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'NOUN', 'DET', 'VERB', 'NOUN', 'AUX', 'VERB', 'DET', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'SCONJ', 'AUX', 'AUX', 'VERB']
tokens ['If', ' ', 'a', ' ', 'returning', ' ', 'officer', ' ', 'is', ' ', 'unable', ' ', 'for', ' ', 'any', ' ', 'reason', ' ', 'other', ' ', 'than', ' ', 'the', ' ', 'one', ' ', 'referred', ' ', 'to', ' ', 'in', ' ', 'subsection', ' ', '1', ' ', 'to', ' ', 'announce', ' ', 'the', ' ', 'results', ' ', 'of', ' ', 'the', ' ', 'official', ' ', 'count', ' ', 'at', ' ', 'the', ' ', 'date', ' ', 'and', ' ', 'time', ' ', 'stated', ' ', 'in', ' ', 'the', ' ', 'election', ' ', 'proclamation', ' ', 'the', ' ', 'returning', ' ', 'officer', ' ', 'shall', ' ', 'adjourn', ' ', 'the', ' ', 'proceedings', ' ', 'from'

input ['NOUN', 'CCONJ', 'VERB', 'ADP'] ['NOUN', 'NOUN'] ['VERB', 'DET'] polling day
poses ['DET', 'NOUN', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'DET', 'VERB', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'AUX', 'ADP', 'DET', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'VERB', 'NOUN', 'NOUN', 'CCONJ', 'VERB', 'ADP', 'NOUN', 'NOUN', 'VERB', 'DET', 'NOUN', 'NOUN', 'VERB', 'ADP', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN']
tokens ['An', ' ', 'elector', ' ', 'resident', ' ', 'in', ' ', 'an', ' ', 'electoral', ' ', 'division', ' ', 'on', ' ', 'application', ' ', 'to', ' ', 'the', ' ', 'returning', ' ', 'officer', ' ', 'of', ' ', 'that', ' ', 'electoral', ' ', 'division', ' ', 'may', ' ', 'during', ' ', 'the', ' ', 'period', ' ', 'commencing', ' ', 'on', ' ', 'the', ' ', 'day', ' ', 'following', ' ', 'nomination', ' ', 'day', ' ', 'and', ' ', 'ending', ' ', 'on', ' ', 'polling', ' ', 'day', ' ', 'inspect', ' ', 'the', ' ', 'nomination', ' ', 'papers', ' ', 'filed', ' ', 'by', ' ', 'candidates

poses ['DET', 'PROPN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PROPN', 'AUX', 'VERB', 'DET', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'NUM', 'ADP', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'DET', 'PROPN', 'ADP', 'DET', 'PROPN', 'PROPN', 'ADP', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'PRON', 'DET', 'NOUN', 'VERB', 'ADP', 'NOUN', 'ADP', 'SCONJ', 'DET', 'PROPN', 'ADP', 'DET', 'PROPN', 'PROPN', 'AUX', 'VERB', 'ADP', 'DET', 'NOUN', 'NOUN']
tokens ['The', ' ', 'Office', ' ', 'of', ' ', 'the', ' ', 'Chief', ' ', 'Electoral', ' ', 'Officer', ' ', 'shall', ' ', 'disclose', ' ', 'a', ' ', 'statement', ' ', 'of', ' ', 'remuneration', ' ', 'under', ' ', 'section', ' ', '3', ' ', 'of', ' ', 'the', ' ', 'Public', ' ', 'Sector', ' ', 'Compensation', ' ', 'Transparency', ' ', 'Act', ' ', 'for', ' ', 'the', ' ', 'Office', ' ', 'of', ' ', 'the', ' ', 'Election', ' ', 'Commissioner', ' ', 'in', ' ', 'relation', ' ', 'to', ' ', 'the', ' ', 'entire', ' ', 'calendar', ' ', 'year', ' ', '

poses ['ADJ', 'ADP', 'NOUN', 'NUM', 'DET', 'NOUN', 'AUX', 'ADJ', 'PART', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'ADJ', 'SCONJ']
tokens ['Subject', ' ', 'to', ' ', 'section', ' ', '45', ' ', 'an', ' ', 'elector', ' ', 'is', ' ', 'eligible', ' ', 'to', ' ', 'vote', ' ', 'for', ' ', 'a', ' ', 'candidate', ' ', 'in', ' ', 'the', ' ', 'electoral', ' ', 'division', ' ', 'where', ' ', 'the', ' ', 'elector', ' ', 'is', ' ', 'ordinarily', ' ', 'resident', ' ', 'if']
a: 2 b: 2 st: 30 en: 33
input ['ADP', 'DET'] ['ADJ', 'NOUN'] ['SCONJ', 'DET'] electoral division
poses ['ADJ', 'ADP', 'NOUN', 'NUM', 'DET', 'NOUN', 'AUX', 'ADJ', 'PART', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'ADJ', 'SCONJ']
tokens ['Subject', ' ', 'to', ' ', 'section', ' ', '45', ' ', 'an', ' ', 'elector', ' ', 'is', ' ', 'eligible', ' ', 'to', ' ', 'vote', ' ', 'for', ' ', 'a', ' ', 'candidate', ' ', 'in', ' ', 'the'

poses ['VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'CCONJ', 'ADP', 'DET', 'NOUN', 'DET', 'PROPN', 'AUX', 'ADP', 'NOUN', 'NOUN', 'DET', 'NOUN', 'PART', 'VERB', 'DET', 'NOUN', 'VERB', 'ADP', 'CCONJ', 'ADJ', 'ADP', 'DET', 'NOUN']
tokens ['serving', ' ', 'the', ' ', 'petitioner', ' ', 'with', ' ', 'a', ' ', 'notice', ' ', 'of', ' ', 'withdrawal', ' ', 'and', ' ', 'in', ' ', 'that', ' ', 'case', ' ', 'the', ' ', 'Court', ' ', 'shall', ' ', 'on', ' ', 'application', ' ', 'order', ' ', 'the', ' ', 'respondent', ' ', 'to', ' ', 'pay', ' ', 'the', ' ', 'petitioners', ' ', 'costs', ' ', 'of', ' ', 'and', ' ', 'incidental', ' ', 'to', ' ', 'that', ' ', 'statement']
a: 2 b: 2 st: 38 en: 39
input ['NOUN', 'DET'] ['NOUN'] ['PART', 'VERB'] respondent
poses ['VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'CCONJ', 'ADP', 'DET', 'NOUN', 'DET', 'PROPN', 'AUX', 'ADP', 'NOUN', 'NOUN', 'DET', 'NOUN', 'PART', 'VERB', 'DET', 'NOUN', 'VERB', 'ADP', 'CCONJ', 'ADJ', 'ADP', 'DET', 'NO

poses ['SCONJ', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'AUX', 'DET', 'ADJ', 'ADP', 'PRON', 'ADP', 'DET', 'NOUN', 'ADV', 'VERB', 'ADP', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'NOUN', 'NUM']
tokens ['whether', ' ', 'the', ' ', 'name', ' ', 'on', ' ', 'the', ' ', 'certificate', ' ', 'envelope', ' ', 'is', ' ', 'the', ' ', 'same', ' ', 'as', ' ', 'that', ' ', 'of', ' ', 'a', ' ', 'person', ' ', 'already', ' ', 'entered', ' ', 'in', ' ', 'the', ' ', 'Special', ' ', 'Ballot', ' ', 'Poll', ' ', 'Book', ' ', 'under', ' ', 'section', ' ', '116']
a: 2 b: 2 st: 42 en: 45
input ['PROPN', 'PROPN'] ['PROPN', 'PROPN'] ['ADP', 'NOUN'] Poll Book
poses ['SCONJ', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'AUX', 'DET', 'ADJ', 'ADP', 'PRON', 'ADP', 'DET', 'NOUN', 'ADV', 'VERB', 'ADP', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'NOUN', 'NUM']
tokens ['whether', ' ', 'the', ' ', 'name', ' ', 'on', ' ', 'the', ' ', 'certificate', ' ', 'envelope', ' ', 'is', ' ', 'the', ' ', 'same', 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [1438]:
print(len(seeds.patterns))

10163


In [1439]:
iter(seeds.patterns)

<set_iterator at 0x7f43c596b780>

In [None]:
from tqdm import tqdm
import math 

for pattern in tqdm(seeds.patterns):
    pattern_entities = set()
    for _, row in df.iterrows():
        pattern_entities.update(pattern.get_entities(row.text, row.pos))
    pattern.positives = seeds.seed_set.intersection(pattern_entities)
    pattern.negatives = pattern_entities-seeds.seed_set
    pattern.scores = sum([1-seeds.score(x)[0] for x in negatives])
    pattern.overall = len(pattern.positives)/(len(pattern.negatives)+pattern.scores)*math.log(len(positives))
    #print(f"Positives {len(positives)} Negatives {len(negatives)}")
ranked = sorted(list(seeds.patterns), reverse=True, key = lambda x: x.overall)

  3%|▎         | 287/10163 [00:21<12:16, 13.41it/s]