In [28]:
from parse import *
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from itertools import chain
import string
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tqdm import tqdm
import math 
punct = set(string.punctuation+'’'+'“'+'”')
punct.remove(')')
punct.remove('(')
table = str.maketrans({key: None for key in punct})
%matplotlib inline

In [29]:
import spacy

nlp = spacy.load("en_core_web_lg")
nlp.tokenizer.rules = {key: value for key, value in nlp.tokenizer.rules.items() if "cannot" not in key}
def spacyize(text):
    spac = nlp(text)
    return {'lemmas':[token.lemma_ for token in spac], 'pos':[token.pos_ for token in spac], 'stops':[token.is_stop for token in spac], 'tokens':[token.text for token in spac]}


In [30]:
def remove_punctuation(text):
    return text.translate(table)

In [31]:
import re

seed_finder = re.compile(r'“.+”') #note that this wont work if punctuation is removed

def get_seeds(text):
    m = seed_finder.match(text)
    if m:
        return m.group(0).strip('“”')


In [32]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


### build the graph

In [33]:
G = parseText('data/electionActLabeled.txt', verbose=False)

node: Alberta_Election_Act:{}
node: Alberta_Election_Act--None--0--None--0--Interpretation:{'text': 'Interpretation', 'level': 6}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1:{'text': 'In this Act,', 'level': 8}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1--a:{'text': '“advance poll” means a poll taken in advance of polling day;', 'level': 9}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1--b:{'text': '“by‑election” means an election other than a general election;', 'level': 9}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1--b.1:{'text': '“campaign period” means', 'level': 9}
node: Alberta_Election_Act--None--0--None--0--Interpretation--1--1--b.1--i:{'text': 'in the case of a general election held in accordance with section 38.1(2), the period commencing on February 1 in the year in which the election is held and ending 2 months after polling day,', 'level': 10}
node: Alberta_Election_Act--None--0--None-

### create the initial training and test splits

In [34]:
interp = set(nx.dfs_tree(G, source='Alberta_Election_Act--None--0--None--0--Interpretation'))
rem = set(nx.dfs_tree(G, source='Alberta_Election_Act').nodes)
print(len(interp))
print(len(rem))
candidates = rem-interp

83
2064


In [35]:
x_train, x_test = tts(list(candidates), test_size=0.10, random_state=42)

In [36]:
print(len(x_train))

1782


In [39]:
df = pd.DataFrame({'text':[G.nodes[x]['text'] for  x in x_train if len(G.nodes[x])>0]})
df['level'] = [G.nodes[x]['level'] for  x in x_train if len(G.nodes[x])>0]
df['node'] = [x for  x in x_train if len(G.nodes[x])>0]

df.to_csv('train.csv')
df.head()


Unnamed: 0,text,level,node
0,The Chief Electoral Officer’s directive under ...,8,Alberta_Election_Act--Appointments--1--None--0...
1,Advance poll count,6,"Alberta_Election_Act--Elections,_By‑Elections_..."
2,Each person being nominated as a candidate sha...,8,"Alberta_Election_Act--Elections,_By‑Elections_..."
3,A contracting party who enters into a complian...,7,Alberta_Election_Act--Administrative_Penalties...
4,"the candidates for the electoral division, the...",9,Alberta_Election_Act--Post‑Polling‑Day_Procedu...


In [40]:
len(df)

1781

# evaluation functions

In [42]:
def entities_from_tags(text, tags):
    entities = []
    in_ent =False
    for te, ta in zip(text.split(), tags):
        #print(te, ta)
        try:
            if ta == 'U':
                entities.append(te)
            if ta == 'B':
                entities.append([te])
            if ta == 'I' or ta == 'L':
                entities[-1].append(te)
            if ta == 'L':
                entities[-1] = ' '.join(entities[-1])
        except:
            print(text)
            print(tags)
    return entities

def fix_tags(break_index, tags):
    if break_index:
        return tags[1:]
    return tags

def to_set(entities):
    s = set()
    for ent in entities:
        s.update(ent.split())
    return s

In [13]:
eval_data = pd.read_csv('annotated.csv')
eval_data = eval_data.loc[:, ~eval_data.columns.str.contains('^Unnamed')]
eval_data.rename(columns={'0':'text'}, inplace=True)
#df.Tags.apply(lambda x: x[1:-1].split(','))
eval_data['tags'] = eval_data['tags'].apply(lambda x: list(map(lambda y: str(y).strip().strip("'"), x[1:-1].split(','))))
eval_data['tags'] = eval_data.apply(lambda x: fix_tags(getBreak(x['text']), x['tags']), axis=1)
eval_data['text'] = eval_data['text'].apply(lambda x: remove_punctuation(x[getBreak(x):].strip()))
eval_data['entities'] = eval_data.apply( lambda x: entities_from_tags(x['text'], x['tags']), axis = 1)
eval_data['entity_indexes'] = eval_data.tags.apply(lambda x: [i for i,y in enumerate(x) if y in "BILU"])
eval_data['entity_token_set'] = eval_data.entities.apply(to_set)
eval_data['entity_set'] = eval_data.entities.apply(lambda x: set(x))
eval_data.to_csv('final_evaluation_data.csv')
eval_data.head()

Unnamed: 0,text,tags,entities,entity_indexes,entity_token_set,entity_set
0,a statement of the availability of barrier‑fre...,"[O, U, O, O, O, O, B, L, O, O, O, O, O, B, L, ...","[statement, barrier‑free accessibility, return...","[1, 6, 7, 13, 14, 18, 19, 20]","{statement, advance, places, polling, officer,...","{statement, advance polling places, barrier‑fr..."
1,The following restrictions apply with respect ...,"[O, O, O, O, O, O, O, O, O, O, B, I, L]",[accessible voting equipment],"[10, 11, 12]","{accessible, equipment, voting}",{accessible voting equipment}
2,On the coming into force of subsection (1) the...,"[O, O, O, O, O, O, B, L, O, O, O]",[subsection (1)],"[6, 7]","{subsection, (1)}",{subsection (1)}
3,are Canadian citizens,"[O, B, L]",[Canadian citizens],"[1, 2]","{Canadian, citizens}",{Canadian citizens}
4,forfeits the persons right to vote in the elec...,"[O, O, O, O, O, O, O, O, U, O]",[election],[8],{election},{election}


# prepare the data

### Evaluation Dataset

In [14]:
eval_data['text']=eval_data.text.apply(remove_punctuation)
eval_data['text']=eval_data.text.apply(lambda y: ' '.join(y.split()))
spac = eval_data.text.apply(spacyize)
eval_data['lemmas'] = [x['lemmas'] for x in spac]
eval_data['pos'] = [x['pos'] for x in spac]
eval_data['stops'] = [x['stops'] for x in spac]

In [15]:
eval_data.head()

Unnamed: 0,text,tags,entities,entity_indexes,entity_token_set,entity_set,lemmas,pos,stops
0,a statement of the availability of barrier‑fre...,"[O, U, O, O, O, O, B, L, O, O, O, O, O, B, L, ...","[statement, barrier‑free accessibility, return...","[1, 6, 7, 13, 14, 18, 19, 20]","{statement, advance, places, polling, officer,...","{statement, advance polling places, barrier‑fr...","[a, statement, of, the, availability, of, barr...","[DET, NOUN, ADP, DET, NOUN, ADP, NUM, NOUN, AD...","[True, False, True, True, False, True, False, ..."
1,The following restrictions apply with respect ...,"[O, O, O, O, O, O, O, O, O, O, B, I, L]",[accessible voting equipment],"[10, 11, 12]","{accessible, equipment, voting}",{accessible voting equipment},"[the, follow, restriction, apply, with, respec...","[DET, VERB, NOUN, VERB, ADP, NOUN, ADP, DET, N...","[True, False, False, False, True, False, True,..."
2,On the coming into force of subsection (1) the...,"[O, O, O, O, O, O, B, L, O, O, O]",[subsection (1)],"[6, 7]","{subsection, (1)}",{subsection (1)},"[on, the, come, into, force, of, subsection, (...","[ADP, DET, VERB, ADP, NOUN, ADP, NOUN, PUNCT, ...","[True, True, False, True, False, True, False, ..."
3,are Canadian citizens,"[O, B, L]",[Canadian citizens],"[1, 2]","{Canadian, citizens}",{Canadian citizens},"[be, canadian, citizen]","[AUX, ADJ, NOUN]","[True, False, False]"
4,forfeits the persons right to vote in the elec...,"[O, O, O, O, O, O, O, O, U, O]",[election],[8],{election},{election},"[forfeit, the, person, right, to, vote, in, th...","[VERB, DET, NOUN, ADJ, PART, VERB, ADP, DET, N...","[False, True, False, False, True, False, True,..."


### Train Data

In [16]:
df.text = df.text.apply(remove_punctuation)
df['text']=df.text.apply(lambda y: ' '.join(y.split()))
spac = df.text.apply(spacyize)
df['lemmas'] = [x['lemmas'] for x in spac]
df['pos'] = [x['pos'] for x in spac]
df['stops'] = [x['stops'] for x in spac]

In [17]:
df.head()

Unnamed: 0,text,level,node,lemmas,pos,stops
0,the Certificate and Return,9,Alberta_Election_Act--Post‑Polling‑Day_Procedu...,"[the, Certificate, and, Return]","[DET, PROPN, CCONJ, PROPN]","[True, False, True, False]"
1,Fees and Expenses,3,Alberta_Election_Act--Fees_and_Expenses--9,"[fee, and, expense]","[NOUN, CCONJ, NOUN]","[False, True, False]"
2,In counting the votes the deputy returning off...,8,"Alberta_Election_Act--Elections,_By‑Elections_...","[in, count, the, vote, the, deputy, return, of...","[ADP, VERB, DET, NOUN, DET, NOUN, VERB, NOUN, ...","[True, False, True, False, True, False, False,..."
3,The Chief Electoral Officer shall establish gu...,8,"Alberta_Election_Act--Elections,_By‑Elections_...","[the, Chief, Electoral, Officer, shall, establ...","[DET, PROPN, PROPN, PROPN, AUX, VERB, NOUN, VE...","[True, False, False, False, False, False, Fals..."
4,A registered candidate a registered constituen...,8,"Alberta_Election_Act--Elections,_By‑Elections_...","[a, registered, candidate, a, register, consti...","[DET, ADJ, NOUN, DET, VERB, PROPN, PROPN, CCON...","[True, False, False, True, False, False, False..."


### Seeds (interpretations)

In [18]:
interpretations = pd.DataFrame({'text':[G.nodes[x]['text'].strip() for  x in interp if len(G.nodes[x])>0]})
interpretations['level'] = [G.nodes[x]['level'] for  x in interp if len(G.nodes[x])>0]
interpretations['node'] = [x for  x in interp if len(G.nodes[x])>0]
interpretations

Unnamed: 0,text,level,node
0,“judge” means a judge of the Court of Queen’s ...,9,Alberta_Election_Act--None--0--None--0--Interp...
1,in the case of a general election held in acco...,10,Alberta_Election_Act--None--0--None--0--Interp...
2,“respondent” means a candidate against whose e...,9,Alberta_Election_Act--None--0--None--0--Interp...
3,“polling day” means the day fixed for voting a...,9,Alberta_Election_Act--None--0--None--0--Interp...
4,repealed 2004 c23 s2;,9,Alberta_Election_Act--None--0--None--0--Interp...
...,...,...,...
78,“registered constituency association” means a...,9,Alberta_Election_Act--None--0--None--0--Interp...
79,“clerk” means a clerk of the Court of Queen’s ...,9,Alberta_Election_Act--None--0--None--0--Interp...
80,“voting” means voting at an election or plebis...,9,Alberta_Election_Act--None--0--None--0--Interp...
81,“polling station” means a place where an elect...,9,Alberta_Election_Act--None--0--None--0--Interp...


In [19]:
interpretations['seed'] = interpretations.text.apply(get_seeds )

### output

In [713]:
#df.to_csv('dataset/train.csv')
#eval_data.to_csv('dataset/test.csv')
#interpretations.to_csv('dataset/interpretations.csv')
df.to_pickle('dataset/train.pickle')
eval_data.to_pickle('dataset/test.pickle')
interpretations.to_pickle('dataset/interpretations.pickle')
nx.write_gpickle(G, 'dataset/graph.pickle')


### reload...

In [43]:
df = pd.read_pickle('dataset/train.pickle')
eval_data = pd.read_pickle('dataset/test.pickle')
interpretations = pd.read_pickle('dataset/interpretations.pickle')
G = nx.read_gpickle('dataset/graph.pickle')

### OOP code objects

In [44]:
from functools import lru_cache

def evaluate_entity_sets(entity_sets):
    results = []
    total_correctly_matched = 0
    total_false_positives = 0
    total_false_negatives = 0
    
    for s, v in zip(entity_sets, eval_data.entity_set):
        correctly_matched = len(s.intersection(v))
        false_positive = len(s-v)
        false_negative = len(v-s)
        total_correctly_matched += correctly_matched
        total_false_positives += false_positive
        total_false_negatives += false_negative
        results.append({'correct':correctly_matched, 'false_positive':false_positive, 'false_negative':false_negative})
    precision = total_correctly_matched/(total_correctly_matched+total_false_positives)
    recall = total_correctly_matched/(total_correctly_matched+total_false_negatives)
    f1 = 2*((precision*recall)/(precision+recall))
    return {'precision':precision, 'recall':recall, 'f1':f1, 'results':results}

def evaluate_entity_token_sets(entity_sets):
    results = []
    total_correctly_matched = 0
    total_false_positives = 0
    total_false_negatives = 0
    
    for s, v in zip(entity_sets, eval_data.entity_token_set):
        correctly_matched = len(s.intersection(v))
        false_positive = len(s-v)
        false_negative = len(v-s)
        total_correctly_matched += correctly_matched
        total_false_positives += false_positive
        total_false_negatives += false_negative
        results.append({'correct':correctly_matched, 'false_positive':false_positive, 'false_negative':false_negative})
    precision = total_correctly_matched/(total_correctly_matched+total_false_positives)
    recall = total_correctly_matched/(total_correctly_matched+total_false_negatives)
    f1 = 2*((precision*recall)/(precision+recall))
    return {'precision':precision, 'recall':recall, 'f1':f1, 'results':results}

class Span():
    def __init__(self, start, end):
        self.start = start
        self.end = end
        self.len = end-start
    
    def __len__(self):
        return self.len
    
    def overlap(self, other):
        return max(self.start, other.start) <= min(self.end, other.end)
    
    def overlaps(self,other):
        return self.overlap(other)
        
    def __repr__(self):
        return f'start: {self.start} end: {self.end}'
    
    def __str__(self):
        return self.__repr__()
    
    def __eq__(self, other):
        return self.start == other.start and self.end == other.end
        
    def __ne__(self, other):
        return (not self.__eq__(other))
    
    def __hash__(self):
        return hash(self.__repr__())

def char_spans_to_token_span(char_spans, text):
        tokens=tokenize(text)
        
        token_spans = []
        #print(char_spans)

        for span in char_spans:
            this_span_start = -1
            this_span_end = -1
            start = 0
            end = 0
            for i, t in enumerate(tokens):
                end += len(t)
                t_span = Span(start, end)
                start = end
                overlaps =span.overlap(t_span)
                
                if overlaps and this_span_start == -1 and t != ' ':
                    this_span_start = i
                elif overlaps:
                    this_span_end = i
                elif this_span_start != -1 and this_span_end != -1: 
                    token_spans.append(Span(this_span_start,this_span_end))
                    break
            else:
                if this_span_start == -1 or this_span_end == -1:
                    return []
                token_spans.append(Span(this_span_start,len(tokens)))

        return token_spans
            
class ReSearch():
    def __init__(self, text_re, pos_re):
        self.text_re = text_re
        self.pos_re = pos_re
        
    def match_text(self, text):
        spans = []
        for m in self.text_re.finditer(text):
            s = Span(m.start(),m.end())
            if any([s.overlap(x) for x in spans]):
                continue
            spans.append(s)
        return char_spans_to_token_span(spans, text)
    
    def match_pos(self, poses):
        if isinstance(poses, list):
            poses = ' '.join(str(x) for x in poses)
        return char_spans_to_token_span([Span(m.start(),m.end()) for m in self.pos_re.finditer(poses)], poses)
    
    def match_both(self, text, poses):
        
        text_spans = set(self.match_text(text))
        pos_spans = set(self.match_pos(poses))
        return set(text_spans).intersection(set(pos_spans))

        
class Seed(ReSearch):
    def __init__(self, text):
        #handle text
        self.text = remove_punctuation(text.lower())
        spac = spacyize(self.text)
        self.lemmas = spac['lemmas']
        self.pos = spac['pos']
        self.stops = spac['stops']
        self.embedding = embed([self.text])
        
        #handle pos tags
        self.tags = str(" ".join(x for x in self.pos))
        super().__init__(
                       re.compile(f"\\b{self.text}\\b", re.I), 
                       re.compile(f"\\b{self.tags}\\b", re.I)
                      )

    def __repr__(self):
        return f'Text: "{self.text}"' + '\n' + f'Pos: "{self.tags}"'
    
    def __str__(self):
        return self.__repr__()
    
    def fullMatch(self, text, tags):
        return self.match_both(text, tags)
@lru_cache()   
def get_pos_tags(text):
    spac = spacyize(text)
    return " ".join(str(x) for x in spac['pos'])
@lru_cache()
def tokenize(text):
    return re.split('(\W)', text)

class SeedList():
    def __init__(self, startList):
        self.seeds = [Seed(text) for text in startList]
        self.sort()
        
    def add(self, text, sort = True):
        self.seeds.append(Seed(text))
        if sort:
            self.sort()
            
    def sort(self):
        self.seeds.sort(key=lambda x: len(x.text), reverse = True)
        
    def __len__(self):
        return len(self.seeds)
    
    def get_spans_from_seeds(self, text):
        spans = set()
        tokens = tokenize(text)
        for i, seed in enumerate(self.seeds):
            matches = seed.match_text(text)
            for match in matches:
                for span in spans:
                    if span.overlaps(match):
                        break
                else:
                    spans.add(match)
        return spans
                    
    def get_entity_sets_from_seeds(self, text, whole=True, both=False):
        entities = set()
        spans = set()
        tags = get_pos_tags(text)
        tokens = tokenize(text)
        for i, seed in enumerate(self.seeds):
            if both:
                matches = seed.fullMatch(text, tags)
            else:
                matches = seed.match_text(text)
            for match in matches:
                for span in spans:
                    if span.overlaps(match):
                        break
                else:
                    spans.add(match)
                    if whole:
                        entities.add(''.join(tokens[match.start:match.end]))
                    else:
                        entities.update(t for t in tokens[match.start:match.end] if t != ' ')
                    
                    
        return entities
    
    def score(self, text):
        emb = embed([text])
        sc = (0.0, '')
        for seed in self.seeds:
            this_score = np.inner(seed.embedding, emb)
            if this_score>sc[0]:
                sc = (float(this_score), seed.text)
        return sc

class Pattern(ReSearch):
    def __init__(self, prior_pos, entity_pos, post_pos, entity_seed):
        self.prior_pos = prior_pos
        self.entity_pos = entity_pos
        self.post_pos = post_pos
        self.pattern = prior_pos + entity_pos + post_pos
        self.prior_len = len(prior_pos)*2-1 #account for the spaces....
        self.post_len = len(post_pos)*2-1 #account for the spaces....
        self.entity_len = len(entity_pos)
        self.len = len(self.pattern)
        self.entity_seed = entity_seed
        self.pattern_text = ' '.join(x for x in self.pattern)
        self.entities = set()
        super().__init__(
                       re.compile(f"\\b{self.entity_seed}\\b", re.I), 
                       re.compile(f"\\b{self.pattern_text}\\b", re.I)
                      )
    
    def __repr__(self):
        return ', '.join(self.pattern)
    
    def __str__(self):
        return self.__repr__()
        
    def __len__(self):
        return self.len
    
    def __eq__(self, other):
        if isinstance(other, Pattern):
            return self.__repr__() == other.__repr__()
        else:
            return False
    
    def __ne__(self, other):
        return (not self.__eq__(other))
    
    def __hash__(self):
        return hash(self.__repr__())
    
    def get_entities(self, text, tag_string, both=False):
        if isinstance(text, str):
            tokens = tokenize(text)
        if isinstance(text, list):
            tokens = text
            text = ''.join(tokens)
            
        if not isinstance(tag_string, str):
            tag_string = ' '.join(tag_string)
        
        entity_matches = set()
        
        if both:
            pos_spans = self.match_both(text, tag_string)
        else:
            pos_spans = self.match_pos(tag_string)
        for span in pos_spans:
            entity_matches.add(''.join(tokens[span.start+self.prior_len:span.end-self.post_len]).strip())
        return entity_matches
        
        
class Patterns():
    def __init__(self, startList, evaluation_data):
        self.seedList = SeedList(startList)
        self.eval_data = eval_data
        self.seed_set = startList
        self.results = []
        self.patterns = set()
        
    def print_seeds(self):
        for x in self.seedList.seeds:
            print(x)
            
    def __len__(self):
        return len(self.seedList)
    
    def check_seeds(self, text, whole=True):
        return self.seedList.get_entity_sets_from_seeds(text, whole=whole)

    def add_seed(self, text):
        self.seedList.add(text)
        self.seed_set.add(text)
    
    def score(self, text):
        return self.seedList.score(text)

    def evaluate(self, new_candidates = []):
        initial_seed_set = self.eval_data.text.apply(self.check_seeds, whole=True)
        r = evaluate_entity_sets(initial_seed_set)
#         print('Results for whole entitiy objects')
#         print(f"Entity Set Precision: {r['precision']}")
#         print(f"Entity Set Recall: {r['recall']}")
#         print(f"Entity Set F1: {r['f1']}")
#         print()
        

#         print('Results for partial entitiy objects')
        initial_token_set = self.eval_data.text.apply(self.check_seeds, whole=False)
        rtok = evaluate_entity_token_sets(initial_token_set)
#         print(f"Entity Token Set Precision: {rtok['precision']}")
#         print(f"Entity Token Set Recall: {rtok['recall']}")
#         print(f"Entity Token Set F1: {rtok['f1']}")
        
        self.results.append({
            'F1':r['f1'],
            'Precision': r['precision'],
            'Recall': r['recall'],
            'Partial F1': rtok['f1'],
            'Partial Recall': rtok['recall'],
            'Partial Precision': rtok['precision'],
            'New Candidates':new_candidates,
        })
        return r, rtok
    
    def get_spans(self, text):
        return self.seedList.get_spans_from_seeds(text)
        
    def build_patterns(self, train_data, pre_window=4, post_window=4):
        
        for a, row in train_data.iterrows():
            spans = self.get_spans(row.text)
            tokens = tokenize(row.text)
            pos = row.pos
            for span in spans:
                pos_start = span.start-span.start//2
                pos_end = span.end-span.end//2
                for a in range(2,pre_window + 1):
                    for b in range(2,post_window +1):
                        p = Pattern(pos[max(0, pos_start-a):pos_start],pos[pos_start:pos_end],pos[pos_end:pos_end+b], ''.join(tokens[span.start:span.end]))
                        if p not in self.patterns:
                            self.patterns.add(p)
                        
        print(len(self.patterns))
        
    def run(self, df):
        for pattern in tqdm(self.patterns):
            pattern_entities = set()
            if len(pattern.entities) == 0:
                for _, row in df.iterrows():
                    pattern_entities.update(pattern.get_entities(row.text, row.pos))
                pattern.entities = pattern_entities    

                
            pattern.positives = self.seed_set.intersection(pattern.entities)
            pattern.negatives = pattern.entities-self.seed_set
            if len(pattern.positives) == 0 or len(pattern.negatives) == 0:
                pattern.overall = 0
                pattern.score = 0
                continue
            
            pattern.score = sum([1-self.score(x)[0] for x in pattern.negatives])
            pattern.overall = len(pattern.positives)/(len(pattern.negatives)+pattern.score+0.00000000000001)*math.log(len(pattern.positives))

        ranked = sorted(list(self.patterns), reverse=True, key = lambda x: x.overall)
        new_candidates = set()
        for r in ranked[:50]:
            new_candidates.update(r.negatives)
        new_candidates = sorted(new_candidates, key=lambda x: self.score(x), reverse = True)
        for cand in new_candidates[:10]:
            self.add_seed(cand)
        self.evaluate(new_candidates[:10])
        print(self.results[-1])

In [21]:
seeds = Patterns(set((x for x in interpretations.seed if x)), eval_data)
#seeds.print_seeds()

# Spacy Evaluation

In [22]:
part_results = []
whole_results = []

for text in eval_data.text:
    wholes = set()
    parts = set()
    
    doc = nlp(text)
    for ent in doc.ents:
        wholes.add(ent.text)
        parts.update(ent.text.split())
    part_results.append(parts)
    whole_results.append(wholes)
#print(part_results)
#print()
#print(whole_results)
r = evaluate_entity_sets(whole_results)
print(f"Entity Set Precision: {r['precision']}")
print(f"Entity Set Recall: {r['recall']}")
print(f"Entity Set F1: {r['f1']}")
print()
rtok = evaluate_entity_token_sets(part_results)
print(f"Entity Token Set Precision: {rtok['precision']}")
print(f"Entity Token Set Recall: {rtok['recall']}")
print(f"Entity Token Set F1: {rtok['f1']}")

Entity Set Precision: 0.2357142857142857
Entity Set Recall: 0.059245960502693
Entity Set F1: 0.09469153515064563

Entity Token Set Precision: 0.6743295019157088
Entity Token Set Recall: 0.17238001958863858
Entity Token Set F1: 0.2745709828393136


# initial results on seed dict from interpretation entities :)

In [23]:
e = seeds.evaluate()
seeds.results

[{'F1': 0.4383561643835616,
  'Precision': 0.7154471544715447,
  'Recall': 0.31597845601436264,
  'Partial F1': 0.5237757274662882,
  'Partial Recall': 0.3614103819784525,
  'Partial Precision': 0.9510309278350515,
  'New Candidates': []}]

# seed expansion

### set up initial patterns on the training data

In [24]:
seeds.build_patterns(df)

10440


In [25]:
seeds.run(df)

100%|██████████| 10440/10440 [13:14<00:00, 13.15it/s]


{'F1': 0.44208037825059104, 'Precision': 0.6470588235294118, 'Recall': 0.3357271095152603, 'Partial F1': 0.5355191256830601, 'Partial Recall': 0.38393731635651324, 'Partial Precision': 0.8848758465011287, 'New Candidates': ['rejected ballots', 'election for', 'voter', 'vote', 'electors', 'election period', 'election process', 'poll clerk', 'ballot', 'the Election']}


In [26]:
currentF1 = seeds.results[0]['F1']
currentpF1 = seeds.results[0]['Partial F1']
f1diff = seeds.results[-1]['F1'] - currentF1
f1pdiff = seeds.results[-1]['Partial F1'] - currentpF1
print(f"F1 changed by: {f1diff*100:.2f}")
print(f"Partial F1 changed by: {f1pdiff*100:.2f}")
currentF1 = seeds.results[-1]['F1']
currentpF1 = seeds.results[-1]['Partial F1']

F1 changed by: 0.37
Partial F1 changed by: 1.17


## looooooopppppp

In [27]:
currentF1 = seeds.results[-1]['F1']
currentpF1 = seeds.results[-1]['Partial F1']
fail_count=0

while True:
    seeds.build_patterns(df)
    seeds.run(df)
   
    if seeds.results[-1]['F1'] <= currentF1 and seeds.results[-1]['Partial F1'] <= currentpF1:
        fail_count+=1
        if fail_count >=10:
            break
    else:
        fail_count = 0
        f1diff = seeds.results[-1]['F1'] - currentF1
        f1pdiff = seeds.results[-1]['Partial F1'] - currentpF1
        print(f"F1 changed by: {f1diff*100:.2f} and is at {seeds.results[-1]['F1']*100:.2f}")
        print(f"Partial F1 changed by: {f1pdiff*100:.2f} and is at {seeds.results[-1]['Partial F1']*100:.2f}")
        currentF1 = seeds.results[-1]['F1']
        currentpF1 = seeds.results[-1]['Partial F1']


12280


100%|██████████| 12280/12280 [02:56<00:00, 69.41it/s] 


{'F1': 0.46064814814814814, 'Precision': 0.6482084690553745, 'Recall': 0.35727109515260325, 'Partial F1': 0.5465587044534412, 'Partial Recall': 0.3966699314397649, 'Partial Precision': 0.8785249457700651, 'New Candidates': ['court', 'voter place', 'polling', 'ballot box', 'ballots', 'polling subdivisions', 'election clerk', 'polling booth', 'votes', 'the electoral']}
F1 changed by: 1.86 and is at 46.06
Partial F1 changed by: 1.10 and is at 54.66
13016


100%|██████████| 13016/13016 [01:36<00:00, 135.27it/s]


{'F1': 0.4657534246575343, 'Precision': 0.6394984326018809, 'Recall': 0.36624775583482944, 'Partial F1': 0.5584502338009353, 'Partial Recall': 0.40940254652301666, 'Partial Precision': 0.8781512605042017, 'New Candidates': ['election officers', 'election survey', 'elections enumerations', 'former election', 'election proclamation', 'election documents', 'supply vote', 'official', 'poll', 'judgment']}
F1 changed by: 0.51 and is at 46.58
Partial F1 changed by: 1.19 and is at 55.85
13495


100%|██████████| 13495/13495 [01:19<00:00, 170.03it/s]


{'F1': 0.4705882352941176, 'Precision': 0.6162790697674418, 'Recall': 0.38061041292639136, 'Partial F1': 0.568241469816273, 'Partial Recall': 0.4240940254652302, 'Partial Precision': 0.8608349900596421, 'New Candidates': ['survey', 'candidates', 'campaign', 'registration officer', 'referendum period', 'school division', 'petitioner', 'judges seal', 'name', 'custody']}
F1 changed by: 0.48 and is at 47.06
Partial F1 changed by: 0.98 and is at 56.82
14294


100%|██████████| 14294/14294 [01:43<00:00, 137.63it/s]


{'F1': 0.38316920322291853, 'Precision': 0.3821428571428571, 'Recall': 0.38420107719928187, 'Partial F1': 0.52954808806489, 'Partial Recall': 0.4476003917727718, 'Partial Precision': 0.64822695035461, 'New Candidates': ['a polling', 'nomination', 'persons nomination', 'deputy', 'information officer', 'appeal', 'is', 'check', 'sponsor', 'the']}
26691


100%|██████████| 26691/26691 [15:44<00:00, 28.25it/s]


{'F1': 0.3736075407026564, 'Precision': 0.35737704918032787, 'Recall': 0.39138240574506283, 'Partial F1': 0.5311622683885457, 'Partial Recall': 0.4632713026444662, 'Partial Precision': 0.6223684210526316, 'New Candidates': ['polling places', 'candidates behalf', 'nomination form', 'appointment', 'names', 'ballot envelope', 'voters', 'election enumeration', 'of the', 'persons behalf']}
27678


100%|██████████| 27678/27678 [02:10<00:00, 211.37it/s]


{'F1': 0.33206974981046244, 'Precision': 0.2874015748031496, 'Recall': 0.39317773788150806, 'Partial F1': 0.5135983263598327, 'Partial Recall': 0.48090107737512244, 'Partial Precision': 0.5510662177328844, 'New Candidates': ['of', 'persons', 'person', 'mailing', 'to a', 'certificate envelope', 'during the', 'information', 'landlords behalf', 'notice']}
31428


100%|██████████| 31428/31428 [05:32<00:00, 94.42it/s] 


{'F1': 0.3087621696801112, 'Precision': 0.25198637911464244, 'Recall': 0.3985637342908438, 'Partial F1': 0.5157790927021697, 'Partial Recall': 0.5122428991185113, 'Partial Precision': 0.519364448857994, 'New Candidates': ['registration', 'to', 'particulars', 'bulletins', 'respondents', 'employees', 'information complaints', 'recount', 'section', 'friend']}
35901


100%|██████████| 35901/35901 [06:25<00:00, 93.10it/s] 


{'F1': 0.3006711409395973, 'Precision': 0.24008574490889603, 'Recall': 0.4021543985637343, 'Partial F1': 0.510576923076923, 'Partial Recall': 0.5200783545543585, 'Partial Precision': 0.5014164305949008, 'New Candidates': ['landlords', 'certificate', 'envelope', 'proclamation', 'that', 'supply', 'decision', 'advertisement', 'onus', 'document']}
37176


100%|██████████| 37176/37176 [02:37<00:00, 235.36it/s]


error: unbalanced parenthesis at position 2

In [None]:
seeds.results

In [None]:
data = pd.DataFrame(seeds.results)
data

In [None]:
data.to_csv('data/results/mar18-punct.csv')

# Section Referrals

In [98]:
number_finder = re.compile(r'((?:section)|(?:part)|(?:division)|(?:subsection)|(?:subsubsection)){0,1} {0,1}(?:(\d+)|(\(\d+\))|(\([a-z]+\))+)+')
def find_possible_referrals(text):
    matches = number_finder.finditer(text, re.MULTILINE)
    return [str(match.group()).strip() for match in matches]
    for match in matches:

        print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))

        return [x for x in number_finder.findall(text)]
find_possible_referrals('refer to section 1, 45(5)(e)(i) in this act')

['section 1', '45(5)(e)(i)']