<a href="https://colab.research.google.com/github/ratmcu/wiki_ner/blob/master/conll_tagged_ne.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install wget
import os
import wget
# Bootstrap the `colabimport` helper: use it if it is already importable,
# otherwise download a fresh copy into the working directory and import that.
try:
    import colabimport
except Exception:  # narrower than a bare except: KeyboardInterrupt/SystemExit still propagate
    import importlib
    colabimporturl = 'https://github.com/ratmcu/colaboratory_import/raw/master/colabimport.py'
    # File name is the last URL path component with any query string removed.
    filename = colabimporturl.split("/")[-1].split("?")[0]
    # Remove any existing copy before downloading a new one.
    if os.path.isfile(filename):
        os.remove(filename)
    wget.download(colabimporturl)
    # The module file was created after the interpreter started; make sure the
    # import system's directory caches notice it before importing.
    importlib.invalidate_caches()
    import colabimport
# Fetch the two helper notebooks so they can be imported as modules below.
colabimport.get_notebook('https://github.com/ratmcu/wiki_ner/blob/master/reusable_annotator.ipynb?raw=true')
colabimport.get_notebook('https://github.com/ratmcu/wiki_ner/blob/master/info_box.ipynb?raw=true')
# import io, os, sys, types
from reusable_annotator import PageContents
from info_box import InfoCard, PrivateEntities
import logging
# Configure the root logger (getLogger() with no name) at INFO level.
# The logging.debug(...) calls made by the tagging code further down stay
# silent at this level; lower it to logging.DEBUG to see the matched candidates.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# logging.debug("test")
class HashableTupleAnnotations(tuple):
    '''Tuple whose hash depends only on its first two items.

    The hash is computed from the one-element slices ``self[:1]`` and
    ``self[1:2]`` after sorting them, so any further items (for example a
    token span in third position) do not influence it. Equality is the
    ordinary tuple equality inherited from ``tuple``.
    '''
    def __hash__(self):
        leading_pair = (self[:1], self[1:2])
        return hash(tuple(sorted(leading_pair)))
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy
from spacy import displacy
from collections import Counter
nlp = spacy.load("en_core_web_sm")
import re
!pip install pyahocorasick
!pip install fuzzyset
from ahocorasick import Automaton
import fuzzyset
from operator import itemgetter, attrgetter
import pandas as pd
# import json

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [0]:
class WikiConLLTagger():
    '''Tag the text of one Wikipedia page with CoNLL-style B-/I-/O labels.

    On construction the page at ``url`` is loaded, its text chunk is parsed
    with the module-level spaCy pipeline ``nlp``, the info card is scraped and
    a ``TagFactory`` is run over the result to collect annotations.
    '''

    def __init__(self, url):
        '''Load, parse and annotate the page at ``url``.'''
        self.url = url
        self.page = PageContents(url)
        self.text = self.page.get_text_chunk()
        self.info_card = InfoCard(self.page)
        self.private_entities = PrivateEntities(self.info_card).entity_dict
        self.doc = nlp(self.text)
        # (text, start token, end token) for every noun chunk of the document.
        self.spacy_noun_chunks = [(chunk.text, chunk.start, chunk.end) for chunk in self.doc.noun_chunks]
        # TagFactory reads ``doc`` and ``private_entities`` from this instance,
        # so it has to be created last.
        self.tag_factory = TagFactory(self)

    def _get_annotations(self):
        '''Return all annotations as a list sorted in descending tuple order.

        Each annotation is ``(label, (start, end))``. Sorting whole tuples in
        reverse orders them by label first and by span second. This order
        decides which label wins in ``place_tags`` when spans overlap, because
        later items overwrite earlier ones.

        NOTE(review): the original comment said "sort by the start index", but
        the full-tuple reverse sort always determined the final order; that
        behaviour is kept here. Confirm which order is actually wanted.
        '''
        return sorted(self.tag_factory.get_annotations(), reverse=True)

    def get_metadata(self):
        '''Return a dict mapping each private-entity key to the second item of
        its entry, plus the page address under ``'URL'``.'''
        metadata = {key: entry[1] for key, entry in self.private_entities.items()}
        metadata['URL'] = self.url
        return metadata

    def place_tags(self):
        '''Build parallel word and tag columns for the whole document.

        Returns:
            dict with two equally long lists:
            ``'words'`` holds the items yielded by iterating each sentence of
            ``self.doc.sents`` and ``'tags'`` the matching ``'B-<label>'``,
            ``'I-<label>'`` or ``'O'`` strings. A ``'\\n'`` entry is appended to
            both lists after every sentence as a separator.
        '''
        annotation_tags = ['O'] * len(self.doc)
        for annotation in self._get_annotations():
            label, span = annotation[0], annotation[1]
            start, end = span[0], span[1]
            # First token of the span opens the entity, the rest continue it.
            annotation_tags[start] = 'B-' + label
            for index in range(start + 1, end):
                annotation_tags[index] = 'I-' + label

        words = []
        tags = []
        for sentence in self.doc.sents:
            tags.extend(annotation_tags[sentence.start:sentence.end])
            tags.append('\n')
            words.extend(sentence)
            words.append('\n')
        return {'words': words, 'tags': tags}
                

In [0]:
class TagFactory():
    '''Collect annotations for the info-box entities mentioned in the page text.

    Every ``_tag_*`` method compares spaCy entities and/or noun chunks of the
    document with the values stored under one key of ``entity_dict`` and adds
    each hit to ``tag_set`` as ``('word phrase', 'ENTITY', (start, end))``,
    where ``start``/``end`` are the spaCy token offsets of the phrase.

    ``entity_dict[key][1]`` is iterated as a sequence of strings (the
    "details"); presumably these are the values scraped for that info-box
    line -- verify against ``PrivateEntities``.
    '''

    def __init__(self, tagger):
        '''Run every tagging method against ``tagger.doc``.

        ``tagger`` must expose ``doc`` (parsed spaCy document) and
        ``private_entities`` (the entity dictionary).
        '''
        self.doc = tagger.doc
        self.entity_dict = tagger.private_entities
        self.tag_set = set()
        self.spacy_noun_chunks = [(chunk.text, chunk.start, chunk.end) for chunk in self.doc.noun_chunks]
        # Tagging methods are discovered by name: anything callable whose name
        # starts with ``_tag_`` is invoked without arguments. Helper methods
        # must therefore NOT use that prefix.
        tagging_methods = [getattr(self, name) for name in dir(self)
                           if re.match('_tag_.*', name) and callable(getattr(self, name))]
        for method in tagging_methods:
            method()

    def get_annotations(self):
        '''Return a list of ``('ENTITY', (start, end))`` tuples, one per tagged phrase.'''
        return [annot[1:] for annot in self.tag_set]

    # ------------------------------------------------------------------ helpers

    def _details_for(self, key):
        '''Return ``entity_dict[key][1]``, or None when ``key`` is not present.'''
        if key not in self.entity_dict:
            return None
        return self.entity_dict[key][1]

    def _entities_with_label(self, label):
        '''Return ``(text, start, end)`` for every spaCy entity carrying ``label``.'''
        return [(ent.text, ent.start, ent.end) for ent in self.doc.ents if ent.label_ == label]

    @staticmethod
    def _fuzzy_rule(min_score, same_initial=False):
        '''Build an acceptance rule for one fuzzy-match result.

        The rule accepts when the score is at least ``min_score`` and the
        matched candidate word is not shorter than the detail word; with
        ``same_initial`` both words must also start with the same character.
        '''
        def accept(word, score, candidate):
            if not (score >= min_score and len(candidate) >= len(word)):
                return False
            return (not same_initial) or candidate[0] == word[0]
        return accept

    @staticmethod
    def _exact_rule(word, score, candidate):
        '''Acceptance rule that only takes results whose score equals 1.'''
        return score == 1

    def _iter_matches(self, candidates, details, accept):
        '''Fuzzy-match every detail against every candidate phrase.

        For each ``(text, start, end)`` candidate a FuzzySet of its
        whitespace-separated words is built; each word of each detail is then
        looked up in it. Yields, per (candidate, detail) pair,
        ``(text, (start, end), candidate_words, detail_words, matched)`` where
        ``matched`` lists ``(index, word, result)`` for the detail words whose
        best result satisfied ``accept(word, score, matched_word)``.
        '''
        for text, start, end in candidates:
            fuzzy_words = fuzzyset.FuzzySet(use_levenshtein=False)
            candidate_words = text.split()
            for word in candidate_words:
                fuzzy_words.add(word)
            for detail in details:
                detail_words = detail.split()
                matched = []
                for index, word in enumerate(detail_words):
                    result = fuzzy_words.get(word)
                    # result[0] is the best hit as (score, matched word).
                    if result and accept(word, result[0][0], result[0][1]):
                        matched.append((index, word, result))
                yield text, (start, end), candidate_words, detail_words, matched

    @staticmethod
    def _fully_matched(matched, detail_words):
        '''True when at least one word matched and every detail word matched.'''
        return len(matched) != 0 and len(matched) == len(detail_words)

    def _add_annotation(self, text, label, span):
        '''Log and store one ``(text, label, (start, end))`` annotation.'''
        annot = (text, label, span)
        logging.debug(annot)
        self.tag_set.add(HashableTupleAnnotations(annot))

    def _annotate(self, candidates, details, label, accept, require_all=True):
        '''Annotate every candidate that matches a detail.

        With ``require_all`` every word of the detail has to match; otherwise
        a single matching word is enough.
        '''
        for text, span, _, detail_words, matched in self._iter_matches(candidates, details, accept):
            if not matched:
                continue
            if require_all and len(matched) != len(detail_words):
                continue
            self._add_annotation(text, label, span)

    # ----------------------------------------------------------------- taggers

    def _tag_bd(self):
        '''Tag DATE entities that match a ``BIRTH_DATE`` detail as ``BD``.'''
        birth_dates = self._details_for('BIRTH_DATE')
        if birth_dates is None:
            return
        self._annotate(self._entities_with_label('DATE'), birth_dates, 'BD', self._fuzzy_rule(0.5))

    def _tag_bp(self):
        '''Tag GPE entities and noun chunks that exactly match a ``BIRTH_PLACE`` detail as ``BP``.'''
        birth_places = self._details_for('BIRTH_PLACE')
        if birth_places is None:
            return
        self._annotate(self._entities_with_label('GPE'), birth_places, 'BP', self._exact_rule)
        logging.debug('------NOUN CHUNCK MATCHING------')
        self._annotate(self.spacy_noun_chunks, birth_places, 'BP', self._exact_rule)

    def _tag_spouse(self):
        '''Tag PERSON entities (all words matching) and noun chunks (any word
        matching) of a ``SPOUSES`` detail as ``SP``.'''
        spouse_names = self._details_for('SPOUSES')
        if spouse_names is None:
            return
        rule = self._fuzzy_rule(0.5)
        self._annotate(self._entities_with_label('PERSON'), spouse_names, 'SP', rule)
        logging.debug('------NOUN CHUNCK MATCHING------')
        self._annotate(self.spacy_noun_chunks, spouse_names, 'SP', rule, require_all=False)

    def _tag_edu(self):
        '''Tag ORG entities and noun chunks that match an ``EDUCATION`` detail as ``ED``.'''
        schools = self._details_for('EDUCATION')
        if schools is None:
            return
        self._annotate(self._entities_with_label('ORG'), schools, 'ED', self._fuzzy_rule(0.8))

        logging.debug('------NOUN CHUNCK MATCHING------')
        noun_matches = self._iter_matches(self.spacy_noun_chunks, schools, self._fuzzy_rule(0.9))
        for text, span, noun_words, detail_words, matched in noun_matches:
            if self._fully_matched(matched, detail_words):
                self._add_annotation(text, 'ED', span)
            elif len(noun_words) > 1 and detail_words and noun_words[0] == detail_words[0]:
                # Candidate exception for university names sharing the first
                # word: only logged, deliberately not added to tag_set.
                logging.debug((text, 'ED', span))

    def _tag_children(self):
        '''Tag PERSON entities (all words matching) and noun chunks (any word
        matching with the same initial) of a ``CHILDREN`` detail as ``CH``.'''
        children_names = self._details_for('CHILDREN')
        if children_names is None:
            return
        self._annotate(self._entities_with_label('PERSON'), children_names, 'CH', self._fuzzy_rule(0.5))
        logging.debug('------NOUN CHUNCK MATCHING------')
        self._annotate(self.spacy_noun_chunks, children_names, 'CH',
                       self._fuzzy_rule(0.5, same_initial=True), require_all=False)

    def _tag_parents(self):
        '''Tag PERSON entities that match a ``PARENTS`` detail as ``PR``.

        Noun chunks are matched as well, but those candidates are only logged
        at debug level and not added to ``tag_set``.
        '''
        parent_names = self._details_for('PARENTS')
        if parent_names is None:
            return
        self._annotate(self._entities_with_label('PERSON'), parent_names, 'PR', self._fuzzy_rule(0.5))
        logging.debug('------NOUN CHUNCK MATCHING------')
        noun_matches = self._iter_matches(self.spacy_noun_chunks, parent_names,
                                          self._fuzzy_rule(0.5, same_initial=True))
        for text, span, noun_words, detail_words, matched in noun_matches:
            # A noun chunk with fewer words than the name is not a candidate.
            if len(noun_words) < len(detail_words):
                continue
            # Require the first word of the name to be among the matches.
            if matched and matched[0][0] == 0:
                logging.debug((text, 'PR', span))


In [0]:
# Experiment: build a tagger for a single Wikipedia biography and print the
# metadata dictionary derived from its info box.
# Alternative page to try: 'https://en.wikipedia.org/wiki/Barack_Obama'
tagger = WikiConLLTagger(url='https://en.wikipedia.org/wiki/Donald_Trump')
print(tagger.get_metadata())

info card is scraped successfully
[['Born', 'PERSON'], ['Donald John Trump']]
[['Born', 'DATE'], ['1946-06-14', 'June 14, 1946']]
[['Born', 'GPE'], ['New York City', 'Queens']]
[['Children', 'PERSON'], ['Donald Jr.', 'Ivanka', 'Eric', 'Tiffany', 'Barron']]
[['Spouse(s)', 'PERSON'], ['Ivana Zelníčková', 'Marla Maples', 'Melania Knauss']]
[['Parents', 'PERSON'], ['Fred Trump', 'Mary Anne MacLeod']]
[['Education', 'ORG'], ['The Wharton School']]
b'{"NAME": ["Donald John Trump"], "BIRTH_DATE": ["1946-06-14", "June 14, 1946"], "BIRTH_PLACE": ["New York City", "Queens"], "CHILDREN": ["Donald Jr.", "Ivanka", "Eric", "Tiffany", "Barron"], "SPOUSES": ["Ivana Zeln\xc3\xad\xc4\x8dkov\xc3\xa1", "Marla Maples", "Melania Knauss"], "PARENTS": ["Fred Trump", "Mary Anne MacLeod"], "EDUCATION": ["The Wharton School"], "URL": "https://en.wikipedia.org/wiki/Donald_Trump"}'


In [0]:
# Experiment: write the word/tag columns produced by the tagger to a CSV file
# in the working directory, with a header row and without the row index.
df = pd.DataFrame(data=tagger.place_tags())
# `index` is a boolean option; pass False explicitly rather than relying on
# None being treated as falsy.
df.to_csv(r'conll_annot.csv', index=False, header=True)