In [1]:
import flair
from pathlib import Path
flair.cache_root = Path('../../data/flair')
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.embeddings import SentenceTransformerDocumentEmbeddings
from scipy import spatial
import requests

In [4]:
# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger = SequenceTagger.load('ner-fast')

# run NER over sentence
tagger.predict(sentence)

2021-04-24 16:47:14,356 --------------------------------------------------------------------------------
2021-04-24 16:47:14,357 The model key 'ner-fast' now maps to 'https://huggingface.co/flair/ner-english-fast' on the HuggingFace ModelHub
2021-04-24 16:47:14,358  - The most current version of the model is automatically downloaded from there.
2021-04-24 16:47:14,358  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner-fast/en-ner-fast-conll03-v0.4.pt)
2021-04-24 16:47:14,359 --------------------------------------------------------------------------------


Downloading:   0%|          | 0.00/257M [00:00<?, ?B/s]

2021-04-24 16:48:33,378 loading file ..\..\data\flair\models\ner-english-fast\4c58e7191ff952c030b82db25b3694b58800b0e722ff15427f527e1631ed6142.e13c7c4664ffe2bbfa8f1f5375bd0dced866b8c1dd7ff89a6d705518abf0a611


In [23]:
sentence.to_dict(tag_type='ner')

{'text': 'I love Berlin .',
 'labels': [],
 'entities': [{'text': 'Berlin',
   'start_pos': 7,
   'end_pos': 13,
   'labels': [LOC (0.9994)]}]}

In [95]:
res = requests.get('https://lookup.dbpedia.org/api/search?label=Berlin&maxResults=10&format=JSON_RAW')

In [97]:
eval(res.text)['docs'][0]

{'score': ['46574.086'],
 'refCount': ['4684'],
 'resource': ['http://dbpedia.org/resource/Berlin'],
 'redirectlabel': ['Athens on the Spree',
  'Berlib',
  'Berlim',
  'Berlin, Germany',
  'Berlin-Zentrum',
  'Berlin.de',
  'Berlin (Germany)',
  'Berlin City',
  'Berlin Germany',
  'Berlin State',
  'Capital of East Germany',
  'CityBerlin',
  'City of Berlin',
  'Cuisine of Berlin',
  'DEBER',
  'Federal State of Berlin',
  'Historical sites in berlin',
  'Land Berlin',
  'Silicon Allee',
  'Spreeathen',
  'State of Berlin',
  'UN/LOCODE:DEBER'],
 'typeName': ['Settlement', 'City', 'Place', 'PopulatedPlace', 'Location'],
 'comment': ["Berlin (; German: [bɛʁˈliːn] ()) is the capital and largest city of Germany by both area and population. Its 3,748,148 (2018) inhabitants make it the  most populous city proper of the European Union. The city is one of Germany's 16 federal states. It is surrounded by the state of Brandenburg, and contiguous with Potsdam, Brandenburg's capital. The two c

In [81]:
obj = xmltodict.parse(res.text)

In [74]:
for result in obj['ArrayOfResults']['Result']:
    print(result['Label'])

Berlin
Tennis Borussia Berlin
1. FC Union Berlin
West Berlin
East Berlin
Humboldt University of Berlin
Alba Berlin
Eisbären Berlin
Battle of Berlin
Berlin Thunder
Türkiyemspor Berlin
Irving Berlin
Technical University of Berlin
Free University of Berlin
Olympiastadion (Berlin)
FC Viktoria 1889 Berlin
Berlin, Maryland
SCC Berlin
Berlin University of the Arts
Gemäldegalerie, Berlin
Berlin, New Hampshire
Berlin, Connecticut
Berlin Philharmonic
Berlin Wall
Berlin (band)
Berlin, Wisconsin
Berlin Blockade
Füchse Berlin Reinickendorf
Berlin Schönefeld Airport
Berlin–Dresden railway
Air Berlin
SV Tasmania Berlin
Berlin Tegel Airport
Berlin Conference
Berlin International Film Festival
Berlin Hauptbahnhof
Berlin lebt 2
SC Dynamo Berlin
Berlin, Vermont
Berlin–Wrocław railway
Steve Berlin
Berlin Ringbahn
Wedding (Berlin)
Trams in Berlin
Berlin Tempelhof Airport
Isaiah Berlin
New Berlin, Wisconsin
Berlin Palace
Abby Berlin
Wacker 04 Berlin
SD Croatia Berlin
Academy of Arts, Berlin
SC Tasmania 1900

In [2]:
from Levenshtein import distance as levenshtein_distance

In [35]:
class EntityLinker():
    def __init__(self):
        self.tagger = SequenceTagger.load('ner-fast')
        self.embedding = SentenceTransformerDocumentEmbeddings('bert-base-nli-mean-tokens')
        self.lookup_keys = ['label', 'resource', 'comment']
        
    def __split_overlap(self, seq, size, overlap):
        return [x for x in zip(*[seq[i::size-overlap] for i in range(size)])]
    
    def __lookup(self, phr, max_res = 10):
        res = requests.get(f'https://lookup.dbpedia.org/api/search?query={phr}&maxResults=10&format=JSON_RAW')
        docs = eval(res.text)['docs']
        return docs
    
    def __compute_relevance(self, phr, candidate_entity, question_embedding, alfa1=1, alfa2=1, alfa3=1):
        # TODO: compute importance
        # can we use the relevance or simply the rank of results from lookup?
        importance = 1
        
        # compute lev distance
        lev_distance = 1 / (levenshtein_distance(phr, candidate_entity['label'][0]) + 1)
        
        # compute relevance with doc embedding
        doc_entity_flair = Sentence(candidate_entity['comment'])
        self.embedding.embed(doc_entity_flair)
        cos_sim = 1 - spatial.distance.cosine(question_embedding, doc_entity_flair.embedding.tolist())
        
        score = alfa1 * importance + alfa2 * lev_distance + alfa3 * cos_sim
        return score
    
    def extract(self, question):
        sentence = Sentence(question)
        self.tagger.predict(sentence)
        
        entities = sentence.to_dict(tag_type='ner')['entities']
        entities = [entity['text'] for entity in entities]
        return entities
    
    def extend_entity(self, question, phr, max_len):
        tmp_question = question.replace(phr, 'ENTITY')
        # get question tokens
        question_tokens = tmp_question.split()
        # get position of current entity
        index = question_tokens.index('ENTITY')

        extended_entities = []

        for size in range(2, max_len+1):
            for group in self.__split_overlap(question_tokens, size, size-1):
                print(group)
                if 'ENTITY' in group:
                    extended_entities.append(' '.join(group).replace('ENTITY', phr))
        return extended_entities
    
    def __get_question_embedding(self, question):
        sentence = Sentence(question)
        self.embedding.embed(sentence)
        return sentence.embedding.tolist()
        
    
    def link(self, question, max_len = 3):
        linked_entities = []
        
        question_embedding = self.__get_question_embedding(question)
        
        # extract entities from question
        entity_phrases = self.extract(question)
        print(entity_phrases)
        
        for i, phr in enumerate(entity_phrases):
            candidate_entity_phrase = {'phr': phr, 'candidate_entity': None, 'score': 0}
            
            # extend extracted entities
            PX = self.extend_entity(question, phr, max_len)
            EC = []
            # look for candidate entities
            for phr_ext in PX:
                docs = self.__lookup(phr_ext)
                if len(docs) > 0:
                    EC.extend(docs)
            # compute relevances and keep highest relevance candidate entity
            for j, candidate_entity in enumerate(EC):
                if 'comment' in candidate_entity:
                    tmp_score = self.__compute_relevance(phr, candidate_entity, question_embedding)
                if tmp_score > candidate_entity_phrase['score']:
                    candidate_entity_phrase['candidate_entity'] = candidate_entity
                    candidate_entity_phrase['score'] = tmp_score

            linked_entities.append(candidate_entity_phrase)
                
        return linked_entities

In [36]:
entity_linker = EntityLinker()

2021-04-24 22:04:22,545 --------------------------------------------------------------------------------
2021-04-24 22:04:22,546 The model key 'ner-fast' now maps to 'https://huggingface.co/flair/ner-english-fast' on the HuggingFace ModelHub
2021-04-24 22:04:22,547  - The most current version of the model is automatically downloaded from there.
2021-04-24 22:04:22,548  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner-fast/en-ner-fast-conll03-v0.4.pt)
2021-04-24 22:04:22,548 --------------------------------------------------------------------------------
2021-04-24 22:04:22,886 loading file ..\..\data\flair\models\ner-english-fast\4c58e7191ff952c030b82db25b3694b58800b0e722ff15427f527e1631ed6142.e13c7c4664ffe2bbfa8f1f5375bd0dced866b8c1dd7ff89a6d705518abf0a611


In [38]:
entity_linker.link("Which band's former member are Kevin Jonas and Joe Jonas ?")

['Kevin Jonas', 'Joe Jonas']
('Which', "band's")
("band's", 'former')
('former', 'member')
('member', 'are')
('are', 'ENTITY')
('ENTITY', 'and')
('and', 'Joe')
('Joe', 'Jonas')
('Jonas', '?')
('Which', "band's", 'former')
("band's", 'former', 'member')
('former', 'member', 'are')
('member', 'are', 'ENTITY')
('are', 'ENTITY', 'and')
('ENTITY', 'and', 'Joe')
('and', 'Joe', 'Jonas')
('Joe', 'Jonas', '?')
('Which', "band's")
("band's", 'former')
('former', 'member')
('member', 'are')
('are', 'Kevin')
('Kevin', 'Jonas')
('Jonas', 'and')
('and', 'ENTITY')
('ENTITY', '?')
('Which', "band's", 'former')
("band's", 'former', 'member')
('former', 'member', 'are')
('member', 'are', 'Kevin')
('are', 'Kevin', 'Jonas')
('Kevin', 'Jonas', 'and')
('Jonas', 'and', 'ENTITY')
('and', 'ENTITY', '?')


[{'phr': 'Kevin Jonas',
  'candidate_entity': {'score': ['14227.122'],
   'refCount': ['27'],
   'resource': ['http://dbpedia.org/resource/Kevin_Jonas'],
   'redirectlabel': ['Paul Kevin Jonas Jr.',
    'Kevin Jonas/version 2',
    'Kevin Jonas (singer)',
    'Kevin jonas bio',
    'Paul Kevin Jonas',
    'Paul Kevin Jonas II'],
   'typeName': ['Person', 'Agent'],
   'comment': ["Paul Kevin Jonas II (born November 5, 1987) is an American musician, singer, songwriter, actor, contractor, dancer, and entrepreneur. He rose to fame as a member of the pop rock band, Jonas Brothers, alongside his younger brothers Joe and Nick. The group released their debut studio album It's About Time through the Columbia label in 2006, which failed to achieve commercial success. After signing with Hollywood Records, the group released their self-titled second studio album in 2007, which became their breakthrough record."],
   'label': ['Kevin Jonas'],
   'type': ['http://dbpedia.org/ontology/Person',
    'h