In [1]:
import json

In [2]:
data = None
with open("./sampleData.json", "r") as rfile:
    data = json.load(rfile)

In [3]:
data.keys()

dict_keys(['query', 'urls', 'dateTo', 'dateFrom', 'queryData', 'topk', 'isTest', 'success'])

In [4]:
base_url = "../rel/data"


In [5]:
from REL.mention_detection import MentionDetection
from REL.utils import process_results
from REL.entity_disambiguation import EntityDisambiguation
from REL.ner import Cmns, load_flair_ner

wiki_version = "wiki_2019"

In [6]:
def example_preprocessing():
    # user does some stuff, which results in the format below.
    text = "Obama will visit Germany. And have a meeting with Merkel tomorrow."
    processed = {"test_doc1": [text, []], "test_doc2": [text, []]}
    return processed

input_text = example_preprocessing()

In [7]:
mention_detection = MentionDetection(base_url, wiki_version)
tagger_ner = load_flair_ner("ner-fast")
tagger_ngram = Cmns(base_url, wiki_version, n=5)
mentions_dataset, n_mentions = mention_detection.find_mentions(input_text, tagger_ner)

2022-04-30 14:22:14,172 --------------------------------------------------------------------------------
2022-04-30 14:22:14,175 The model key 'ner-fast' now maps to 'https://huggingface.co/flair/ner-english-fast' on the HuggingFace ModelHub
2022-04-30 14:22:14,177  - The most current version of the model is automatically downloaded from there.
2022-04-30 14:22:14,178  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner-fast/en-ner-fast-conll03-v0.4.pt)
2022-04-30 14:22:14,179 --------------------------------------------------------------------------------
2022-04-30 14:22:14,345 loading file /home/sam/.flair/models/ner-english-fast/4c58e7191ff952c030b82db25b3694b58800b0e722ff15427f527e1631ed6142.e13c7c4664ffe2bbfa8f1f5375bd0dced866b8c1dd7ff89a6d705518abf0a611


In [15]:
mentions_dataset


{'test_doc1': [{'mention': 'Obama',
   'context': ('',
    'will visit Germany And have a meeting with Merkel tomorrow'),
   'candidates': [['Barack_Obama', 0.951],
    ['Obama,_Fukui', 0.119],
    ['Obama,_Nagasaki', 0.097],
    ['Obama_Station', 0.093],
    ['Presidency_of_Barack_Obama', 0.092],
    ['Michelle_Obama', 0.092],
    ['Obama_Domain', 0.092],
    ['Barack_Obama_Sr.', 0.091],
    ['Higashi-Obama_Station', 0.091],
    ['Obama_Castle_(Wakasa_Province)', 0.091],
    ['Obama_(surname)', 0.091],
    ['Newspaper_endorsements_in_the_2008_United_States_presidential_primaries',
     0.01701746529332736],
    ['Sonia_Sotomayor', 0.01141961486789073],
    ['Political_positions_of_Barack_Obama', 0.009255112703388566],
    ['Obama_Doctrine', 0.007389162561576354],
    ['Barack_Obama_judicial_appointment_controversies', 0.00664278250485147],
    ['Barack_Obama_Supreme_Court_candidates', 0.004030452306314375],
    ['Social_policy_of_the_Barack_Obama_administration', 0.00380653828929691],

In [9]:
config = {
    "mode": "eval",
    "model_path": base_url + "/ed-wiki-2019/lr_model.pkl",
}

model = EntityDisambiguation(base_url, wiki_version, config)
predictions, timing = model.predict(mentions_dataset)

Loading model from given path: ../rel/data/ed-wiki-2019/lr_model.pkl




In [10]:
predictions

{'test_doc1': [{'mention': 'Obama',
   'prediction': 'Barack_Obama',
   'candidates': ['Barack_Obama',
    'Obama,_Fukui',
    'Obama,_Nagasaki',
    'Obama_Station',
    'Michelle_Obama',
    'Family_of_Barack_Obama',
    '2008_United_States_presidential_election'],
   'conf_ed': 0.9312528153257645,
   'scores': ['0.4593951',
    '0.40141734',
    '0.39951593',
    '0.3990866',
    '0.41827792',
    '0.37923092',
    '0.34583545']},
  {'mention': 'Germany',
   'prediction': 'Germany',
   'candidates': ['Germany',
    'Germany_national_football_team',
    'Nazi_Germany',
    'German_Empire',
    "Germany_women's_national_football_team",
    'Weimar_Republic',
    'Same-sex_marriage_in_Germany'],
   'conf_ed': 0.5245728296743994,
   'scores': ['0.43033266',
    '0.3900633',
    '0.38722372',
    '0.38899815',
    '0.38888156',
    '0.38889247',
    '0.38826776']},
  {'mention': 'Merkel',
   'prediction': 'Angela_Merkel',
   'candidates': ['Angela_Merkel',
    'Max_Merkel',
    'Merkel,_

In [11]:
for doc in mentions_dataset:
    for mention in mentions_dataset[doc]:
        if mention["candidates"][0][1] > 0.5:
            print(mention["mention"], mention['prediction'])

KeyError: 'prediction'

In [21]:
def compute(state, recompute=False):

    print("computing named entities...")

    if not recompute and 'entities' in state:

        print("Entities already computed, using cached features")

        return False

    urls = list(state['raw'].keys())
    
    entity_index = dict({})
    
    for i,url in enumerate(urls):
        
        ner_input = dict([(i, [sent,[]]) for i, sent in enumerate(state['raw'][url])])
    
        ner_output, n_outputs = mention_detection.find_mentions(ner_input, tagger_ner)
        preds,_ = model.predict(ner_output)

        for sent in preds:
            for mention in preds[sent]:
                
                pred = mention['prediction']
                
                if pred not in entity_index:
                    entity_index[pred] = dict({})
                
                if url not in entity_index[pred]:
                    entity_index[pred][url] = []
                    
                entity_index[pred][url].append(sent)
     
    state['entities'] = entity_index

    return True
    
        

In [22]:
ents = compute(data['queryData'], recompute=True)

computing named entities...


AttributeError: 'list' object has no attribute 'add'

[('China', 70),
 ('Ukraine', 18),
 ('Russia', 12),
 ('India', 7),
 ('United_States', 6),
 ('Australia', 6),
 ('Germany', 6),
 ('Counties_of_China', 6),
 ('O._J._Simpson', 4),
 ('California', 3),
 ('Canada', 3),
 ('Alan_Blinder', 3),
 ('Larissa_Anderson', 3),
 ('Julia_Simon_(biathlete)', 3),
 ('England', 3),
 ('Stephen_Sondheim', 3),
 ('Los_Angeles', 3),
 ('Guy_Fawkes', 3),
 ('Onychomycosis', 3),
 ('Boris_Johnson', 3),
 ('1996_Summer_Olympics', 2),
 ('Syria', 2),
 ('John_Kirby_(admiral)', 2),
 ('Denmark', 2),
 ('Hunan', 2),
 ('Johnson_&amp;_Johnson', 2),
 ('Shaanxi', 2),
 ("Xi'an", 2),
 ('John_Beckman', 2),
 ('Americas', 2),
 ('Delhi', 1),
 ('Chandrakant_Lahariya', 1),
 ('Narendra_Modi', 1),
 ('Nirmala_Sitharaman', 1),
 ('Marie_Antoinette', 1),
 ('Europe', 1),
 ('Ireland', 1),
 ('George_Bonanno', 1),
 ('Tanzania', 1),
 ('Jen_Psaki', 1),
 ('Barack_Obama', 1),
 ('Rakeysh_Omprakash_Mehra', 1),
 ('Rosedale,_Toronto', 1),
 ('Korea', 1),
 ('Japan', 1),
 ('University_of_Oxford', 1),
 ('McGill_

In [40]:
def filter(state, urls, ent, threshold):

    urls = list(state['raw'].keys())
    
    out = [True]*len(urls)
    
    for i, url in enumerate(urls):
        
        if state['entities'][ent][url] < threshold:
            out[i] = False
            
    return out
    

In [None]:
def topk(state, docs, k=5):
    
    
    top_dict = dict({})
    
    for ent in state['entities']: 
    
        for doc in docs:

            if doc in state['entities'][ent]:
                
                if ent not in top_dict:
                    top_dict[ent] = 0
                    
                top_dict[ent] += state['entities'][ent][doc]
    
    return sorted(list(top_dict.values()), key = lambda a: a[1], reverse = True)


In [None]:
def compute(state):
    
    