In [2]:
import requests
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns

from tqdm.notebook import tqdm

# import REL 

# from http.server import HTTPServer

from REL.entity_disambiguation import EntityDisambiguation
from REL.mention_detection import MentionDetection 
from REL.ner import Cmns, load_flair_ner
from REL.utils import process_results
# from REL.server import make_handler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

In [3]:
def preprocessing(row: pd.Series) -> dict:
    processed = {
        'title': [row['title'], []], 
        'article': [row['article'], []]
        }
    return processed

def find_mentions(text, mention_detection, tagger_ner):
    mentions_dataset, n_mentions = mention_detection.find_mentions(text, tagger_ner)
    return mentions_dataset

def disambiguate_entities(mentions_dataset, entity_disambiguation):
    predictions, timing = entity_disambiguation.predict(mentions_dataset)
    return predictions

def main(dataset, base_url, wiki_version, model_alias, tagger, df_name):

    plug = [{
        'mention': 'None',
        'context': ('None'),
        'candidates': [['', 0.0]],
        'gold': [''],
        'pos': 0,
        'sent_idx': 0,
        'ngram': '',
        'end_pos': 0,
        'sentence': '',
        'conf_md': 0.0,
        'tag': '',
        'sent': ''
        }]

    mention_detection = MentionDetection(base_url, wiki_version)
    tagger = tagger
    
    config = {
        "mode": "eval",
        "model_path": model_alias,
    }
    entity_disambiguation = EntityDisambiguation(base_url, wiki_version, config)

    annotated_articles = []
    
    for index, row in tqdm(dataset.iterrows()):

        processed_row = preprocessing(row)

        # Perform mention detection on headline and body text
        mentions_dataset = find_mentions(processed_row, mention_detection, tagger)
        
        # Insert plug if title or article mentions are empty
        if not mentions_dataset['title']:
            mentions_dataset['title'] = plug
        if not mentions_dataset['article']:
            mentions_dataset['article'] = plug

        # Disambiguate detected mentions
        mentions_disambiguated = disambiguate_entities(mentions_dataset, entity_disambiguation)
        result = process_results(mentions_dataset, mentions_disambiguated, processed_row)

        # Filter mentions with the ORG tag
        headline_mentions = [mention for mention in result['title'] if mention[-1] == 'ORG']
        body_text_mentions = [mention for mention in result['article'] if mention[-1] == 'ORG']

        # Check if any named entities were found in the headline
        if not headline_mentions or not mentions_disambiguated['title']:
            continue

        # Mark salient entities
        salient_entities = []
        for body_entity in body_text_mentions:
            if body_entity[3] in [headline_entity[3] for headline_entity in headline_mentions]:
                salient_entities.append(body_entity)

        # Check if any named entities were found in the body text
        if not body_text_mentions or not mentions_disambiguated['article']:
            continue
        
        salient_entities_list = list(set([entity[3] for entity in salient_entities]))
        # Save the annotated article
        annotated_articles.append({
            'headline': processed_row['title'][0],
            'body_text': processed_row['article'][0],
            'headline_mentions': headline_mentions,
            'body_text_mentions': body_text_mentions,
            'salient_entities': salient_entities,
            'salient_entities_list': salient_entities_list,
            
        })

        # Create a new DataFrame from the annotated_articles list
        annotated_articles_df = pd.DataFrame(annotated_articles)
    
    return annotated_articles_df

    # Save the annotated articles as a CSV file
    annotated_articles_df.to_csv(df_name, index=False)

In [4]:
base_url = '/home/ec2-user/environments/styx_env/styx/data/REL/'
wiki_version = 'wiki_2019'
model_alias = 'ed-wiki-2019'

In [7]:
df = pd.read_csv('../data/external/concatenated/train_news_raw.csv')

In [8]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True).head(100)
df.head()

Unnamed: 0,title,article
0,House Intelligence Committee Democratic memo d...,"February 24, 2018 / 9:39 PM / Updated 10 minut..."
1,BRIEF-Solon Eiendom: UFI To Hold 4.8 Pct In Co,April 9 (Reuters) - SOLON EIENDOM ASA:\n* UFI ...
2,Olympics-Speedskating-Women's team pursuit 6 l...,Feb 21 (Gracenote) - Olympic speedskating wome...
3,Paulson says backs strong U.S. dollar,WASHINGTON (Reuters) - U.S. Treasury Secretar...
4,"Hope Hicks, Trump’s Communications Director, t...","WASHINGTON—Hope Hicks, White House communicati..."


In [9]:
df[df['title'].isna()]

Unnamed: 0,title,article


In [10]:
df[df['article'].isna()]

Unnamed: 0,title,article


In [11]:
tagger_ner = load_flair_ner("ner-fast")
tagger_ngram = Cmns(base_url, wiki_version, n=5)

2023-04-20 07:32:42,111 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [12]:
%%time

t = main(df, base_url, wiki_version, model_alias, tagger_ner, '../data/interim/annotated_articles.csv')

Loading model from given path: /home/ec2-user/.rel_cache/ed-wiki-2019/model


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


0it [00:00, ?it/s]

CPU times: user 7min 30s, sys: 37.3 s, total: 8min 7s
Wall time: 1min 39s


In [13]:
t.head()

Unnamed: 0,headline,body_text,headline_mentions,body_text_mentions,salient_entities,salient_entities_list
0,House Intelligence Committee Democratic memo d...,"February 24, 2018 / 9:39 PM / Updated 10 minut...","[(0, 28, House Intelligence Committee, United_...","[(53, 28, House Intelligence Committee, United...","[(53, 28, House Intelligence Committee, United...",[United_States_House_Permanent_Select_Committe...
1,BRIEF-Solon Eiendom: UFI To Hold 4.8 Pct In Co,April 9 (Reuters) - SOLON EIENDOM ASA:\n* UFI ...,"[(21, 3, UFI, Unión_del_Fútbol_del_Interior, 0...","[(41, 3, UFI, Unión_del_Fútbol_del_Interior, 0...","[(41, 3, UFI, Unión_del_Fútbol_del_Interior, 0...",[Unión_del_Fútbol_del_Interior]
2,Exclusive: ECB eyes seeking capital hike -c.ba...,FRANKFURT (Reuters) - The European Central Ba...,"[(11, 3, ECB, European_Central_Bank, 0.4868450...","[(27, 21, European Central Bank, European_Cent...","[(27, 21, European Central Bank, European_Cent...",[European_Central_Bank]
3,Honeywell sees 2011 sales up about 5 percent,BOSTON (Reuters) - Honeywell International In...,"[(0, 9, Honeywell, Honeywell, 0.68687716108419...","[(20, 27, Honeywell International Inc, Honeywe...","[(20, 27, Honeywell International Inc, Honeywe...",[Honeywell]
4,SES Appoints John-Paul Hemingway as CEO of SES...,LUXEMBOURG--(BUSINESS WIRE)-- SES announced to...,"[(0, 3, SES, SES_S.A., 0.5685978373761983, 0.5...","[(30, 3, SES, Amazon_Web_Services, 0.473384511...","[(1113, 3, SES, SES_S.A., 0.5744543030627711, ...",[SES_S.A.]


In [14]:
t[~t['salient_entities_list'].astype(bool)]

Unnamed: 0,headline,body_text,headline_mentions,body_text_mentions,salient_entities,salient_entities_list
9,"Zale CEO, others quit amid losses; shares slide",NEW YORK (Reuters) - Struggling jewelry chain...,"[(0, 4, Zale, Žale, 0.8435079338451078, 0.8575...","[(47, 9, Zale Corp, Zale_Corporation, 0.387277...",[],[]
11,Russian lawmaker says Russia should halt space...,"MOSCOW (Reuters) - A Russian senator, Viktor B...","[(70, 3, RIA, Bulgaria, 0.341671487824356, 0.9...","[(164, 3, RIA, RIA_Novosti, 0.3399204431765785...",[],[]
15,"Inside Trump's new VA office, moves to help wh...",Dan Martin is chief engineer for Veterans Affa...,"[(19, 2, VA, Lau_Cheok_Vá, 0.27553461929906237...","[(33, 16, Veterans Affairs, United_States_Depa...",[],[]
20,French and Benelux stocks-Factors to watch on ...,Feb 15 (Reuters) - Below are company-related n...,"[(11, 7, Benelux, Benelux, 0.38009321921795963...","[(172, 3, CAC, CAC_40, 0.33621852879410874, 0....",[],[]
24,Japan Noda warns on debt pile in budget battle,TOKYO (Reuters) - Japan's government cannot c...,"[(6, 4, Noda, Noda,_Azerbaijan, 0.730722266701...","[(163, 16, Democratic Party, Democratic_Party_...",[],[]
26,Legg Mason unit eyes distressed sales: report,"(Reuters) - Permal Investment Management, the ...","[(0, 10, Legg Mason, Legg_Mason, 0.38727776780...","[(116, 4, LM.N, ISO_4217, 0.3631330181905939, ...",[],[]


In [15]:
t[~t['salient_entities_list'].astype(bool)].iloc[1, 0]

'Russian lawmaker says Russia should halt space cooperation with U.S.: RIA'

In [16]:
t[~t['salient_entities_list'].astype(bool)].iloc[1, 1]

'MOSCOW (Reuters) - A Russian senator, Viktor Bondarev, said that for the sake of national security Russia should halt its space cooperation with the United States, RIA news agency reported on Friday.\nEarlier on Friday, lawmakers proposed a ban on U.S. imports in response to a new set of U.S. sanctions imposed on Russia.\nThe lower house of parliament is expected to consider draft legislation on the matter next week.\nReporting by Gabrielle Tétrault-Farber; Writing by Maria Tsvetkova; Editing by Hugh Lawson\n '

In [17]:
t[~t['salient_entities_list'].astype(bool)].iloc[1, 2]

[(70, 3, 'RIA', 'Bulgaria', 0.341671487824356, 0.9108534455299377, 'ORG')]

In [18]:
t[~t['salient_entities_list'].astype(bool)].iloc[1, 3]

[(164,
  3,
  'RIA',
  'RIA_Novosti',
  0.33992044317657855,
  0.9998592138290405,
  'ORG')]

In [19]:
t[~t['salient_entities_list'].astype(bool)].iloc[1, 4]

[]

In [None]:
mention_detection = MentionDetection(base_url, wiki_version)

In [None]:
mentions_dataset, n_mentions = mention_detection.find_mentions(input_text, tagger_ner)

In [None]:
mentions_dataset['article'][0]

{'mention': 'Salesforce',
 'context': ('',
  'CEO Marc Benioff told CNBC s Jim Cramer that face masks can stop the coronavirus crisis in the U S in just three weeks The two are sponsoring a mask design challenge'),
 'candidates': [['Salesforce.com', 1.0],
  ['Salesforce_Tower', 0.0007209805335255948],
  ['Salesforce_Marketing_Cloud', 0.0007209805335255948]],
 'gold': ['NONE'],
 'pos': 0,
 'sent_idx': 0,
 'ngram': 'Salesforce',
 'end_pos': 10,
 'sentence': 'Salesforce CEO Marc Benioff told CNBC\'s Jim Cramer that face masks can stop the coronavirus crisis in the U.S. in "just three weeks."',
 'conf_md': 0.9921179413795471,
 'tag': 'ORG'}

In [55]:
mentions_dataset

{'title': [{'mention': 'Salesforce',
   'context': ('',
    's Marc Benioff Face masks can end the U S coronavirus crisis within weeks'),
   'candidates': [['Salesforce.com', 1.0],
    ['Salesforce_Tower', 0.0007209805335255948],
    ['Salesforce_Marketing_Cloud', 0.0007209805335255948]],
   'gold': ['NONE'],
   'pos': 0,
   'sent_idx': 0,
   'ngram': 'Salesforce',
   'end_pos': 10,
   'sentence': "Salesforce's Marc Benioff: Face masks can end the U.S. coronavirus crisis within weeks",
   'conf_md': 0.9846004843711853,
   'tag': 'ORG'},
  {'mention': 'Marc Benioff',
   'context': ('Salesforce s',
    'Face masks can end the U S coronavirus crisis within weeks'),
   'candidates': [['Marc_Benioff', 1.0]],
   'gold': ['NONE'],
   'pos': 13,
   'sent_idx': 0,
   'ngram': 'Marc Benioff',
   'end_pos': 25,
   'sentence': "Salesforce's Marc Benioff: Face masks can end the U.S. coronavirus crisis within weeks",
   'conf_md': 0.9935024380683899,
   'tag': 'PER'},
  {'mention': 'U.S.',
   'conte

In [172]:
mentions_dataset['title']

[{'mention': 'None',
  'context': 'None',
  'candidates': [['', 0.0]],
  'gold': [''],
  'pos': 0,
  'sent_idx': 0,
  'ngram': '',
  'end_pos': 0,
  'sentence': '',
  'conf_md': 0.0,
  'tag': '',
  'sent': ''}]

In [173]:
for mention in mentions_dataset['title']:
    print(mention['mention'], mention['tag'], mention['candidates'][0])

None  ['', 0.0]


In [56]:
config = {
    "mode": "eval",
    "model_path": model_alias,
}

model = EntityDisambiguation(base_url, wiki_version, config)
predictions, timing = model.predict(mentions_dataset)

Loading model from given path: /home/ec2-user/.rel_cache/ed-wiki-2019/model


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [57]:
predictions

{'title': [{'mention': 'Salesforce',
   'prediction': 'Salesforce.com',
   'candidates': ['Salesforce.com',
    'Salesforce_Tower',
    'Salesforce_Marketing_Cloud',
    '#UNK#',
    '#UNK#',
    '#UNK#',
    '#UNK#'],
   'conf_ed': 0.4634936624090754,
   'scores': ['0.4274973',
    '0.3273192',
    '0.32570845',
    '-0.22848086',
    '-0.22848086',
    '-0.22848086',
    '-0.22848086']},
  {'mention': 'Marc Benioff',
   'prediction': 'Marc_Benioff',
   'candidates': ['Marc_Benioff',
    '#UNK#',
    '#UNK#',
    '#UNK#',
    '#UNK#',
    '#UNK#',
    '#UNK#'],
   'conf_ed': 0.3872777678067984,
   'scores': ['0.42387587',
    '-0.22848086',
    '-0.22848086',
    '-0.22848086',
    '-0.22848086',
    '-0.22848086',
    '-0.22848086']},
  {'mention': 'U.S.',
   'prediction': 'United_States',
   'candidates': ['United_States',
    'United_States_Reports',
    'Billboard_Hot_100',
    'Billboard_200',
    'American_English',
    'United_States_customary_units',
    'Dance_Club_Songs'],
 

In [154]:
for mention in predictions['title']:
    print(mention['mention'], mention['prediction'])

None 


In [116]:
predictions['title'][0]

{'mention': 'IQ',
 'prediction': 'Intelligence_quotient',
 'candidates': ['Intelligence_quotient',
  'IQ_(band)',
  'IQ_and_the_Wealth_of_Nations',
  'Social_intelligence',
  'Mensa_International',
  'Race_and_intelligence',
  'The_Bell_Curve'],
 'conf_ed': 0.49680434174401916,
 'scores': ['0.42904449',
  '0.39379632',
  '0.38732094',
  '0.38664663',
  '0.39317554',
  '0.3453135',
  '0.33733743']}

In [58]:
from REL.utils import process_results

In [59]:
result = process_results(mentions_dataset, predictions, input_text)

In [60]:
result['title']

[(0,
  10,
  'Salesforce',
  'Salesforce.com',
  0.4634936624090754,
  0.9846004843711853,
  'ORG'),
 (13,
  12,
  'Marc Benioff',
  'Marc_Benioff',
  0.3872777678067984,
  0.9935024380683899,
  'PER'),
 (50,
  4,
  'U.S.',
  'United_States',
  0.8679714152362863,
  0.9989217519760132,
  'LOC')]

In [35]:
result['test_doc2']

[(1,
  9,
  'Mad Money',
  'Mad_Money',
  0.9396388478212766,
  0.6864858865737915,
  'MISC'),
 (17,
  10,
  'Jim Cramer',
  'Jim_Cramer',
  0.9106089897957748,
  0.9995617270469666,
  'PER')]

In [None]:
def example_preprocessing(row: pd.Series) -> dict:
    processed = {
        'test_doc1': [row['title'], []], 
        'test_doc2': [row['article'], []]
        }
    return processed

In [44]:
n = 5
test = df.head(42).copy()

for g, data in test.groupby(np.arange(len(test)) // n):
    processed = 
    print(data['title'].values)

['Jim Cramer: A better way to invest in the Covid-19 vaccine gold rush'
 "Cramer's lightning round: I would own Teradyne"
 "Cramer's week ahead: Big week for earnings, even bigger week for vaccines"
 'IQ Capital CEO Keith Bliss says tech and healthcare will rally'
 "Wall Street delivered the 'kind of pullback I've been waiting for,' Jim Cramer says"]
["Cramer's lightning round: I would just stay long Wex"
 "Acorns CEO: Parents can turn $5 into five figures for their kids through 'power of compounding'"
 'Dividend cuts may mean rethinking your retirement income strategy'
 'StockX has authenticated 1 million Jordan sneakers this year, CEO says'
 'Biohaven Pharmaceuticals lands Khloe Kardashian as influencer of new migraine drug']
["Cramer's lightning round: I like Beyond Meat"
 'Cramer: We desperately need another round of federal stimulus, despite vaccine progress'
 "Cramer's lightning round: Buy more VMware"
 "Wall Street did something 'highly unusual' in Tuesday's session, Jim Cramer 

In [None]:
n = 400
for g, df in test.groupby(np.arange(len(test)) // n):
    print(df.shape)

@Kimchi You can get it from the groupby object (test.groupby(np.arange(len(test)) // n)). If you assign that to a variable, you can then call .groups() to get a dictionary of {group id:[dataframe index ids]}. You can then use the group ids to retrieve each dataframe by calling .get_group(group_id) on the groupby object

In [4]:
config = {
    "mode": "eval",
    "model_path": "ed-wiki-2019",  # or alias, see also tutorial 7: custom models
}

model = EntityDisambiguation(
    base_url, 
    wiki_version, 
    config)

Loading model from given path: /home/ec2-user/.rel_cache/ed-wiki-2019/model


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Using Flair:
tagger_ner = load_flair_ner("ner-fast")

# Alternatively, using n-grams:
tagger_ngram = Cmns(base_url, wiki_version, n=5)

2023-04-05 07:53:10,329 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [7]:
server_address = ("127.0.0.1", 1235)
server = HTTPServer(
    server_address,
    make_handler(
        base_url, wiki_version, model, tagger_ner
    ),
)

try:
    print("Ready for listening.")
    server.serve_forever()
except KeyboardInterrupt:
    exit(0)

Ready for listening.


In [None]:
import requests

IP_ADDRESS = "http://localhost"
PORT = "1235"
text_doc = "If you're going to try, go all the way - Charles Bukowski"

document = {
    "text": text_doc,
    "spans": [],  # in case of ED only, this can also be left out when using the API
}

API_result = requests.post("{}:{}".format(IP_ADDRESS, PORT), json=document).json()


In [5]:
from REL.utils import process_results 

In [6]:
def example_preprocessing():
    # user does some stuff, which results in the format below.
    text = "Obama will visit Germany. And have a meeting with Merkel tomorrow."
    processed = {"test_doc1": [text, []], "test_doc2": [text, []]}
    return processed

input_text = example_preprocessing()

In [7]:
input_text

{'test_doc1': ['Obama will visit Germany. And have a meeting with Merkel tomorrow.',
  []],
 'test_doc2': ['Obama will visit Germany. And have a meeting with Merkel tomorrow.',
  []]}

In [17]:
mention_detection = MentionDetection(base_url, wiki_version)
tagger_ner = load_flair_ner("ner-fast")
tagger_ngram = Cmns(base_url, wiki_version, n=5)

2023-04-05 08:23:35,067 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [18]:
mentions_dataset, n_mentions = mention_detection.find_mentions(input_text, tagger_ner)

In [20]:
mentions_dataset['test_doc1']

[{'mention': 'Obama',
  'context': ('',
   'will visit Germany And have a meeting with Merkel tomorrow'),
  'candidates': [['Barack_Obama', 0.951],
   ['Obama,_Fukui', 0.119],
   ['Obama,_Nagasaki', 0.097],
   ['Obama_Station', 0.093],
   ['Presidency_of_Barack_Obama', 0.092],
   ['Michelle_Obama', 0.092],
   ['Obama_Domain', 0.092],
   ['Barack_Obama_Sr.', 0.091],
   ['Higashi-Obama_Station', 0.091],
   ['Obama_Castle_(Wakasa_Province)', 0.091],
   ['Obama_(surname)', 0.091],
   ['Newspaper_endorsements_in_the_2008_United_States_presidential_primaries',
    0.01701746529332736],
   ['Sonia_Sotomayor', 0.01141961486789073],
   ['Political_positions_of_Barack_Obama', 0.009255112703388566],
   ['Obama_Doctrine', 0.007389162561576354],
   ['Barack_Obama_judicial_appointment_controversies', 0.00664278250485147],
   ['Barack_Obama_Supreme_Court_candidates', 0.004030452306314375],
   ['Social_policy_of_the_Barack_Obama_administration', 0.00380653828929691],
   ['Foreign_policy_of_the_Barack_

In [10]:
config = {
    "mode": "eval",
    "model_path": "ed-wiki-2019",
}

model = EntityDisambiguation(base_url, wiki_version, config)
predictions, timing = model.predict(mentions_dataset)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading model from given path: /home/ec2-user/.rel_cache/ed-wiki-2019/model


In [11]:
predictions

{'test_doc1': [{'mention': 'Obama',
   'prediction': 'Barack_Obama',
   'candidates': ['Barack_Obama',
    'Obama,_Fukui',
    'Obama,_Nagasaki',
    'Obama_Station',
    'Michelle_Obama',
    'Family_of_Barack_Obama',
    '2008_United_States_presidential_election'],
   'conf_ed': 0.9312528153257645,
   'scores': ['0.4593951',
    '0.40141732',
    '0.3995159',
    '0.3990866',
    '0.4182779',
    '0.37923098',
    '0.34583557']},
  {'mention': 'Germany',
   'prediction': 'Germany',
   'candidates': ['Germany',
    'Germany_national_football_team',
    'Nazi_Germany',
    'German_Empire',
    "Germany_women's_national_football_team",
    'Weimar_Republic',
    'Same-sex_marriage_in_Germany'],
   'conf_ed': 0.5245728296743994,
   'scores': ['0.43033266',
    '0.39006326',
    '0.38722378',
    '0.38899815',
    '0.3888815',
    '0.38889253',
    '0.38826782']},
  {'mention': 'Merkel',
   'prediction': 'Angela_Merkel',
   'candidates': ['Angela_Merkel',
    'Max_Merkel',
    'Merkel,_Te

In [6]:
os.path.join(base_url, wiki_version, "generated")

'/home/ec2-user/environments/styx_env/styx/data/REL/wiki_2019/generated'

In [None]:
config = {
    "mode": "train",
    "model_path": "{}/{}/generated/model".format(
        base_url, wiki_version
    ),
}
model = EntityDisambiguation(base_url, wiki_version, config)

In [37]:
API_URL = "https://rel.cs.ru.nl/api"

In [59]:
text_doc = "If you're going to try, go all the way - Charles Bukowski"

# Example EL.
el_result = requests.post(API_URL, json={
    "text": text_doc,
    "spans": []
}).json()

# Example ED.
ed_result = requests.post(API_URL, json={
    "text": text_doc,
    "spans": [(40, 16)]
}).json()

In [60]:
el_result

[[41,
  16,
  'Charles Bukowski',
  'Charles_Bukowski',
  0.9535556493571484,
  0.988037109375,
  'PER']]

In [61]:
ed_result

[[40,
  16,
  ' Charles Bukowsk',
  'Charles_Bukowski',
  0.38727825954758405,
  0.0,
  'NULL']]

In [7]:
df.shape

(448306, 2)

In [8]:
df.loc[0, 'title']

'Jim Cramer: A better way to invest in the Covid-19 vaccine gold rush'

In [9]:
%%time
requests.post(API_URL, json={
    "text": df.loc[0, 'title'],
    "spans": []
}).json()

CPU times: user 87.9 ms, sys: 0 ns, total: 87.9 ms
Wall time: 4.97 s


[[0,
  10,
  'Jim Cramer',
  'Jim_Cramer',
  0.6775981905919447,
  0.95093634724617,
  'PER']]

In [2]:
el_result

[[41,
  16,
  'Charles Bukowski',
  'Charles_Bukowski',
  0.9535556493571484,
  0.988037109375,
  'PER']]

In [3]:
ed_result

[[41,
  16,
  'Charles Bukowski',
  'Charles_Bukowski',
  0.9535556493571484,
  0.0,
  'NULL']]