In [75]:
import pandas as pd
import numpy as np
import requests

In [53]:
# Service endpoints (every service listens on localhost, each on its own port)

# Bi-encoder, with one route for mentions and one for entities
biencoder = 'http://localhost:30300/api/blink/biencoder'
biencoder_mention = biencoder + '/mention'
biencoder_entity = biencoder + '/entity'

# Cross-encoder
crossencoder = 'http://localhost:30302/api/blink/crossencoder'

# Indexer, with routes to search, to add and to reset the RW index
indexer = 'http://localhost:30301/api/indexer'
indexer_search = indexer + '/search'
indexer_add = indexer + '/add'
indexer_reset = indexer + '/reset/rw'

# NIL prediction and NIL clustering
nilpredictor = 'http://localhost:30303/api/nilprediction'
nilcluster = 'http://localhost:30305/api/nilcluster'

# Reset the new KB (RW index) to ensure it is empty

In [55]:
# Empty the RW index through the indexer's reset route before running the pipeline.
print('Resetting RW index...')
res_reset = requests.post(indexer_reset, data={})

if res_reset.ok:
    print('Reset done.')
else:
    print('ERROR while resetting!')
    print(res_reset)
    # Fail loudly: continuing with an index that was not reset would make
    # every later step run against leftover state.
    raise Exception('ERROR while resetting!')

Resetting RW index...
Reset done.


Load some data (e.g. the first batch of the dev set)

In [7]:
# Read the JSON-lines file (lines=True: one JSON record per line) into a DataFrame
data = pd.read_json('../incremental_dataset/dev/dev_0.jsonl', lines=True)

In [9]:
# Preview the first rows of the loaded batch
data.head()

Unnamed: 0,Wikipedia_ID,Wikipedia_title,Wikipedia_URL,left_context,mention_as_list,right_context,context_left,mention,context_right,y_category,...,docId,label_id,freebaseID,y_wikiurl_dump,query_id,p_formula,p_uniform,NIL,freq,batch
0,3871014,Rainbow,http://en.wikipedia.org/wiki/Rainbow,"[the, rainbow, '', from, the, wizard, of, oz, ...",[rainbow],"[,, a, symbol, of, good, ,, the, calm, after, ...","the Rainbow"" from the Wizard of Oz where Dorot...",Rainbow,", a symbol of good, the calm after the storm a...","[lucky symbols, atmospheric optical phenomena,...",...,5725443,http://hierroglyphic.blogspot.com/2010_09_01_a...,Some(9202a8c04000641f800000000038768c),https://en.wikipedia.org/wiki?curid=3871014,4206740,3.162278e-95,0.734306,False,378,0
1,24997985,Tortoiseshell cat,http://en.wikipedia.org/wiki/Tortoiseshell_cat,"[but, over, time, ,, this, characteristic, has...",[tortoiseshell],"[,, blue, ,, and, tabby, ., the, persian, is, ...","but over time, this characteristic has become ...",tortoiseshell,", blue, and tabby . The Persian is generally d...",[cat coat types],...,8618624,http://www.sulit.com.ph/index.php/view+classif...,Some(9202a8c04000641f80000000000785ee),https://en.wikipedia.org/wiki?curid=24997985,4227303,5.6234e-06,0.497273,False,21,0
2,43468,T. E. Lawrence,http://en.wikipedia.org/wiki/T._E._Lawrence,"[some, of, the, political, implications, of, t...","[lawrence, of, arabia]","[,, was, a, badass, ., this, much, is, evident...",some of the political implications of the ways...,Lawrence of Arabia,", was a badass. This much is evident from any ...","[british army general list officers, british a...",...,6853281,http://joelsbookshelf.blogspot.com/2012/08/rea...,Some(9202a8c04000641f80000000000570ba),https://en.wikipedia.org/wiki?curid=43468,4353903,0.01,0.633318,False,6,0
3,113519,Short (finance),http://en.wikipedia.org/wiki/Short_(finance),"[of, homes, and, their, list-and-sold, price.a...","[short, sales]","[,, not, a, regular, sale, ), status, address,...",of homes and their list-and-sold price.Â DOM ...,short sales,", not a regular sale) Status Address Bedroom, ...","[introductions, dutch inventions, 17th-century...",...,1557206,http://raymondong.com/,Some(9202a8c04000641f80000000000c73b4),https://en.wikipedia.org/wiki?curid=113519,2138095,5.623413e-106,0.976293,False,421,0
4,156045,M*A*S*H (TV series),http://en.wikipedia.org/wiki/M*A*S*H_(TV_series),"[watching, sporting, events, ,, season, finale...",[m*a*s*h],"[stood, as, the, most, watched, american, tv, ...","watching sporting events, season finales, and ...",M*A*S*H,stood as the most watched American TV broadcas...,"[american, television programs based on films,...",...,10409920,http://ustelevision.com/2011/05/18/what-does-t...,Some(9202a8c04000641f8000000000123a15),https://en.wikipedia.org/wiki?curid=156045,354180,0.3162278,0.760256,False,28,0


Select 50 not-NIL + 50 NIL mentions

In [46]:
# How many mentions to take from each class
number_not_nil = 50
number_nil = 50

# First rows of each class (per the boolean NIL column), not-NIL rows first
not_nil_part = data.query('~NIL').head(number_not_nil)
nil_part = data.query('NIL').head(number_nil)
selection = pd.concat([not_nil_part, nil_part])

In [47]:
# Sanity check: (rows, columns) of the selection
selection.shape

(100, 21)

Check how many NIL mentions should be clustered together (since they refer to the same out-of-KB entity)

In [48]:
# Count the selected NIL mentions per Wikipedia_ID; a count above 1 means
# several selected mentions share the same ID
selection.query('NIL')['Wikipedia_ID'].value_counts()

262831      5
4607980     4
21980       3
182494      3
145699      2
14900       2
43999623    2
30310       1
5869719     1
22316162    1
1926443     1
4565664     1
15389730    1
14508500    1
94834       1
7076247     1
425554      1
207750      1
23281       1
40359       1
262891      1
501582      1
19317904    1
978740      1
755645      1
43452       1
3185609     1
28387289    1
12639363    1
27705200    1
668081      1
2135896     1
45492       1
363196      1
1619786     1
4223084     1
Name: Wikipedia_ID, dtype: int64

In [49]:
# (rows, columns) of the selection right before encoding
selection.shape

(100, 21)

# Biencoder encode mentions

In [50]:
# Send every selected mention, with its left and right context, to the bi-encoder
mention_columns = ['mention', 'context_left', 'context_right']
mention_records = selection[mention_columns].to_dict(orient='records')
res_biencoder = requests.post(biencoder_mention, json=mention_records)

In [51]:
# Stop right away when the bi-encoder call failed
if not res_biencoder.ok:
    print('Biencoder ERROR')
    print(res_biencoder)
    raise Exception('Biencoder ERROR')

# Store the returned encodings next to the mentions
# (assumes the service returns one encoding per record, in request order)
selection['encoding'] = res_biencoder.json()['encodings']
print('Biencode OK')
print('Encoded {} entities.'.format(selection.shape[0]))

Biencode OK
Encoded 100 entities.


The `encoding` column has been added: it holds the base64-encoded vector representing each mention.

In [52]:
# Preview the new encoding column
selection[['encoding']].head()

Unnamed: 0,encoding
0,Y1J+PkX+ADw1DLO8hd7evFtK5L1xXSA9AlKgPCnUwTyp7D...
1,+4FqPeIoGj6bioQ+3jnJvVQpfr1r7qq96fFQPZ5IvD0dcR...
2,QhF3Po2lADyk5gs+11sVvXNHDb0Ts0c8oUe5vGqJwr1u+u...
3,MACiPZd4aD4suke8wZMfPEs5zb1YiBS+SZ5ZvOg2wD0mPW...
4,TUqpPh9cET4XdAQ+pVUlPl3rGT7PtSU+PmdpvswESTzFMQ...


# Retrieval with indexer

In [56]:
# Query the indexer with all mention encodings, asking for the top 10 candidates
mention_encodings = selection['encoding'].values.tolist()
body = {'encodings': mention_encodings, 'top_k': 10}
res_indexer = requests.post(indexer_search, json=body)

In [57]:
# Fail fast on an indexer error: otherwise `candidates` would be undefined
# (or left over from a previous run) when it is assigned below.
if not res_indexer.ok:
    print('ERROR with the indexer.')
    print(res_indexer)
    # Print the raw body: an error response is not guaranteed to be valid JSON
    print(res_indexer.text)
    raise Exception('ERROR with the indexer.')

candidates = res_indexer.json()
print('Indexer OK')

# Warn (without stopping) when nothing was retrieved
if len(candidates) == 0 or len(candidates[0]) == 0:
    print('No candidates received.')

# One candidate list per selected mention
selection['candidates'] = candidates

Indexer OK


In [72]:
# Show the first 3 candidates retrieved for the first selected mention
selection['candidates'].iloc[0][:3]

[{'raw_score': 322.6261901855469,
  'id': 849365,
  'wikipedia_id': 3871014,
  'title': 'Rainbow',
  'url': 'https://en.wikipedia.org/wiki?curid=3871014',
  'type_': None,
  'indexer': 10,
  'score': 83.38381958007812,
  'norm_score': 0.5085703801599667},
 {'raw_score': 327.61798095703125,
  'id': 331878,
  'wikipedia_id': 1026203,
  'title': 'Rainbows in mythology',
  'url': 'https://en.wikipedia.org/wiki?curid=1026203',
  'type_': None,
  'indexer': 10,
  'score': 80.88790893554688,
  'norm_score': 0.5350073800335146},
 {'raw_score': 330.5491943359375,
  'id': 35028,
  'wikipedia_id': 71079,
  'title': 'Rainbow flag',
  'url': 'https://en.wikipedia.org/wiki?curid=71079',
  'type_': None,
  'indexer': 10,
  'score': 79.42232513427734,
  'norm_score': 0.4614385006898517}]

# NIL prediction

In [63]:
def prepare_for_nil_prediction(x, mention='mention'):
    """
    Build the input of the NIL predictor for a single row.

    x: row-like mapping; x['candidates'] is the list of candidates and
       x[mention] is the mention text.
    mention: key under which the mention text is stored in x.

    Returns a pair (is_nil, features). With no candidates the pair is
    (True, {}). Otherwise is_nil is False and features holds the scores
    of the first candidate, the mention, the first candidate's title and
    the whole candidate list.
    """
    candidates = x['candidates']

    # Nothing to score against: flag as NIL immediately, with no features
    if len(candidates) == 0:
        return True, {}

    best = candidates[0]
    features = {}

    if 'is_cross' in best and best['is_cross']:
        # cross-encoder candidate: keep the bi-encoder score when available
        if 'bi_score' in best:
            features['max_bi'] = best['bi_score']
        features['max_cross'] = best['score']
    else:
        # bi-encoder only
        features['max_bi'] = best['score']

    features['mention'] = x[mention]
    features['title'] = best['title']
    features['topcandidates'] = candidates

    return False, features

In [64]:
# Row-wise call; result_type='expand' splits the returned (is_nil, features)
# pair into the two target columns
selection[['is_nil', 'nil_features']] = selection.apply(prepare_for_nil_prediction, axis=1, result_type='expand')

In [69]:
# Preview the prepared NIL-prediction inputs
selection[['is_nil', 'nil_features']].head()

Unnamed: 0,is_nil,nil_features
0,False,"{'max_bi': 83.38381958007812, 'mention': 'Rain..."
1,False,"{'max_bi': 84.1929931640625, 'mention': 'torto..."
2,False,"{'max_bi': 78.03597259521484, 'mention': 'Lawr..."
3,False,"{'max_bi': 83.60071563720703, 'mention': 'shor..."
4,False,"{'max_bi': 80.76179504394531, 'mention': 'M*A*..."


In [84]:
## NIL prediction
# initialize fields (default NIL): a score of 0 is below any positive threshold
selection['nil_score'] = np.zeros(selection.shape[0])
# rows that still have candidates and therefore need a prediction
not_yet_nil = selection.query('is_nil == False')

if not_yet_nil.shape[0] > 0:
    res_nilpredictor = requests.post(nilpredictor, json=not_yet_nil['nil_features'].values.tolist())
    if not res_nilpredictor.ok:
        print('ERROR during NIL prediction')
        print(res_nilpredictor)
        # Print the raw body: an error response is not guaranteed to be valid JSON
        print(res_nilpredictor.text)
        # Fail fast instead of continuing with undefined (or stale) scores
        raise Exception('ERROR during NIL prediction')
    print('NIL pred OK')
    nil_scores_bi = np.array(res_nilpredictor.json()['nil_score_bi'])
    # Write the scores only when they were actually computed
    selection.loc[not_yet_nil.index, 'nil_score'] = nil_scores_bi
else:
    # No row has candidates: every mention keeps the default score (NIL)
    print('ERROR. Probably the KB is emtpy')

nil_threshold = 0.5
# if below threshold --> is NIL
selection['is_nil'] = selection['nil_score'] < nil_threshold

print('Estimated {} entities as NOT NIL'.format(selection.eval('is_nil == False').sum()))
print('Estimated {} entities as NIL'.format(selection.eval('is_nil == True').sum()))

NIL pred OK
Estimated 86 entities as NOT NIL
Estimated 14 entities as NIL


In [82]:
# Mentions predicted as NOT NIL (is_nil is False), next to the gold NIL label
selection.query('~is_nil')[["Wikipedia_title", "mention", "NIL", "is_nil"]].head()

Unnamed: 0,Wikipedia_title,mention,NIL,is_nil
0,Rainbow,Rainbow,False,False
1,Tortoiseshell cat,tortoiseshell,False,False
3,Short (finance),short sales,False,False
4,M*A*S*H (TV series),M*A*S*H,False,False
5,Limbo,limbo,False,False


In [83]:
# Mentions predicted as NIL (is_nil is True), next to the gold NIL label
selection.query('is_nil')[["Wikipedia_title", "mention", "NIL", "is_nil"]].head()

Unnamed: 0,Wikipedia_title,mention,NIL,is_nil
2,T. E. Lawrence,Lawrence of Arabia,False,True
37,"Thebes, Greece",Thebes,False,True
41,DSM-IV codes,DSM-IV,False,True
43,Myra Breckinridge (film),Myra Breckinridge,False,True
1183,Arcadia Group,arcadia,True,True


# NIL Clustering

In [87]:
## Entity Clustering
# Only the mentions predicted as NIL are sent to the clustering service
nil_mentions = selection.query('is_nil == True')

res_nilcluster = requests.post(nilcluster, json={
        'ids': nil_mentions.index.tolist(),
        'mentions': nil_mentions["mention"].values.tolist(),
        'encodings': nil_mentions['encoding'].values.tolist()
    })

if not res_nilcluster.ok:
    print('NIL cluster ERROR')
    print(res_nilcluster)
    # Fail fast: parsing an error body as clusters would only produce a
    # misleading secondary error further down
    raise Exception('NIL cluster ERROR')

print('NIL cluster OK')

clusters = pd.DataFrame(res_nilcluster.json())

# visualize big clusters first 
clusters = clusters.sort_values(by='nelements', ascending=False)

NIL cluster OK


In [88]:
# Largest clusters first (sorted by nelements, descending)
clusters.head()

Unnamed: 0,title,nelements,mentions_id,mentions,center
7,Miami Vice,5,"[3842, 4281, 4708, 5163, 8710]","[Miami Vice, Miami Vice, Miami Vice, Miami Vic...",69xJPhww1jsoZmy9kIqSPgLgvbxekgk+gGaLvkxoFL1Vv9...
5,Lawrence of Arabia,2,"[2, 4715]","[Lawrence of Arabia, Lawrence of Arabia]",QhF3Po2lADyk5gs+11sVvXNHDb0Ts0c8oUe5vGqJwr1u+u...
0,DSM-IV,1,[41],[DSM-IV],jTKIPtQyOT0ViYw9b2oMvtA03r1xCeG8JoorvlaS+b3eGU...
1,Thebes,1,[37],[Thebes],lfWtPdLcdD1B2wQ+kh+Qu0DcpL0GjzI9HR4hvgldzb1zvF...
2,arcadia,1,[1183],[arcadia],EXcdPjrcYj360be9WuifvUk0T72bp8W98yKFvI/KMT0H7c...


# Add New entities to the New KB

In [89]:
# One new entity per cluster: its title plus the cluster center, sent as 'encoding'
selection_new = clusters[['title', 'center']].rename(columns={'center': 'encoding'})
res_indexer_add = requests.post(indexer_add, json=selection_new.to_dict(orient='records'))

if not res_indexer_add.ok:
    print('error adding new entities')
    print(res_indexer_add)
    # Fail fast: without the returned ids the clusters cannot be tied to the index
    raise Exception('error adding new entities')

print('new entities added correctly.')
new_indexed = res_indexer_add.json()
# Keep the identifiers assigned by the indexer next to each cluster
# (assumes 'ids' follows the order of the submitted records - TODO confirm)
clusters['index_id'] = new_indexed['ids']
clusters['index_indexer'] = new_indexed['indexer']

new entities added correctly.


At this point, these new entities can be retrieved in the indexer step, so that subsequent documents that mention them can be linked.