In [None]:
import requests
import re
import numpy as np
import pandas as pd
import json

In [None]:
# Service endpoints, one local port per pipeline stage.

# Biencoder (port 30300): sub-routes for mentions and entities.
biencoder = 'http://localhost:30300/api/blink/biencoder'
biencoder_mention = f'{biencoder}/mention'
biencoder_entity = f'{biencoder}/entity'

# Indexer (port 30301): sub-routes for searching and adding.
indexer = 'http://localhost:30301/api/indexer'
indexer_search = f'{indexer}/search'
indexer_add = f'{indexer}/add'

# Single-route services.
crossencoder = 'http://localhost:30302/api/blink/crossencoder'
nilpredictor = 'http://localhost:30303/api/nilprediction'
nilcluster = 'http://localhost:30305/api/nilcluster'

In [None]:
# Load the test mentions from a JSON-lines file (one record per line).
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
test_set_path = '/home/lsasso/test_unseen_mention/test.json'
data = pd.read_json(test_set_path, lines=True)

In [None]:
# Rename the dataset columns to the field names used in the request payloads further down.
column_aliases = {
    'left_context_text': 'context_left',
    'right_context_text': 'context_right',
    'word': 'mention',
}
data = data.rename(columns=column_aliases)

## Entity Linking

### Encoding

In [None]:
# Send every row of the frame, as one JSON record each, to the biencoder mention endpoint.
mention_records = data.to_dict(orient='records')
res_biencoder = requests.post(biencoder_mention, json=mention_records)

In [None]:
# On failure only report the response; on success store the returned encodings per row.
if not res_biencoder.ok:
    print('Biencoder ERROR')
    print(res_biencoder)
else:
    data['encoding'] = res_biencoder.json()['encodings']

In [None]:
# Report how many rows are in the frame.
print('Encoded {} entities.'.format(len(data)))

In [None]:
# Preview the first rows, now including the 'encoding' column on success.
data.head(n=5)

### Retrieval

In [None]:
# Search the index with all mention encodings, requesting 10 candidates each.
mention_encodings = data['encoding'].values.tolist()
body = {'encodings': mention_encodings, 'top_k': 10}
res_indexer = requests.post(indexer_search, json=body)

In [None]:
# On failure report the response and its JSON body; on success keep the parsed candidates.
if not res_indexer.ok:
    print('ERROR with the indexer.')
    print(res_indexer)
    print(res_indexer.json())
else:
    candidates = res_indexer.json()

In [None]:
# Sanity check: every mention must have received the same number of candidates.
if len(candidates) > 0 and len(candidates[0]) > 0:
    _top_k = len(candidates[0])
    assert all(len(_cand) == _top_k for _cand in candidates)
    print('Received {} candidates for all the {} entities.'.format(_top_k, len(candidates)))
else:
    print('No candidates received.')

In [None]:
# Attach the indexer output as a new column; `candidates` must have one entry per row of `data`.
data['candidates'] = candidates

In [None]:
# Preview the first rows, now including the 'candidates' column.
data.head(n=5)

### Crossencoder

In [None]:
# The crossencoder receives the mention with its contexts plus the candidates retrieved so far.
cross_samples = data[['context_left', 'context_right', 'mention']].to_dict(orient='records')
cross_payload = {
    'samples': cross_samples,
    'candidates': data['candidates'].tolist(),
}
res_cross = requests.post(crossencoder, json=cross_payload)

In [None]:
# Inspect the crossencoder response body, parsed as JSON (runs before the `ok` check below).
res_cross.json()

In [None]:
# Inspect the candidates column as it was sent to the crossencoder.
data['candidates']

In [None]:
# On failure report the response; on success move the previous candidates to 'candidates_bi'
# and store the crossencoder output under 'candidates'.
if not res_cross.ok:
    print('ERROR with the crossencoder.')
    print(res_cross)
    print(res_cross.json())
else:
    data = data.rename(columns={'candidates': 'candidates_bi'})
    data['candidates'] = res_cross.json()

In [None]:
# Display the frame (this renders the whole DataFrame, not only its head).
data

In [None]:
def prepare_for_nil_prediction(x):
    """Build the NIL-prediction inputs for one mention row.

    Args:
        x: Mapping-like row (e.g. a DataFrame row) providing ``'candidates'``
            and ``'mention'``. Only the first candidate is read; it is
            presumably the top-ranked one (confirm against the services).

    Returns:
        A tuple ``(is_nil, features)``.

        * No usable candidates (empty list, or a missing value such as
          ``None``/NaN that has no length): ``(True, {})``.
        * Otherwise ``is_nil`` is ``False`` and ``features`` contains, in this
          order: ``'max_bi'`` (the candidate's ``'score'`` when it is not a
          crossencoder candidate, or its ``'bi_score'`` when it is one and the
          key is present), ``'max_cross'`` (crossencoder candidates only),
          ``'mention'`` and ``'title'``.
    """
    c = x['candidates']

    # A missing candidate list (None/NaN) has no len(); treat it like an empty
    # list instead of letting a TypeError abort the whole DataFrame.apply.
    try:
        has_candidates = len(c) > 0
    except TypeError:
        has_candidates = False
    if not has_candidates:
        return True, {}

    top = c[0]
    features = {}
    if top.get('is_cross'):
        # Crossencoder candidate: 'score' is the cross score; the biencoder
        # score is only forwarded when the service included it.
        if 'bi_score' in top:
            features['max_bi'] = top['bi_score']
        features['max_cross'] = top['score']
    else:
        # Biencoder-only candidate: 'score' is the biencoder score.
        features['max_bi'] = top['score']

    features['mention'] = x['mention']
    features['title'] = top['title']

    return False, features

In [None]:
# Row-wise: each (is_nil, features) tuple is expanded into two columns.
nil_inputs = data.apply(prepare_for_nil_prediction, axis=1, result_type='expand')
data[['is_nil', 'nil_features']] = nil_inputs

In [None]:
# Preview the first rows, now including 'is_nil' and 'nil_features'.
data.head(n=5)

## NIL prediction

In [None]:
# Every mention starts with score 0, i.e. NIL by default, until the predictor says otherwise.
data['nil_score'] = np.zeros(len(data))

In [None]:
# Rows that still have a candidate, i.e. were not already flagged NIL for lack of candidates.
not_yet_nil = data.loc[data['is_nil'] == False]

In [None]:
# Score the mentions that still have a candidate with the NIL predictor.
#
# `nil_scores` is bound up front, one zero per pending row (matching the "default NIL"
# initialisation of the 'nil_score' column). Without this the name stays undefined when
# there is nothing to score or when the request fails, and the later assignment of
# `nil_scores` into `data` raises NameError (or reuses a stale value from an earlier run).
nil_scores = np.zeros(not_yet_nil.shape[0])
if not_yet_nil.shape[0] > 0:
    res_nilpredictor = requests.post(nilpredictor, json=not_yet_nil['nil_features'].values.tolist())
    if res_nilpredictor.ok:
        # TODO use cross if available
        nil_scores = np.array(res_nilpredictor.json()['nil_score_bi'])
    else:
        print('ERROR during NIL prediction')
        print(res_nilpredictor)
        # NOTE(review): .json() raises if the error body is not JSON; confirm the service's error format.
        print(res_nilpredictor.json())

In [None]:
# Write the predicted scores back onto exactly the rows that were sent to the predictor;
# `nil_scores` must hold one value per row of `not_yet_nil`, in the same order.
data.loc[not_yet_nil.index, 'nil_score'] = nil_scores

In [None]:
nil_threshold = 0.5
# A score strictly below the threshold marks the mention as NIL (this overwrites the earlier flag).
data['is_nil'] = data['nil_score'] < nil_threshold

In [None]:
# Preview the first rows with the final 'nil_score' and 'is_nil' values.
data.head(n=5)

In [None]:
# Count the mentions on each side of the NIL decision.
linked_count = (data['is_nil'] == False).sum()
nil_count = (data['is_nil'] == True).sum()
print('Estimated {} entities as NOT NIL'.format(linked_count))
print('Estimated {} entities as NIL'.format(nil_count))

In [None]:
# Title of the first candidate of every mention. A mention may have an empty candidate list
# (prepare_for_nil_prediction handles that case explicitly); indexing x[0] would raise
# IndexError there, so those rows get None instead of a title.
data['top_title'] = data['candidates'].apply(lambda x: x[0]['title'] if len(x) > 0 else None)

In [None]:
# Linked (not NIL) mentions next to the title of their first candidate.
data.loc[data['is_nil'] == False, ['mention', 'top_title']].head()

## Entity Clustering

In [None]:
# Only the mentions judged NIL are clustered.
nil_mentions = data.loc[data['is_nil'] == True]

In [None]:
# Send ids (the frame index), surface forms and encodings of the NIL mentions to the clustering service.
cluster_payload = {
    'ids': nil_mentions.index.tolist(),
    'mentions': nil_mentions['mention'].values.tolist(),
    'encodings': nil_mentions['encoding'].values.tolist(),
}
res_nilcluster = requests.post(nilcluster, json=cluster_payload)

In [None]:
# Report whether the clustering request succeeded.
if res_nilcluster.ok:
    print('OK')
else:
    print('NIL cluster ERROR')

In [None]:
# Turn the parsed clustering response into a DataFrame.
cluster_records = res_nilcluster.json()
clusters = pd.DataFrame(cluster_records)

In [None]:
# Order by the 'nelements' column, largest values first.
clusters = clusters.sort_values('nelements', ascending=False)

In [None]:
# TODO: take the types into account in the clustering

In [None]:
# Preview the first clusters after sorting.
clusters.head(n=5)

In [None]:
# Histogram of the 'nelements' column, 20 bins.
clusters['nelements'].plot.hist(bins=20)

In [None]:
# Compare the number of clusters with the number of NIL mentions that were clustered.
print('Found {} clusters out of {} NIL mentions.'.format(len(clusters), len(nil_mentions)))

In [None]:
# Persist the annotated mentions next to the notebook.
outdata = './outdata.pickle'
data.to_pickle(path=outdata)

In [None]:
# Persist the cluster table next to the notebook.
outclusters = './outclusters.pickle'
clusters.to_pickle(path=outclusters)