# BERT Embeddings as a service

A BERT-as-a-service instance is available for you to use in this project.
The client exposes all the methods you need.

In [1]:
import ElasticSearchSimpleAPI as es
import numpy as np

from bert_serving.client import BertClient

from sklearn.metrics.pairwise import cosine_similarity
from pandas.io.json import json_normalize
import pandas as pd

import numpy as np
import TRECCASTeval as trec


# Create test bed object
# Evaluation helper from the local TRECCASTeval module; the cells below call
# test_bed.eval(ranking, topic_turn_id) and unpack P@10, recall and NDCG from it.
test_bed = trec.ConvSearchEvaluation()

# Elastic Search API
# Wrapper from the local ElasticSearchSimpleAPI module; the cells below call
# elastic.search_body(query, 10) to retrieve candidate passages.
elastic = es.ESSimpleAPI()

# BERT as a service
# Client for the BERT embedding service, created with default arguments.
# NOTE(review): presumably connects to an already-running server on the default host/ports - confirm.
bc = BertClient()
# Override the client's length_limit attribute.
# NOTE(review): 512 is presumably the maximum sequence length accepted by the server - confirm.
bc.length_limit = 512


## Top ranked documents
Computing the embeddings for all MS MARCO passages would yield better results, but it takes too much time. To avoid this computational cost, you must use a retrieval model to sample the top-ranked documents.

This will then be your working sample.

In [2]:
# Retrieval example
# Query Elasticsearch for one conversational turn; the result table is kept in `df`
# because the re-ranking cell further down reads it.
topic_turn_id = '1_1'
topic_turn_query = "What is a physician's assistant?"
df = elastic.search_body(topic_turn_query, 10)

# Evaluate the Elasticsearch ranking (document ids and scores) for this turn.
# NOTE(review): the saved output of this cell ends in a TypeError raised by code
# that is not visible here - confirm the cell runs cleanly on a fresh kernel.
es_ranking = df[['_id', '_score']]
p10, recall, ndcg = test_bed.eval(es_ranking, topic_turn_id)
print('P10=', p10, '  Recall=', recall, '  NDCG=',ndcg)

             _id    _score
0   MARCO_849267  7.988291
1  MARCO_2331424  7.979860
2  MARCO_5780723  7.856357
3   MARCO_920443  7.837830
4  MARCO_4903530  7.829967
5   MARCO_955948  7.767573
6  MARCO_4016757  7.679798
7  MARCO_5692406  7.574573
8  MARCO_2331422  7.536899
9  MARCO_6193189  7.521804


TypeError: Index does not support mutable operations

## Ranking in the embeddings space
Calculating the embedding of a sentence is computationally slow, so the service should be used carefully (even 10 documents per query will take some time). Computing the similarities is linear in the number of documents and is comparatively cheap.

A suggestion is to compute the embeddings once and store each document's embedding in a local file. You can then use the scikit-learn `knn` implementation to identify the nearest neighbours of a given embedding vector.

In [6]:
# BERT re-ranking example
# Send the query and the passages returned by Elasticsearch to the BERT service
# in a single request, with the query placed first.
passages = df["_source.body"].tolist()
texts = [topic_turn_query, *passages]
# Each row of the result is the embedding vector of the corresponding text:
# row 0 belongs to the query, the remaining rows to the passages.
vects = bc.encode(texts, is_tokenized=False)

# Cosine similarity between the query embedding and every passage embedding.
query_vec, passage_vecs = vects[:1], vects[1:]
sims = cosine_similarity(query_vec, passage_vecs)
df["bert_score"] = sims[0]

# Re-rank: most similar passage first.
df = df.sort_values(by="bert_score", ascending=False)

# The embeddings in `vects` could be saved to a file in the home directory so the
# BERT service does not have to be queried every time (it can get slow).
# NOTE(review): these are presumably sentence-level embeddings - confirm.

p10, recall, ndcg = test_bed.eval(df[['_id', 'bert_score']], topic_turn_id)
print('P10=', p10, '  Recall=', recall, '  NDCG=',ndcg)

    topic_turn_id  dummy          docid  rel
244           2_1      0  MARCO_4369683    1
P10= 0.1   Recall= 1.0   NDCG= 0


In [7]:
# Display the result table, including the bert_score column added by the re-ranking cell.
df

Unnamed: 0,_index,_type,_id,_score,_source.body,bert_score
5,msmarco,_doc,MARCO_2142719,9.074766,INTRODUCTION. The Myotonic goat is a distinct ...,0.8934
0,msmarco,_doc,MARCO_5023599,9.434418,This is a list of goat breeds. There are many ...,0.892809
7,msmarco,_doc,MARCO_1632359,8.833383,Meat goats are often called Spanish goats in h...,0.890255
1,msmarco,_doc,MARCO_4369683,9.340648,Different breeds of goats are now endearing a ...,0.888519
2,msmarco,_doc,MARCO_1827213,9.315275,This is a list of goat breeds. There are many ...,0.887969
9,msmarco,_doc,MARCO_2142716,8.754038,Myotonic Goats. Myotonic Goats are the only br...,0.887701
6,msmarco,_doc,MARCO_12701,8.857945,There is question as to whether or not the Bru...,0.870248
4,msmarco,_doc,MARCO_1188048,9.212319,INTRODUCTION. The Myotonic goat is a distinct ...,0.839404
8,msmarco,_doc,MARCO_1188047,8.769095,The breed is a small breed ranging from 17-25â...,0.83741
3,msmarco,_doc,MARCO_1827207,9.24509,Jamunapari Goat. Jamunapari goat is a very bea...,0.836113
