In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

In [2]:
df = pd.read_csv('news_dataset.csv', encoding='latin-1')
data = df[['id', 'article']]
data.head()

Unnamed: 0,id,article
0,17307,PARIS ? When the Islamic State was about to...
1,17292,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Finally. The Second Avenue subway opened in Ne...
3,17311,WASHINGTON ? It?s or time for Republica...
4,17339,"For Megyn Kelly, the shift from Fox News to NB..."


In [3]:
data[data.duplicated(subset=['article'], keep=False)]

Unnamed: 0,id,article
41,17313,The body of the Iraqi prisoner was found naked...
219,17545,"DETROIT ? Just before the holidays, on a da..."
220,17546,"DETROIT ? Just before the holidays, on a da..."
752,18185,The body of the Iraqi prisoner was found naked...
753,18186,The body of the Iraqi prisoner was found naked...
886,18337,HOUSTON ? The chants rang out loud and long...
887,18338,HOUSTON ? The chants rang out loud and long...
888,18339,Picking the pain reliever that?s best for you ...
889,18341,Picking the pain reliever that?s best for you ...


In [4]:
data_no_dup = data.drop_duplicates(subset=['article'],keep='first').reset_index(drop=True)
data_no_dup.head()

Unnamed: 0,id,article
0,17307,PARIS ? When the Islamic State was about to...
1,17292,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Finally. The Second Avenue subway opened in Ne...
3,17311,WASHINGTON ? It?s or time for Republica...
4,17339,"For Megyn Kelly, the shift from Fox News to NB..."


In [5]:
# Create a function to clean data
def clean_text(text):
    # Remove non-ASCII characters
    text = ''.join([char for char in text if ord(char) < 128])

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove question mark problems
    text = re.sub(r'(\s\?)',' ',text)
    text = re.sub(r"\b\?\b", "\'", text)
    text = re.sub(r"(,\?)",",", text)
    text = re.sub(r"\?+", "?", text)
    text = text.strip()

    return text

In [6]:
def chunk_text(data_index, data_text, chunk_size, chunk_overlap):

    list_chunk_text = []

    for position in range(len(data_index)):

        words = clean_text(data_text[position]).split()

        start = 0
        part = 1
        while start < len(words):
            end = start + chunk_size
            segment = ' '.join(words[start:end])
            list_chunk_text.append((str(data_index[position]) + str(part), segment))
            part += 1
            start += (chunk_size - chunk_overlap)

    return pd.DataFrame(list_chunk_text, columns=['id', 'article'])

In [7]:
data_chunk = chunk_text(data_no_dup['id'], data_no_dup['article'], 500, 50)

In [8]:
data_chunk.to_csv('data_chunk.csv', index=False)

In [9]:
data_chunk.head()

Unnamed: 0,id,article
0,173071,PARIS When the Islamic State was about to be d...
1,173072,to mobilize public opinion in the face of the ...
2,173073,the guards at Mari reported that looters had c...
3,172921,Angels are everywhere in the Mu'iz family's ap...
4,172922,and his lower jaw and cut a hole through his e...


In [10]:
def fetch_data_info(dataframe_idx, score):
    info = data_chunk.iloc[dataframe_idx]
    meta_dict = {}
    meta_dict['id'] = info['id']
    meta_dict['article'] = info['article']
    meta_dict['score'] = score
    return meta_dict

    
def search(query, top_k, index, model):
    query_vector = model.encode([query])
    top_k = index.search(query_vector, top_k)
    print(top_k)
    top_k_ids = list(top_k[1].tolist()[0])
    score = list(top_k[0].tolist()[0])
    results =  [fetch_data_info(idx, score) for idx, score in zip(top_k_ids, score)]
    return top_k_ids

In [11]:
encoded_data = model.encode(data_chunk['article'].tolist())
encoded_data = np.asarray(encoded_data.astype('float32'))

In [12]:
import faiss
index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
index.add_with_ids(encoded_data, np.array(range(0, len(data_chunk))))
faiss.write_index(index, 'data_article.index')

In [13]:
import faiss
index = faiss.read_index('data_article.index')

In [14]:
index

<faiss.swigfaiss_avx2.IndexIDMap; proxy of <Swig Object of type 'faiss::IndexIDMapTemplate< faiss::Index > *' at 0x00000122C680FA80> >

In [15]:
query = "Who is the vice chairman of Samsung?"
query = clean_text(query)
results = search(query, top_k=20, index=index, model=model)
results

(array([[53.687637, 50.57049 , 49.184822, 47.497726, 44.04258 , 43.050102,
        42.99745 , 42.868958, 42.802124, 42.406384, 42.37947 , 41.848022,
        41.833664, 41.7257  , 41.68076 , 41.54087 , 41.514153, 41.352108,
        41.108116, 40.851246]], dtype=float32), array([[ 714, 1444, 1215, 1216,  715,  172, 2446, 1381, 2089, 2027, 2392,
        2911,  872,  675,  787, 1741, 1307, 2627, 2407,  159]],
      dtype=int64))


[714,
 1444,
 1215,
 1216,
 715,
 172,
 2446,
 1381,
 2089,
 2027,
 2392,
 2911,
 872,
 675,
 787,
 1741,
 1307,
 2627,
 2407,
 159]

## Cross Encoder

In [16]:
from sentence_transformers import CrossEncoder
cross_model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6', max_length=512)

In [17]:
def cross_score(model_inputs):
    scores = cross_model.predict(model_inputs)
    return scores

model_inputs = [[query, item['article']] for item in results]
scores = cross_score(model_inputs)

#Sort the scores in decreasing order
ranked_results = [{'Id': inp['id'], 'Score': score} for inp, score in zip(results, scores)]
ranked_results = sorted(ranked_results, key=lambda x: x['Score'], reverse=True)

TypeError: 'int' object is not subscriptable

In [None]:
from pprint import pprint

print("\n")
for result in ranked_results[:5]:
    print('\t',pprint(result))



{'Id': '175741', 'Score': 0.95794976}
	 None
{'Id': '177641', 'Score': 0.12907586}
	 None
{'Id': '177642', 'Score': 0.053180993}
	 None
{'Id': '178513', 'Score': 0.0005532388}
	 None
{'Id': '175742', 'Score': 0.00019425723}
	 None


In [None]:
query = "who lives in Sheepshead Bay?"
query = clean_text(query)
results = search(query, top_k=10, index=index, model=model)
model_inputs = [[query, item['article']] for item in results]
scores = cross_score(model_inputs)

ranked_results = [{'Id': inp['id'], 'Score': score} for inp, score in zip(results, scores)]
ranked_results = sorted(ranked_results, key=lambda x: x['Score'], reverse=True)
print("\n")
for result in ranked_results[:5]:
    print('\t',pprint(result))



{'Id': '179103', 'Score': 0.00019763244}
	 None
{'Id': '183542', 'Score': 0.00018770898}
	 None
{'Id': '183343', 'Score': 0.0001814828}
	 None
{'Id': '181331', 'Score': 0.00016951578}
	 None
{'Id': '176332', 'Score': 0.00016916702}
	 None


## BERT score

In [None]:
import bert_score
from bert_score import score

In [None]:
query = "Who is the vice chairman of Samsung?"
query = clean_text(query)
ranked_results_bert = []

for result in results:
    P, R, F1 = score([result['article']], [query], lang='en')
    ranked_results_bert.append({'Id': result['id'], 'Score': F1.numpy()[0]})

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [None]:
ranked_results_bert = sorted(ranked_results_bert, key=lambda x: x['Score'], reverse=True)
print("\n")
for result in ranked_results_bert[:5]:
    print('\t',pprint(result))



{'Id': '175943', 'Score': 0.8234369}
	 None
{'Id': '175501', 'Score': 0.820606}
	 None
{'Id': '181351', 'Score': 0.8139641}
	 None
{'Id': '183661', 'Score': 0.81344926}
	 None
{'Id': '181331', 'Score': 0.81143385}
	 None
