In [1]:
import pandas as pd
import numpy as np
import re

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

In [2]:
df = pd.read_csv('news_dataset.csv', encoding='latin-1')
data = df[['id', 'article']]
data.head()

Unnamed: 0,id,article
0,17307,PARIS ? When the Islamic State was about to...
1,17292,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Finally. The Second Avenue subway opened in Ne...
3,17311,WASHINGTON ? It?s or time for Republica...
4,17339,"For Megyn Kelly, the shift from Fox News to NB..."


In [3]:
data[data.duplicated(subset=['article'], keep=False)]

Unnamed: 0,id,article
41,17313,The body of the Iraqi prisoner was found naked...
219,17545,"DETROIT ? Just before the holidays, on a da..."
220,17546,"DETROIT ? Just before the holidays, on a da..."
752,18185,The body of the Iraqi prisoner was found naked...
753,18186,The body of the Iraqi prisoner was found naked...
886,18337,HOUSTON ? The chants rang out loud and long...
887,18338,HOUSTON ? The chants rang out loud and long...
888,18339,Picking the pain reliever that?s best for you ...
889,18341,Picking the pain reliever that?s best for you ...


In [4]:
data_no_dup = data.drop_duplicates(subset=['article'],keep='first').reset_index(drop=True)
data_no_dup.head()

Unnamed: 0,id,article
0,17307,PARIS ? When the Islamic State was about to...
1,17292,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Finally. The Second Avenue subway opened in Ne...
3,17311,WASHINGTON ? It?s or time for Republica...
4,17339,"For Megyn Kelly, the shift from Fox News to NB..."


In [5]:
# Create a function to clean data
def clean_text(text):
    # Remove non-ASCII characters
    text = ''.join([char for char in text if ord(char) < 128])

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove question mark problems
    text = re.sub(r'(\s\?)',' ',text)
    text = re.sub(r"\b\?\b", "\'", text)
    text = re.sub(r"(,\?)",",", text)
    text = re.sub(r"\?+", "?", text)
    text = text.strip()

    return text

In [6]:
def chunk_text(data_index, data_text, chunk_size, chunk_overlap):

    list_chunk_text = []

    for position in range(len(data_index)):

        words = clean_text(data_text[position]).split()

        start = 0
        part = 1
        while start < len(words):
            end = start + chunk_size
            segment = ' '.join(words[start:end])
            list_chunk_text.append((str(data_index[position]) + str(part), segment))
            part += 1
            start += (chunk_size - chunk_overlap)

    return pd.DataFrame(list_chunk_text, columns=['id', 'article'])

In [30]:
data_chunk_train = chunk_text(data_no_dup[:5]['id'], data_no_dup[:5]['article'], 100, 50)
data_chunk_train.to_csv('data_chunk_train.csv', index=False)

In [37]:
data_no_dup['id']

0      17307
1      17292
2      17298
3      17311
4      17339
       ...  
990    18460
991    18461
992    18462
993    18463
994    18465
Name: id, Length: 995, dtype: int64

In [40]:
data_no_train = data_no_dup[5:].reset_index(drop=True)
data_chunk = chunk_text(data_no_train['id'], data_no_train['article'], 500, 50)
data_chunk

Unnamed: 0,id,article
0,173401,Megyn Kelly's new office at NBC News sits a bl...
1,173402,closer to a news program than the typical dayt...
2,173403,"taking over Fox News's 7 p. m. slot, have also..."
3,173421,"In the technology industry, the sharks have ne..."
4,173422,"sales were flat last year, and after a monster..."
...,...,...
2942,184633,"New Orleans but was rebuffed. Dolan, he said, ..."
2943,184634,relationships with many of his former teammate...
2944,184651,"Hans Rosling, a Swedish doctor who transformed..."
2945,184652,the flair of a seasoned performer (he once dem...


In [41]:
data_chunk.to_csv('data_chunk.csv', index=False)

In [42]:
data_chunk.head()

Unnamed: 0,id,article
0,173401,Megyn Kelly's new office at NBC News sits a bl...
1,173402,closer to a news program than the typical dayt...
2,173403,"taking over Fox News's 7 p. m. slot, have also..."
3,173421,"In the technology industry, the sharks have ne..."
4,173422,"sales were flat last year, and after a monster..."


In [43]:
def fetch_data_info(dataframe_idx, score):
    info = data_chunk.iloc[dataframe_idx]
    meta_dict = {}
    meta_dict['id'] = info['id']
    meta_dict['article'] = info['article']
    meta_dict['score'] = score
    return meta_dict

    
def search(query, top_k, index, model):
    query_vector = model.encode([query])
    top_k = index.search(query_vector, top_k)
    print(top_k)
    top_k_ids = list(top_k[1].tolist()[0])
    score = list(top_k[0].tolist()[0])
    results =  [fetch_data_info(idx, score) for idx, score in zip(top_k_ids, score)]
    return results

In [44]:
encoded_data = model.encode(data_chunk['article'].tolist())
encoded_data = np.asarray(encoded_data.astype('float32'))

In [45]:
import faiss
index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
index.add_with_ids(encoded_data, np.array(range(0, len(data_chunk))))
faiss.write_index(index, 'data_article.index')

In [46]:
import faiss
index = faiss.read_index('data_article.index')

In [47]:
query = "Who is the vice chairman of Samsung?"
query = clean_text(query)
results = search(query, top_k=20, index=index, model=model)
results

(array([[53.687637, 50.57049 , 49.184822, 47.497726, 44.04258 , 43.050102,
        42.99745 , 42.868958, 42.802124, 42.406384, 42.37947 , 41.848022,
        41.833664, 41.7257  , 41.68076 , 41.54087 , 41.514153, 41.352108,
        41.108116, 40.851246]], dtype=float32), array([[ 699, 1429, 1200, 1201,  700,  157, 2431, 1366, 2074, 2012, 2377,
        2896,  857,  660,  772, 1726, 1292, 2612, 2392,  144]],
      dtype=int64))


[{'id': '175741',
  'article': "SEOUL, South Korea A special prosecutor investigating the corruption scandal that led to President Park s impeachment summoned the de facto head of Samsung for questioning on Wednesday, calling him a bribery suspect. The de facto leader, Jay Y. Lee, the vice chairman of Samsung, will be questioned on Thursday, according to the special prosecutor's office, which recommended that he also be investigated on suspicion of perjury. Mr. Lee effectively runs Samsung, South Korea's largest conglomerate he is the son of its chairman, Lee who has been incapacitated with health problems. He is expected to be asked whether donations that Samsung made to two foundations controlled by Choi a longtime friend of the president, amounted to bribes, and what role, if any, he played in the decision to give the money. Investigators at the special prosecutor's office have questioned other senior Samsung executives as suspects about the bribery accusations. Neither Samsung nor 

## Cross Encoder

In [48]:
from sentence_transformers import CrossEncoder
cross_model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6', max_length=512)

In [49]:
def cross_score(model_inputs):
    scores = cross_model.predict(model_inputs)
    return scores

model_inputs = [[query, result['article']] for result in results]
scores = cross_score(model_inputs)

#Sort the scores in decreasing order
ranked_results = [{'Id': result['id'], 'Score': score} for result, score in zip(results, scores)]
ranked_results = sorted(ranked_results, key=lambda x: x['Score'], reverse=True)

In [50]:
from pprint import pprint

print("\n")
for result in ranked_results[:5]:
    print('\t',pprint(result))



{'Id': '175741', 'Score': 0.93238395}
	 None
{'Id': '177641', 'Score': 0.75574094}
	 None
{'Id': '177642', 'Score': 0.046901602}
	 None
{'Id': '175742', 'Score': 0.0065609324}
	 None
{'Id': '178513', 'Score': 0.0010543187}
	 None


In [51]:
query = "who lives in Sheepshead Bay?"
query = clean_text(query)
results = search(query, top_k=10, index=index, model=model)
model_inputs = [[query, item['article']] for item in results]
scores = cross_score(model_inputs)

ranked_results = [{'Id': inp['id'], 'Score': score} for inp, score in zip(results, scores)]
ranked_results = sorted(ranked_results, key=lambda x: x['Score'], reverse=True)
print("\n")
for result in ranked_results[:5]:
    print('\t',pprint(result))

(array([[39.005157, 38.923374, 37.96611 , 37.959103, 37.91911 , 37.274223,
        36.828144, 36.226395, 36.163   , 35.62926 ]], dtype=float32), array([[1584, 2616,  645,  763, 2310,  867, 2120, 2691, 2126, 2659]],
      dtype=int64))


{'Id': '183661', 'Score': 0.0007293035}
	 None
{'Id': '179103', 'Score': 0.00019763244}
	 None
{'Id': '175501', 'Score': 0.00018643167}
	 None
{'Id': '181351', 'Score': 0.00018178276}
	 None
{'Id': '183343', 'Score': 0.00018148297}
	 None


## BERT score

In [24]:
import bert_score
from bert_score import score

In [25]:
query = "Who is the vice chairman of Samsung?"
query = clean_text(query)
ranked_results_bert = []

for result in results:
    P, R, F1 = score([result['article']], [query], lang='en')
    ranked_results_bert.append({'Id': result['id'], 'Score': F1.numpy()[0]})

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [26]:
ranked_results_bert = sorted(ranked_results_bert, key=lambda x: x['Score'], reverse=True)
print("\n")
for result in ranked_results_bert[:5]:
    print('\t',pprint(result))



{'Id': '176332', 'Score': 0.80943394}
	 None
{'Id': '183343', 'Score': 0.80591106}
	 None
{'Id': '175943', 'Score': 0.8052438}
	 None
{'Id': '182113', 'Score': 0.8026211}
	 None
{'Id': '179103', 'Score': 0.8021943}
	 None
