In [11]:
import pandas as pd
import time
import re
from tqdm import tqdm
import seaborn as sns
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

In [9]:
df = pd.read_csv('news_dataset.csv', encoding='latin-1')
data = df[['id', 'article']]
data.head()

Unnamed: 0,id,article
0,17307,PARIS ? When the Islamic State was about to...
1,17292,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Finally. The Second Avenue subway opened in Ne...
3,17311,WASHINGTON ? It?s or time for Republica...
4,17339,"For Megyn Kelly, the shift from Fox News to NB..."


In [12]:
# Create a function to clean data
def clean_text(text):
    # Remove non-ASCII characters
    text = ''.join([char for char in text if ord(char) < 128])

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove question mark problems
    text = re.sub(r'(\s\?)',' ',text)
    text = re.sub(r"\b\?\b", "\'", text)
    text = re.sub(r"(,\?)",",", text)
    text = re.sub(r"\?+", "?", text)
    text = text.strip()

    return text

In [81]:
def chunk_text(data_index, data_text, chunk_size, chunk_overlap):

    list_chunk_text = []

    for position in range(len(data_index)):

        words = clean_text(data_text[position]).split()

        start = 0
        part = 1
        while start < len(words):
            end = start + chunk_size
            segment = ' '.join(words[start:end])
            list_chunk_text.append((str(data_index[position]) + str(part), segment))
            part += 1
            start += (chunk_size - chunk_overlap)

    return pd.DataFrame(list_chunk_text, columns=['id', 'article'])

In [82]:
data_chunk = chunk_text(data['id'], data['article'], 500, 50)

In [83]:
data_chunk.head()

Unnamed: 0,id,article
0,173071,PARIS ? When the Islamic State was about to be...
1,173072,? He said it aimed to mobilize public opinion ...
2,173073,"near Syria?s border with Iraq. Before fleeing,..."
3,172921,Angels are everywhere in the Mu?iz family?s ap...
4,172922,"Mu?iz?s life, surgeons removed his tongue and ..."


In [86]:
def fetch_data_info(dataframe_idx):
    info = data_chunk.iloc[dataframe_idx]
    meta_dict = {}
    meta_dict['id'] = info['id']
    meta_dict['article'] = info['article'][:500]
    return meta_dict

    
def search(query, top_k, index, model):
    query_vector = model.encode([query])
    top_k = index.search(query_vector, top_k)
    top_k_ids = list(top_k[1].tolist()[0])
    # print(top_k_ids)
    # top_k_ids = list(np.unique(top_k_ids))
    results =  [fetch_data_info(idx) for idx in top_k_ids]
    return results

In [28]:
encoded_data = model.encode(data_chunk['article'].tolist())
encoded_data = np.asarray(encoded_data.astype('float32'))

In [44]:
len(encoded_data[161])

768

In [87]:
import faiss
index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
# index.add_with_ids(encoded_data, np.array(data_chunk['id'].values))
index.add_with_ids(encoded_data, np.array(range(0, len(data_chunk))))
faiss.write_index(index, 'data_article.index')

In [68]:
len(data_chunk)

3012

In [91]:
query = "Who is the vice chairman of Samsung?"
query = clean_text(query)
results = search(query, top_k=20, index=index, model=model)
results

[{'id': '175741',
  'article': 'SEOUL, South Korea ? A special prosecutor investigating the corruption scandal that led to President Park ?s impeachment summoned the de facto head of Samsung for questioning on Wednesday, calling him a bribery suspect. The de facto leader, Jay Y. Lee, the vice chairman of Samsung, will be questioned on Thursday, according to the special prosecutor?s office, which recommended that he also be investigated on suspicion of perjury. Mr. Lee effectively runs Samsung, South Korea?s largest conglomerat'},
 {'id': '178513',
  'article': 'in the Korean decision,? said Don Rosenberg, a lawyer for Qualcomm. The regulators, he said, ?were prodded and misled by commercial interests. ? In an email, Jee, a spokeswoman for Samsung, said it ?was one of many multinational corporations? that responded to questions from South Korean regulators. Qualcomm?s lawyer figures the Korean case may take years to move through the courts. And recent events, including the impeachment o

In [75]:
from sentence_transformers import CrossEncoder
cross_model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6', max_length=512)

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [93]:
def cross_score(model_inputs):
    scores = cross_model.predict(model_inputs)
    return scores

model_inputs = [[query, item['article']] for item in results]
scores = cross_score(model_inputs)

#Sort the scores in decreasing order
ranked_results = [{'Id': inp['id'], 'Score': score} for inp, score in zip(results, scores)]
ranked_results = sorted(ranked_results, key=lambda x: x['Score'], reverse=True)

In [95]:
from pprint import pprint

print("\n")
for result in ranked_results[:5]:
    print('\t',pprint(result))



{'Id': '175741', 'Score': 0.9349358}
	 None
{'Id': '177642', 'Score': 0.077817224}
	 None
{'Id': '177641', 'Score': 0.06557797}
	 None
{'Id': '176291', 'Score': 0.00034940912}
	 None
{'Id': '178513', 'Score': 0.00022788203}
	 None
{'Id': '182772', 'Score': 0.0002067317}
	 None
{'Id': '175742', 'Score': 0.00019164718}
	 None


In [125]:
query = "who lives in Sheepshead Bay?"
query = clean_text(query)
results = search(query, top_k=20, index=index, model=model)
model_inputs = [[query, item['article']] for item in results]
scores = cross_score(model_inputs)
ranked_results = [{'Id': inp['id'], 'Score': score} for inp, score in zip(results, scores)]
ranked_results = sorted(ranked_results, key=lambda x: x['Score'], reverse=True)
print("\n")
for result in ranked_results[:5]:
    print('\t',pprint(result))




{'Id': '183542', 'Score': 0.0002189889}
	 None
{'Id': '182611', 'Score': 0.00021261694}
	 None
{'Id': '181403', 'Score': 0.00021122393}
	 None
{'Id': '175383', 'Score': 0.00019030515}
	 None
{'Id': '181331', 'Score': 0.00018310206}
	 None


In [130]:
data_index = data_chunk[data_chunk['id'] == 183542]
data_index['article']

Series([], Name: article, dtype: object)