In [1]:
import pandas as pd
import numpy as np
import re
import faiss
import time
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder

# 1. Preprocessing

In [3]:
# Load the raw news dataset and keep only the columns used downstream
columns_of_interest = ['id', 'article']
df = pd.read_csv('news_dataset.csv', encoding='latin-1')
data = df[columns_of_interest]
data.head()

Unnamed: 0,id,article
0,17307,PARIS ? When the Islamic State was about to...
1,17292,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Finally. The Second Avenue subway opened in Ne...
3,17311,WASHINGTON ? It?s or time for Republica...
4,17339,"For Megyn Kelly, the shift from Fox News to NB..."


In [7]:
# Show every row whose article text appears more than once (all copies kept)
is_repeated = data.duplicated(subset=['article'], keep=False)
data[is_repeated]

Unnamed: 0,id,article
41,17313,The body of the Iraqi prisoner was found naked...
219,17545,"DETROIT ? Just before the holidays, on a da..."
220,17546,"DETROIT ? Just before the holidays, on a da..."
752,18185,The body of the Iraqi prisoner was found naked...
753,18186,The body of the Iraqi prisoner was found naked...
886,18337,HOUSTON ? The chants rang out loud and long...
887,18338,HOUSTON ? The chants rang out loud and long...
888,18339,Picking the pain reliever that?s best for you ...
889,18341,Picking the pain reliever that?s best for you ...


In [8]:
# Keep the first copy of each article and renumber the rows from 0
data_no_dup = (
    data
    .drop_duplicates(subset=['article'], keep='first')
    .reset_index(drop=True)
)
data_no_dup.head()

Unnamed: 0,id,article
0,17307,PARIS ? When the Islamic State was about to...
1,17292,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Finally. The Second Avenue subway opened in Ne...
3,17311,WASHINGTON ? It?s or time for Republica...
4,17339,"For Megyn Kelly, the shift from Fox News to NB..."


In [9]:
# Measure each article in characters and report the shortest / longest
article_lengths = data_no_dup['article'].apply(len)
data_no_dup['len_article'] = article_lengths
min_len, max_len = article_lengths.min(), article_lengths.max()
print(min_len, max_len)

1074 32759


In [10]:
# Check non-ASCII characters
def check_non_ascii(text):
    """Return the non-ASCII characters of ``text`` in order of appearance.

    ASCII covers code points 0-127, so every character with a code point of
    128 or above is kept (duplicates included); everything else is dropped.

    Args:
        text: string to inspect.

    Returns:
        A string made only of the non-ASCII characters found in ``text``
        (empty when there are none).
    """
    # `> 127`, not `> 128`: code point 128 is already outside ASCII.
    return ''.join(char for char in text if ord(char) > 127)

# Gather the distinct non-ASCII characters that occur anywhere in the corpus
check_set = set()
for article in data_no_dup['article']:
    check_set.update(check_non_ascii(article))
check_set

{'\x81', '\x90', '\x9b', '\x9c', '\x9f', 'ª', '\xad', '®', 'ÿ'}

In [9]:
# Create function to clean data
def clean_text(text):
    """Normalise a raw article or query string.

    Steps, in order:
      1. Drop every non-ASCII character (code point >= 128).
      2. Repair stray question marks:
         - whitespace + '?' becomes a single space,
         - a '?' between two word characters becomes an apostrophe
           (e.g. ``family?s`` -> ``family's``),
         - ',?' becomes ',',
         - runs of '?' collapse to one '?'.
      3. Collapse every run of whitespace to one space and strip the ends.

    Args:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    # Remove non-ASCII characters
    text = ''.join(char for char in text if ord(char) < 128)

    # Remove question mark problems
    text = re.sub(r'\s\?', ' ', text)
    text = re.sub(r"\b\?\b", "'", text)
    text = re.sub(r",\?", ",", text)
    text = re.sub(r"\?+", "?", text)

    # Collapse whitespace last: the ' ?' substitution above inserts a space
    # next to an existing one, so collapsing earlier would leave double spaces.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [17]:
# Create chunking function
def chunk_text(data_index, data_text, chunk_size, chunk_overlap):
    """Clean each document and split it into overlapping word windows.

    Args:
        data_index: iterable of document ids, positionally aligned with
            ``data_text``.
        data_text: iterable of raw document strings.
        chunk_size: maximum number of words per chunk.
        chunk_overlap: number of words shared by two consecutive chunks.

    Returns:
        A DataFrame with columns ``id`` and ``article``. ``id`` is the string
        ``str(document id) + str(part)`` where ``part`` counts from 1 within
        each document; ``article`` is the space-joined words of the chunk.

    Raises:
        ValueError: if ``chunk_overlap >= chunk_size``, because the window
            would never advance.
    """
    step = chunk_size - chunk_overlap
    if step <= 0:
        raise ValueError(
            'chunk_overlap must be smaller than chunk_size '
            f'(got chunk_size={chunk_size}, chunk_overlap={chunk_overlap})'
        )

    list_chunk_text = []

    # Pair ids and texts by position so that Series whose index is not
    # 0..n-1 (e.g. an un-reset slice) are handled as well.
    for doc_id, text in zip(data_index, data_text):

        # Clean data
        words = clean_text(text).split()

        # Chunk data: one window every `step` words until the words run out
        for part, start in enumerate(range(0, len(words), step), start=1):
            segment = ' '.join(words[start:start + chunk_size])
            # NOTE(review): plain concatenation is not uniquely decodable once
            # `part` reaches 10 (id 1 / part 23 and id 12 / part 3 both give
            # '123'); consumers that strip a fixed number of digits rely on
            # single-digit parts.
            list_chunk_text.append((str(doc_id) + str(part), segment))

    return pd.DataFrame(list_chunk_text, columns=['id', 'article'])

In [8]:
# Training set: the first five de-duplicated articles, chunked and saved
train_docs = data_no_dup[:5]
data_chunk_train = chunk_text(train_docs['id'], train_docs['article'], 100, 50)
data_chunk_train.to_csv('data_chunk_train.csv', index=False)

# Testing set: every remaining article, re-indexed from 0, chunked and saved
data_no_train = data_no_dup[5:].reset_index(drop=True)
data_chunk = chunk_text(data_no_train['id'], data_no_train['article'], 100, 50)
data_chunk.to_csv('data_chunk_100.csv', index=False)

In [44]:
# Load the sentence encoder used to embed the chunks
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

# Embed every chunk and store the vectors as float32
chunk_texts = data_chunk['article'].tolist()
encoded_data = np.asarray(model.encode(chunk_texts).astype('float32'))

In [45]:
# Build an inner-product index keyed by chunk row position and persist it
row_ids = np.array(range(len(data_chunk)))
index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
index.add_with_ids(encoded_data, row_ids)
faiss.write_index(index, 'data_article_100.index')

# 2. Information Retrieval

In [4]:
# Cross-encoder that re-scores (query, chunk) pairs, and the sentence encoder
# that embeds queries for the first-stage index search
cross_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

In [5]:
# Create function to fetch data
def fetch_data(doc_ids, score):
    """Look up chunk(s) in the module-level ``data_chunk`` by position.

    ``doc_ids`` is handed straight to ``data_chunk.iloc``; ``search`` calls
    this with one integer position at a time.

    Returns a dict with the selected ``id`` and ``article`` values plus the
    ``score`` that was passed in.
    """
    selected = data_chunk.iloc[doc_ids]
    return {
        'id': selected['id'],
        'article': selected['article'],
        'score': score,
    }

In [6]:
# Create function to search top k match documents for query
def search(query, top_k, index, model):
    """Return the ``top_k`` chunks closest to ``query`` in ``index``.

    The query is encoded with ``model``, searched in ``index``, and every hit
    is turned into a dict by ``fetch_data`` (id, article, retrieval score),
    in the order the index returned them.
    """
    query_vector = model.encode([query])

    # index.search yields (scores, ids); each holds one row per query vector
    hit_scores, hit_ids = index.search(query_vector, top_k)
    first_query_ids = hit_ids.tolist()[0]
    first_query_scores = hit_scores.tolist()[0]

    return [
        fetch_data(hit_id, hit_score)
        for hit_id, hit_score in zip(first_query_ids, first_query_scores)
    ]

In [7]:
# Create function to retrieve top k documents with cross-encoder
def query_answer(query, query_id, top_k=20, top_n=3):
    """Retrieve candidates for ``query`` and re-rank them with the cross-encoder.

    Relies on the module-level ``index``, ``model`` and ``cross_model``.

    Args:
        query: question text (cleaned with ``clean_text`` before use).
        query_id: identifier copied into every returned row as ``question_id``.
        top_k: number of candidates fetched from the vector index.
        top_n: number of re-ranked candidates returned.

    Returns:
        A list of at most ``top_n`` dicts ordered by decreasing cross-encoder
        score, with keys ``question_id``, ``question``, ``rank`` (1-based),
        ``id``, ``article`` and ``score``.
    """
    query = clean_text(query)

    # Search the top_k related documents
    results = search(query, top_k=top_k, index=index, model=model)

    # Re-rank the results
    model_inputs = [[query, result['article']] for result in results]
    scores = cross_model.predict(model_inputs)

    # Sort the scores in decreasing order
    ranked_results = sorted(
        (
            {'id': result['id'], 'article': result['article'], 'score': score}
            for result, score in zip(results, scores)
        ),
        key=lambda x: x['score'],
        reverse=True,
    )

    # Keep the top_n results
    result_dataset = []
    for i, rank in enumerate(ranked_results[:top_n]):
        result_dataset.append({
            'question_id': query_id,
            'question': query,
            'rank': i + 1,
            # Coerce first: chunk ids are strings when `data_chunk` comes
            # directly from chunk_text and integers when reloaded from CSV.
            # NOTE(review): `// 10` removes exactly one trailing digit, so the
            # article id is only recovered while the chunk part number is a
            # single digit; chunk_text does not bound the part number.
            'id': int(rank['id']) // 10,
            'article': rank['article'],
            'score': rank['score'],
        })

    return result_dataset

## Testing

In [2]:
# Reload the persisted chunks, the evaluation questions and the vector index
data_chunk = pd.read_csv('data_chunk_100.csv')
question_dataset = pd.read_csv('question_test_data_2.csv')
index = faiss.read_index('data_article_100.index')

In [25]:
# Create function to calculate MRR
def mrr_score(answer_data, question_data):
    """Mean reciprocal rank of the expected id within each ranked answer list.

    answer_data: sequence of ranked id lists, one list per question.
    question_data: expected id per question, indexable by question position.

    A question scores 1 / rank of the first matching id (rank starts at 1),
    or 0 when its list contains no match. Returns 0 when there are no
    questions at all.
    """
    reciprocal_ranks = []
    for question_pos, ranked_ids in enumerate(answer_data):
        for rank, candidate in enumerate(ranked_ids, start=1):
            if candidate == question_data[question_pos]:
                reciprocal_ranks.append(1 / rank)
                break
        else:
            # No candidate matched the expected id
            reciprocal_ranks.append(0)

    if not reciprocal_ranks:
        return 0
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

In [27]:
# Collect the re-ranked article ids of every question, keyed by question id
answer_100_dataset = dict()
for idx, question in enumerate(question_dataset['question']):
    for answer in query_answer(question, idx):
        answer_100_dataset.setdefault(answer['question_id'], []).append(answer['id'])

article_100_ids = list(answer_100_dataset.values())

# Score the rankings against the expected document ids
score = mrr_score(article_100_ids, question_dataset['doc_id'])
print('MRR score:', score)

0.8243243243243243

## Test with 500-word chunks

In [None]:
# Chunk 500-word dataset
data_chunk = chunk_text(data_no_train['id'], data_no_train['article'], 500, 50)
# chunk_text builds ids as strings, whereas the 100-word run used the frame
# reloaded from CSV (integer ids). Cast here so that the integer arithmetic
# query_answer applies to `id` works on this in-memory frame as well.
data_chunk = data_chunk.astype({'id': 'int64'})
data_chunk.to_csv('data_chunk_500.csv', index=False)

In [None]:
# Save vector database
# Embed the 500-word chunks first: `encoded_data` still holds the vectors of
# the 100-word chunks, so reusing it would index the wrong vectors under ids
# sized for the current `data_chunk`.
encoded_data_500 = np.asarray(
    model.encode(data_chunk['article'].tolist()), dtype='float32'
)

# The index dimension follows the embeddings; ids are chunk row positions
index = faiss.IndexIDMap(faiss.IndexFlatIP(encoded_data_500.shape[1]))
index.add_with_ids(encoded_data_500, np.arange(len(data_chunk), dtype='int64'))
faiss.write_index(index, 'data_article_500.index')

In [None]:
# Collect the re-ranked article ids of every question, keyed by question id
answer_500_dataset = dict()
for idx, question in enumerate(question_dataset['question']):
    for answer in query_answer(question, idx):
        answer_500_dataset.setdefault(answer['question_id'], []).append(answer['id'])

article_500_ids = list(answer_500_dataset.values())

# Score the rankings against the expected document ids
score = mrr_score(article_500_ids, question_dataset['doc_id'])
print('MRR score:', score)

## Test cross-encoder models

In [None]:
# Names of the cross-encoder models to compare; each one is loaded with
# CrossEncoder(...) and evaluated in the cells below
cross_models = ['cross-encoder/ms-marco-MiniLM-L-12-v2',
                'cross-encoder/ms-marco-MiniLM-L-6-v2',
                'cross-encoder/ms-marco-MiniLM-L-4-v2',
                'cross-encoder/ms-marco-MiniLM-L-2-v2',
                'cross-encoder/ms-marco-TinyBERT-L-6',
                'cross-encoder/ms-marco-TinyBERT-L-2-v2']

In [None]:
# Create a function to test cross-encoder models
def test_model(question_dataset, cross_model):
    """Run every question through ``query_answer`` and report MRR and runtime.

    Args:
        question_dataset: table with a ``question`` column and a ``doc_id``
            column holding the expected article id of each question.
        cross_model: the cross-encoder under test.
            NOTE(review): this argument is not forwarded. ``query_answer``
            takes no model argument and reads the module-level
            ``cross_model``, so the caller must bind that global to the same
            object before calling.

    Returns:
        dict with ``mrr_score`` (mean reciprocal rank over all questions) and
        ``time`` (elapsed seconds for the whole run).
    """
    start_time = time.perf_counter()

    # Get re-rank id lists for each question
    answer_dataset = dict()
    for idx, question in enumerate(question_dataset['question']):
        for answer in query_answer(question, idx):
            answer_dataset.setdefault(answer['question_id'], []).append(answer['id'])

    article_ids = list(answer_dataset.values())

    # Calculate MRR score. The result needs its own name: assigning to
    # `mrr_score` here would make it local and hide the scoring function.
    mrr = mrr_score(article_ids, question_dataset['doc_id'])

    elapsed_time = time.perf_counter() - start_time

    return {'mrr_score': mrr,
            'time': elapsed_time}

In [None]:
# Evaluate every candidate cross-encoder and tag each result with its name
test_result = []
for cross_model_name in cross_models:
    # Rebinding the module-level `cross_model` is what query_answer picks up
    cross_model = CrossEncoder(cross_model_name)

    outcome = test_model(question_dataset, cross_model)
    test_result.append({**outcome, 'cross_model': cross_model_name})

In [None]:

# `test_result` is a list of per-model dicts, so pull each metric into a list
model_names = [result['cross_model'] for result in test_result]
mrr_values = [result['mrr_score'] for result in test_result]
time_values = [result['time'] for result in test_result]

# Define the positions of the bars on the x-axis
bar_width = 0.35
x = range(len(test_result))

# Create the figure and axes
fig, ax1 = plt.subplots()

# Create bar charts for MRR score (left axis) and time (right axis)
ax1.bar(x, mrr_values, width=bar_width, label='MRR score', color='b')
ax1.set_ylabel('MRR score', color='b')
ax1.tick_params(axis='y', labelcolor='b')

ax2 = ax1.twinx()
ax2.bar([i + bar_width for i in x], time_values, width=bar_width, label='Time', color='r')
ax2.set_ylabel('Time', color='r')
ax2.tick_params(axis='y', labelcolor='r')

# Add title and labels on ax1: it owns the visible x-axis, whereas pyplot
# calls made after twinx() would target ax2
ax1.set_title('Cross-encoder models performance comparison')
ax1.set_xlabel('Cross-encoder models')
ax1.set_xticks([i + bar_width / 2 for i in x])  # centre ticks between the paired bars
ax1.set_xticklabels(model_names, rotation=90)

# Add legend
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

# Display the plot
plt.show()