#### Cross-Encoder


Trying the cross-encoder model from sentence-transformers library.

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import warnings

# Suppress the specific warning
warnings.filterwarnings("ignore", message="torch.utils._pytree._register_pytree_node is deprecated")


class DocumentReranker:
    def __init__(self, model_name='cross-encoder/ms-marco-TinyBERT-L-2'):
        """
        Initialize the reranker with the specified model and tokenizer.

        Parameters:
        - model_name (str): Name of the Hugging Face model to use for reranking.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.eval()  # Set the model to evaluation mode

    def rerank(self, query, documents):
        """
        Rerank the documents based on similarity scores with the query.

        Parameters:
        - query (str): The query string.
        - documents (list): List of documents to rerank.

        Returns:
        - list of tuples: A list of (document, score) tuples sorted by score in descending order.
        """
        # Tokenize the query and documents
        features = self.tokenizer(
            [query] * len(documents),  # Repeat the query for each document
            documents,                # List of documents
            return_tensors='pt',      # Return tensors
            padding=True,             # Pad the sequences
            truncation=True           # Truncate sequences that are too long
        )

        # Compute similarity scores
        with torch.no_grad():
            scores = self.model(**features).logits.squeeze().tolist()

        # Combine documents with their scores
        if isinstance(scores, float):  # Handle single-document case
            scores = [scores]
        scored_documents = list(zip(documents, scores))

        # Sort the documents by score in descending order
        ranked_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)

        return ranked_documents

In [13]:
reranker = DocumentReranker()

# Define the query and documents
query = "Who was John of Gaunt's brother, and what was his role in government?"
documents = [
    "External links Richard II's Treasure from the Institute of Historical Research and Royal Holloway, University of London. Richard II's Irish chancery rolls listed by year, translated, published online by CIRCLE. The Peasants' Revolt, BBC Radio 4 discussion with Miri Rubin, Caroline Barron & Alastair Dunn (In Our Time, 16 November 2006) |- 1367 births 1400 deaths 14th-century English monarchs 14th-century murdered monarchs 14th-century English nobility Burials at Westminster Abbey Deaths by starvation Dukes of Cornwall English people of French descent English pretenders to the French throne English Roman Catholics House of Plantagenet Knights of the Garter Medieval child rulers Monarchs who abdicated Peasants' Revolt People from Bordeaux Princes of Wales Prisoners in the Tower of London Peers created by Edward III Children of Edward the Black Prince",
    "References Sources Chronicles (1993) Chronicles of the Revolution, 1397–1400: The Reign of Richard II, ed. Chris Given-Wilson. Manchester: Manchester University Press. Froissart, Jean (1978). Chronicles, ed. Geoffrey Brereton. London: Penguin. Historia Vitae et Regni Ricardi Secundi, ed. George B. Stow. Philadelphia: University of Pennsylvania Press. Knighton, Henry (1995). Knighton's Chronicle 1337–1396, ed. G. H. Martin. Oxford: Clarendon Press. Walsingham, Thomas (1862–64). Historia Anglicana 2 vols., ed. Henry Thomas Riley. London: Longman, Roberts, and Green.",
    "John of Gaunt's brother Edmund of Langley was only one year younger, but it has been suggested that this prince was of 'limited ability', and he took less part in government than Gaunt did. It has been speculated that the whole incident surrounding the killing of Wat Tyler was planned in advance by the council, to end the rebellion. While both England and the Empire supported Pope Urban VI in Rome, the French sided with the Avignon Papacy of Clement VII. This 'appeal'—which would give its name to the Lords Appellant—was not an appeal in the modern sense but a criminal charge, often one of treason."
]

# Rerank the documents
ranked_documents = reranker.rerank(query, documents)

# Display the ranked documents
print("Reranked Documents:")
for idx, (doc, score) in enumerate(ranked_documents, start=1):
    print(f"{idx}. Score: {score:.4f}")
    print(f"   Document: {doc}\n")

Reranked Documents:
1. Score: 2.8781
   Document: John of Gaunt's brother Edmund of Langley was only one year younger, but it has been suggested that this prince was of 'limited ability', and he took less part in government than Gaunt did. It has been speculated that the whole incident surrounding the killing of Wat Tyler was planned in advance by the council, to end the rebellion. While both England and the Empire supported Pope Urban VI in Rome, the French sided with the Avignon Papacy of Clement VII. This 'appeal'—which would give its name to the Lords Appellant—was not an appeal in the modern sense but a criminal charge, often one of treason.

2. Score: -6.0335
   Document: External links Richard II's Treasure from the Institute of Historical Research and Royal Holloway, University of London. Richard II's Irish chancery rolls listed by year, translated, published online by CIRCLE. The Peasants' Revolt, BBC Radio 4 discussion with Miri Rubin, Caroline Barron & Alastair Dunn (In Our 

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-TinyBERT-L-2')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-TinyBERT-L-2')

features = tokenizer(["Who was John of Gaunt's brother, and what was his role in government?", "Who was John of Gaunt's brother, and what was his role in government?", "Who was John of Gaunt's brother, and what was his role in government?"],
                     ["External links Richard II's Treasure from the Institute of Historical Research and Royal Holloway, University of London. Richard II's Irish chancery rolls listed by year, translated, published online by CIRCLE. The Peasants' Revolt, BBC Radio 4 discussion with Miri Rubin, Caroline Barron & Alastair Dunn (In Our Time, 16 November 2006) |- 1367 births 1400 deaths 14th-century English monarchs 14th-century murdered monarchs 14th-century English nobility Burials at Westminster Abbey Deaths by starvation Dukes of Cornwall English people of French descent English pretenders to the French throne English Roman Catholics House of Plantagenet Knights of the Garter Medieval child rulers Monarchs who abdicated Peasants' Revolt People from Bordeaux Princes of Wales Prisoners in the Tower of London Peers created by Edward III Children of Edward the Black Prince", 
                      "References Sources Chronicles (1993) Chronicles of the Revolution, 1397\u20131400: The Reign of Richard II, ed. Chris Given-Wilson. Manchester: Manchester University Press. . Froissart, Jean (1978). Chronicles, ed. Geoffrey Brereton. London: Penguin. . (1977) Historia Vitae et Regni Ricardi Secundi, ed. George B. Stow. Philadelphia: University of Pennsylvania Press. . Knighton, Henry (1995). Knighton's Chronicle 1337\u20131396, ed. G. H. Martin. Oxford: Clarendon Press. . Walsingham, Thomas (1862\u201364). Historia Anglicana 2 vols., ed. Henry Thomas Riley. London: Longman, Roberts, and Green Secondary sources Alexander, Jonathan; Binski, Paul (eds.) (1987). Age of Chivalry, Art in Plantagenet England, 1200\u20131400. London: Royal Academy/Weidenfeld & Nicolson. Levey, Michael (1971). Painting at Court. London: Weidenfeld and Nicolson. External links", 
                      "John of Gaunt's brother Edmund of Langley was only one year younger, but it has been suggested that this prince was of \"limited ability\", and he took less part in government than Gaunt did. b. It has been speculated that the whole incident surrounding the killing of Wat Tyler was in fact planned in advance by the council, in order to end the rebellion. c. While both England and the Empire supported Pope Urban VI in Rome, the French sided with the Avignon Papacy of Clement VII. d. This \"appeal\"which would give its name to the Lords Appellantwas not an appeal in the modern sense of an application to a higher authority. In medieval common law the appeal was criminal charge, often one of treason."],
                      return_tensors='pt', padding=True)

model.eval()
with torch.no_grad():
    scores = model(**features).logits
    print("The first document has the similarity score:", scores[0][0].item())
    print("The second document has the similarity score:", scores[1][0].item())
    print("The third document has the similarity score:", scores[2][0].item())

# determine the highest similarity score and wich document it belongs to

similarity_scores = [scores[0][0].item(), scores[1][0].item(), scores[2][0].item()]
max_score = max(similarity_scores)
max_score_index = similarity_scores.index(max_score)

print(f"The document with the highest similarity score is document {max_score_index + 1} with a score of {max_score:.2f}")

The first document has the similarity score: -6.033543109893799
The second document has the similarity score: -6.910589694976807
The third document has the similarity score: 2.7864646911621094
The document with the highest similarity score is document 3 with a score of 2.79


#### ElastikSearch

Using the ElastikSearch to search the most similar documents to the query based on bm25 similarity.

In [None]:
from elasticsearch import Elasticsearch
import pyarrow.parquet as pq

# Create a connection to the Elasticsearch cluster
es = Elasticsearch([
    {'host': 'localhost', 'port': 9200, 'scheme': 'http'},  # Replace 'localhost' with the actual IP if running on a different machine
    {'host': 'es-node2', 'port': 9200, 'scheme': 'http'},
    {'host': 'es-node3', 'port': 9200, 'scheme': 'http'},
    {'host': 'es-node4', 'port': 9200, 'scheme': 'http'}
])

# Check if the connection was successful
if es.ping():
    print("Connected to Elasticsearch cluster successfully!")
else:
    print("Could not connect to Elasticsearch cluster.")

# Load a Parquet file
def load_parquet_to_es(file_path, index_name):
    # Read the Parquet file
    table = pq.read_table(file_path)
    df = table.to_pandas()

    # Index each row in Elasticsearch
    for _, row in df.iterrows():
        es.index(index=index_name, body=row.to_dict())

# Load a test Parquet file into Elasticsearch
test_parquet_file = "C:/Users/linus/Downloads/wikipedia_corpus/a.parquet"
load_parquet_to_es(test_parquet_file, 'test_index')

# Example search query
response = es.search(
    index="wikipedia",  # Replace with your actual index name
    body={
        "query": {
            "match_all": {}
        }
    }
)

# Print the search results
print("Search Results:")
for hit in response['hits']['hits']:
    print(f"ID: {hit['_id']}, Source: {hit['_source']}")

  if es.ping():


Connected to Elasticsearch cluster successfully!


  es.index(index=index_name, body=row.to_dict())


#### Bi-Encoder

Using FAISS to search the most similar documents to the query based on cosine similarity. The class FAISSSearch is used to search the most similar documents to the query based on cosine similarity.

In [1]:
import requests
import pandas as pd

class FAISSSearcher:
    def __init__(self):
        """
        Initialize the FAISSSearcher class.

        Parameters:
        - api_url (str): The URL of the FAISS API endpoint.
        - parquet_file_path (str): Path to the Parquet file containing the Wikipedia corpus.
        """
        self.api_url = "http://localhost:8000/search"
        self.parquet_file_path = "../data/wiki_2023_index.parquet"
        self.df = self.load_parquet_file()

    def load_parquet_file(self):
        """
        Load the Parquet file containing the Wikipedia corpus into a DataFrame.

        Returns:
        - DataFrame: The loaded DataFrame.
        """
        try:
            df = pd.read_parquet(self.parquet_file_path)
            print("Parquet file loaded successfully!")
            return df
        except Exception as e:
            print(f"An error occurred while loading the Parquet file: {e}")
            raise

    def search(self, query, top_k=5):
        """
        Perform a search on the FAISS API and retrieve the relevant articles.

        Parameters:
        - query (str): The query string to search for.
        - top_k (int): The number of top results to fetch.

        Returns:
        - DataFrame: A DataFrame containing the relevant articles.
        """
        # Parameters for the search
        params = {
            "query": query,
            "top_k": top_k
        }

        # Make a GET request to the API
        response = requests.get(self.api_url, params=params)

        # Check if the request was successful
        if response.status_code == 200:
            results = response.json()            
            faiss_indices = [result['index'] for result in results['results']]

            # Retrieve the rows corresponding to the FAISS indices
            relevant_articles = self.df.iloc[faiss_indices]
            
            return relevant_articles
        else:
            print(f"Error: {response.status_code}")
            print(response.text)
            return None

In [2]:
# Create a FAISSSearcher instance
searcher = FAISSSearcher()

Parquet file loaded successfully!


#### Evaluation

Evaluating the models using the NQ dataset for open-domain question answering

In [18]:
import pandas as pd
import json

# Path to the JSON Lines (.jsonl) file
file_path = "../data/nq-dev-all.jsonl"

# Read and parse the first 10 lines of the file
lines = []
with open(file_path, 'r') as f:
    for i in range(10):  # Read the first 10 lines
        line = f.readline()
        if not line:  # Break if fewer than 10 lines exist
            break
        lines.append(json.loads(line))

# Convert the list of dictionaries into a Pandas DataFrame
questions = pd.DataFrame(lines)

test_questions = questions[["document_title", "question_text"]]

In [19]:
count = 0

for i in [5, 10, 100, 1000, 10000]:
    count = 0  # Reset count for each `i`
    for j in range(len(test_questions)):
        # Example query
        query = test_questions.iloc[j]["question_text"]
        doc_title = test_questions.iloc[j]["document_title"]

        # Perform the search and retrieve relevant articles
        relevant_articles = searcher.search(query, top_k=i)

        # Ensure `relevant_articles` is not empty and has the `title` column
        if relevant_articles is not None and "title" in relevant_articles.columns:
            if doc_title in list(relevant_articles["title"]):
                count += 1
        else:
            print(f"Warning: No relevant articles found or missing 'title' column for query: {query}")

    print(f"{i} Documents retrieved, percentage of correct documents: {count / len(test_questions) * 100:.2f}%")

5 Documents retrieved, percentage of correct documents: 60.00%
10 Documents retrieved, percentage of correct documents: 60.00%
100 Documents retrieved, percentage of correct documents: 60.00%
1000 Documents retrieved, percentage of correct documents: 70.00%
10000 Documents retrieved, percentage of correct documents: 90.00%


In [15]:
import time

# Initialize counters
count_correct_at_top = 0
total_queries = len(test_questions)

# Measure time for the entire validation
start_time = time.time()

for i in [5, 10, 100, 1000, 10000]:
    count_correct_at_top = 0  # Reset count for each `i`
    
    # Start time for this batch
    batch_start_time = time.time()

    for j in range(total_queries):
        # Example query
        query = test_questions.iloc[j]["question_text"]
        doc_title = test_questions.iloc[j]["document_title"]

        # Perform the search and retrieve relevant articles
        relevant_articles = searcher.search(query, top_k=i)

        # Ensure `relevant_articles` is not empty and has the `title` column
        if relevant_articles is not None and "title" in relevant_articles.columns:
            # Rerank the retrieved articles
            reranked_articles = reranker.rerank(query, list(relevant_articles["title"]))

            # Check if the correct document is at the top
            if reranked_articles[0][0] == doc_title:
                count_correct_at_top += 1
        else:
            print(f"Warning: No relevant articles found or missing 'title' column for query: {query}")

    # Calculate time for this batch
    batch_time = time.time() - batch_start_time

    # Print results for this batch
    print(f"{i} Documents retrieved:")
    print(f"  Percentage of correct documents at position 1: {count_correct_at_top / total_queries * 100:.2f}%")
    print(f"  Time taken: {batch_time:.2f} seconds")

# Total time for validation
total_time = time.time() - start_time
print(f"Total validation time: {total_time:.2f} seconds")

5 Documents retrieved:
  Percentage of correct documents at position 1: 50.00%
  Time taken: 11.49 seconds
10 Documents retrieved:
  Percentage of correct documents at position 1: 40.00%
  Time taken: 10.28 seconds
100 Documents retrieved:
  Percentage of correct documents at position 1: 20.00%
  Time taken: 10.19 seconds
1000 Documents retrieved:
  Percentage of correct documents at position 1: 20.00%
  Time taken: 13.33 seconds
10000 Documents retrieved:
  Percentage of correct documents at position 1: 10.00%
  Time taken: 90.37 seconds
Total validation time: 135.67 seconds
