#### Implementing Retriever Functions in a RAG System

#### 1 - Importing the libraries

In [2]:
import joblib
import numpy as np
import pandas as pd
import bm25s
import os
from sentence_transformers import SentenceTransformer

#### 2 - Loading the Dataset

In [5]:
# Read the news dataset from a CSV file in the working directory.
data_path = 'news_data_dedup.csv'

# Fail fast with an explicit message when the file is not where we expect it.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"The dataset {data_path} does not exist.")

df = pd.read_csv(data_path)

# Show the first record as a quick sanity check of the loaded columns.
print(df.iloc[0])

guid                             e3dc5caa18f9a16d7edcc09f8d5c2bb4
title           Harvey Weinstein's 2020 rape conviction overtu...
description     Victims group describes the New York appeal co...
venue                                                         BBC
url             https://www.bbc.co.uk/news/world-us-canada-688...
published_at                               2024-04-25 18:24:04+00
updated_at                          2024-04-26 20:03:00.628113+00
Name: 0, dtype: object


#### 3 - Retriever Functions

In [10]:
# Build the retrieval corpus: one document per article, made of its title
# followed by its description.
# Missing values are filled BEFORE casting to str: `astype(str)` turns NaN into
# the literal text "nan", after which `fillna('')` has nothing left to fill and
# the token "nan" would end up in the index.
corpus = df['title'].fillna('').astype(str) + ' ' + df['description'].fillna('').astype(str)

# Trim surrounding whitespace and drop documents that end up empty.
# NOTE: after the filter + reset_index, positions in `corpus` no longer
# necessarily match row positions in `df`; look retrieved indices up in
# `corpus`, not in `df`.
corpus = corpus.str.strip()
corpus = corpus[corpus != ''].reset_index(drop=True)

if corpus.empty:
    raise ValueError("Corpus is empty: no article has a title or description.")

print(f"Corpus size: {len(corpus)}")
print(f"Sample corpus entry: {corpus.iloc[0][:100]}...")

# Create the BM25 retriever (the corpus is not passed to the constructor;
# it is indexed explicitly below).
retriever = bm25s.BM25()

# Tokenize the corpus
print("Tokenizing corpus...")
tokenized_corpus = bm25s.tokenize(corpus.tolist())

# Index the tokenized corpus
print("Indexing corpus...")
retriever.index(tokenized_corpus)

# Tokenize a sample query
query = "What are the recent news about GDP?"
print(f"Query: {query}")
tokenized_query = bm25s.tokenize(query)

# Retrieve the top results and their scores; never request more results than
# there are documents in the corpus.
TOP_K = 3
k = min(TOP_K, len(corpus))
print("Retrieving results...")
results, scores = retriever.retrieve(tokenized_query, k=k)

# Display the results. `results[0]` / `scores[0]` hold the hits for the single
# query; each entry of `results[0]` is used as a position into `corpus`.
print(f"Results shape: {results.shape}")
print(f"Scores shape: {scores.shape}")
print(f"\nTop {k} results:")
for rank, (result_idx, score) in enumerate(zip(results[0], scores[0]), start=1):
    print(f"{rank}. Score: {score:.4f}")
    print(f"   Text: {corpus.iloc[result_idx][:200]}...")
    print()


Corpus size: 870
Sample corpus entry: Harvey Weinstein's 2020 rape conviction overturned Victims group describes the New York appeal court...
Tokenizing corpus...


Split strings:   0%|          | 0/870 [00:00<?, ?it/s]

Indexing corpus...


BM25S Count Tokens:   0%|          | 0/870 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/870 [00:00<?, ?it/s]

Query: What are the recent news about GDP?


Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving results...


BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Results shape: (1, 3)
Scores shape: (1, 3)

Top 3 results:
1. Score: 5.0626
   Text: GDP and the Dow Are Up. But What About American Well-Being? The standard ways of measuring economic growth don’t capture what life is like for real people. A new metric offers a better alternative, es...

2. Score: 4.8727
   Text: What the GDP Report Says About Inflation: A Hot First Quarter Thursday’s gross domestic product report suggests that a widely watched inflation reading due Friday could be worse than expected....

3. Score: 3.8447

