In [1]:
!pip install pyserini sentence-transformers openai faiss-cpu datasets

Collecting pyserini
  Downloading pyserini-0.44.0.tar.gz (195.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.3/195.3 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting pyjnius>=1.6.0 (from pyserini)
  Downloading pyjnius-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime>=1.8.1 (from pyserini)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting tiktoken>=0.4.0 (from pyserini)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata

In [2]:
from datasets import load_dataset

# Fetch MS MARCO (config v2.1) from the Hugging Face Hub.
# Only the first 10% of the training split is requested, for efficiency.
dataset = load_dataset(
    path="ms_marco",
    name="v2.1",
    split="train[:10%]",
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Show the column names of the loaded split.
print(dataset.column_names)

['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers']


In [4]:
# Extract passages.
# Each row's "passages" field is not plain text: per the dataset card it is a
# dict of parallel lists ("is_selected", "passage_text", "url"). Join the
# passage texts into a single string so every document is indexable text,
# keyed by the row position as a string.
documents = {
    str(i): " ".join(doc["passages"]["passage_text"])
    for i, doc in enumerate(dataset)
}
print(f"Loaded {len(documents)} documents from MS MARCO.")

Loaded 80873 documents from MS MARCO.


In [None]:
import json
import os

from pyserini.index import IndexCollection
from pyserini.index.lucene import IndexReader
from pyserini.search.lucene import LuceneSearcher

# Define index path
INDEX_PATH = "msmarco_index"
# Directory that holds ONLY the JSONL collection handed to the indexer, so the
# indexer never crawls unrelated files in the working directory.
COLLECTION_DIR = "msmarco_collection"

# Build the index unless a non-empty index directory is already present.
# An empty directory (e.g. left behind by a failed build) does not count as an
# index, so a re-run rebuilds instead of failing when the searcher opens it.
if not (os.path.isdir(INDEX_PATH) and os.listdir(INDEX_PATH)):
    os.makedirs(INDEX_PATH, exist_ok=True)
    os.makedirs(COLLECTION_DIR, exist_ok=True)
    with open(os.path.join(COLLECTION_DIR, "docs.jsonl"), "w", encoding="utf-8") as f:
        for doc_id, text in documents.items():
            # json.dumps escapes quotes, backslashes and newlines, so every
            # record is one valid JSON object on one line.
            f.write(json.dumps({"id": doc_id, "contents": str(text)}) + "\n")

    # NOTE(review): confirm that the installed pyserini release exposes
    # IndexCollection from pyserini.index with this calling convention.
    IndexCollection.main(
        args=["--collection", "JsonCollection",
              "--input", COLLECTION_DIR,
              "--index", INDEX_PATH,
              "--generator", "DefaultLuceneDocumentGenerator",
              "--threads", "4"]
    )

# Load BM25 index
searcher = LuceneSearcher(INDEX_PATH)

def retrieve_top_docs(query, top_k=500):
    """Search the index and pair every hit's id with its stored text.

    Args:
        query: Query string passed to the module-level ``searcher``.
        top_k: Maximum number of hits requested from the searcher.

    Returns:
        A list of ``(docid, text)`` tuples in the order the searcher returned
        the hits; ``text`` is looked up in the module-level ``documents`` dict.

    Raises:
        KeyError: If a returned docid is not a key of ``documents``.
    """
    results = []
    for hit in searcher.search(query, k=top_k):
        doc_id = hit.docid
        results.append((doc_id, documents[doc_id]))
    return results

# Example question used by the rest of the pipeline
query = "What are the effects of climate change?"
# First-stage retrieval: ask the searcher for up to 500 candidates
bm25_results = retrieve_top_docs(query=query, top_k=500)
print(f"BM25 Retrieved {len(bm25_results)} docs for query: {query}")

In [None]:
from sentence_transformers import CrossEncoder

# Instantiate the pretrained MS MARCO cross-encoder used for re-ranking
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Pair the query with the text of every first-stage candidate
pairs = [(query, candidate[1]) for candidate in bm25_results]

# One relevance score per (query, text) pair
scores = reranker.predict(pairs)

# Attach scores to candidates, order by descending score, keep the best 50
reranked_docs = list(zip(scores, bm25_results))
reranked_docs.sort(reverse=True)
del reranked_docs[50:]
print(f"Re-ranked to {len(reranked_docs)} documents using cross-encoder.")

In [None]:
import openai

import os

# Read the API key from the environment instead of hardcoding it in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
openai.api_key = api_key

# Select top 10 documents.
# Each entry of reranked_docs is (score, (docid, text)); keep only the text.
top_docs = [doc_text for _score, (_doc_id, doc_text) in reranked_docs[:10]]

# Prepare the prompt
context = "\n\n".join(top_docs)
prompt = f"""
Using the following documents, generate a concise answer:
{context}

Query: {query}
"""

# Call OpenAI GPT-4 through the openai>=1.0 interface
# (openai.ChatCompletion was removed in the 1.0 release).
response = openai.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "system", "content": "You are an expert assistant."},
              {"role": "user", "content": prompt}]
)

# Extract response (1.x returns typed objects, accessed by attribute)
answer = response.choices[0].message.content
print(f"Generated Answer: {answer}")