# Assignment 3.1: Retrieval-Augmented Generation (RAG)

This notebook implements a basic RAG pipeline: crawling, retrieval, generation, and evaluation.

In [1]:
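# Uncomment the install line below if running in Colab or if packages are missing.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install -q transformers sentence-transformers faiss-cpu requests beautifulsoup4 tqdm pandas scikit-learn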

## 1. Imports and Setup

In [2]:
import os
from collections import deque
from typing import List
from urllib.parse import urldefrag, urljoin

import faiss
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import pipeline

## 2. Web Crawler

In [3]:
def crawl_website(url: str, max_pages: int = 10) -> List[str]:
    """Crawl breadth-first from ``url`` and return the visible text of each page.

    Links are followed regardless of host; relative links are resolved against
    the page they appear on and ``#fragment`` suffixes are dropped so the same
    page is not queued under several spellings.

    Args:
        url: Absolute http(s) URL to start from.
        max_pages: Maximum number of pages to fetch successfully.

    Returns:
        One text string per successfully fetched page, in crawl order. Pages
        that fail to download (including HTTP 4xx/5xx responses) or to parse
        are reported with ``print`` and skipped, never retried.
    """
    seen = {url}          # every URL ever queued, so nothing is fetched twice
    queue = deque([url])  # FIFO frontier -> breadth-first order
    docs = []
    while queue and len(docs) < max_pages:
        current = queue.popleft()
        try:
            resp = requests.get(current, timeout=5)
            # Treat 4xx/5xx as failures so error pages are not indexed as documents.
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')
            docs.append(soup.get_text(separator=' ', strip=True))
            # Add new links
            for link in soup.find_all('a', href=True):
                # Resolve relative hrefs and strip the fragment before de-duplicating.
                target, _ = urldefrag(urljoin(current, link['href']))
                # Skip mailto:, javascript:, and other non-web schemes.
                if target.startswith(('http://', 'https://')) and target not in seen:
                    seen.add(target)
                    queue.append(target)
        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next URL.
            print(f"Failed to crawl {current}: {e}")
    return docs

## 3. Build Retrieval Index

In [4]:
def build_retrieval_index(docs: List[str], model_name='all-MiniLM-L6-v2'):
    """Embed ``docs`` and store the vectors in a flat L2 FAISS index.

    Args:
        docs: Texts to index; position ``i`` in the index corresponds to ``docs[i]``.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        A tuple ``(index, model, embeddings)``: the populated ``faiss.IndexFlatL2``,
        the loaded sentence-transformer, and the float32 embedding matrix that
        was added to the index (one row per document).

    Raises:
        ValueError: If ``docs`` is empty, since the embedding dimension cannot
            be determined without at least one document.
    """
    docs = list(docs)
    if not docs:
        # Fail early with a clear message instead of an opaque shape error below.
        raise ValueError("Cannot build a retrieval index from an empty document list.")
    model = SentenceTransformer(model_name)
    # FAISS expects a C-contiguous float32 matrix of shape (n_docs, dim).
    embeddings = np.ascontiguousarray(
        model.encode(docs, show_progress_bar=True), dtype='float32'
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, model, embeddings

## 4. Retrieve Documents

In [5]:
def retrieve(query: str, index, model, docs: List[str], top_k: int = 5):
    """Return the documents whose embeddings are nearest to ``query``.

    Args:
        query: Question or search text.
        index: Index exposing ``search(vectors, k)`` and returning
            ``(distances, ids)``, as built by ``build_retrieval_index``.
        model: Encoder exposing ``encode(list_of_texts)``.
        docs: The texts that were indexed, in the same order as the index rows.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` documents, nearest first. Returns an empty list when
        ``docs`` is empty or ``top_k`` is not positive.
    """
    if top_k <= 0 or not docs:
        return []
    # Never ask for more neighbours than there are documents.
    k = min(top_k, len(docs))
    query_emb = np.asarray(model.encode([query]), dtype='float32')
    _, neighbor_ids = index.search(query_emb, k)
    # FAISS pads missing neighbours with -1; without this filter docs[-1]
    # would silently return the last document instead.
    return [docs[i] for i in neighbor_ids[0] if 0 <= i < len(docs)]

## 5. Load a QA Model

In [6]:
# Build a `question-answering` pipeline from the
# distilbert-base-uncased-distilled-squad checkpoint. It is passed to
# generate_answer, which calls it with the question and the joined context.
# NOTE(review): the recorded output of this cell warns that the `qa_outputs`
# weights were newly initialized rather than loaded from the checkpoint;
# confirm the checkpoint loads trained QA weights in your environment.
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased-distilled-squad and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 6. RAG Answer Generation

In [7]:
def generate_answer(query, retrieved_docs, qa_pipeline):
    """Answer ``query`` using the retrieved documents as context.

    The documents are joined with ``" \\n"`` into a single context string,
    which is passed together with the question to ``qa_pipeline``. The value
    under the ``'answer'`` key of the pipeline's result is returned.
    """
    joined_passages = " \n".join(retrieved_docs)
    return qa_pipeline(question=query, context=joined_passages)['answer']

## 7. Example Usage: Crawl, Build Index, Retrieve, Generate

In [8]:
# Step 1: Crawl
# By default the notebook uses a small hand-written corpus so it runs offline
# and deterministically. Set USE_LIVE_CRAWL = True to fetch pages starting
# from `url` with crawl_website instead.
USE_LIVE_CRAWL = False
url = 'https://en.wikipedia.org/wiki/Retrieval-augmented_generation'  # Example URL
sample_docs = [
    'Retrieval-Augmented Generation (RAG) is a method that combines retrieval of documents with generative models to answer questions.',
    'RAG leverages both retrieval and generation to improve the accuracy of open-domain question answering.',
    'The retriever fetches relevant documents, and the generator produces answers based on those documents.'
]
docs = sample_docs
if USE_LIVE_CRAWL:
    crawled_docs = crawl_website(url, max_pages=5)
    if crawled_docs:
        docs = crawled_docs
    else:
        # Keep the rest of the notebook runnable when the crawl yields nothing.
        print("Crawl returned no pages; using the built-in sample documents.")
print(f"Crawled {len(docs)} documents.")

Crawled 3 documents.


In [9]:
# Step 2: Build index
# Embed `docs` and index the vectors. `index` and `model` are passed to
# retrieve() in the next step so the query is embedded with the same model.
index, model, embeddings = build_retrieval_index(docs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|##########| 1/1 [00:00<00:00,  2.13it/s]


In [10]:
# Step 3: Retrieve
# Ask the index for the two documents nearest to the query (top_k=2).
query = 'What is Retrieval-Augmented Generation?'
retrieved_docs = retrieve(query, index, model, docs, top_k=2)
print('Retrieved docs:', retrieved_docs)

Retrieved docs: ['Retrieval-Augmented Generation (RAG) is a method that combines retrieval of documents with generative models to answer questions.', 'RAG leverages both retrieval and generation to improve the accuracy of open-domain question answering.']


In [11]:
# Step 4: Generate answer
# Run the QA pipeline on the query, using the retrieved documents as context.
answer = generate_answer(query, retrieved_docs, qa_pipeline)
print('Answer:', answer)

Answer: Retrieval-Augmented Generation (RAG)


## 8. Evaluation (Simple Example)

In [12]:
def evaluate_rag(answers: List[str], references: List[str]) -> float:
    """Return the exact-match accuracy of ``answers`` against ``references``.

    Two strings match when they are equal after stripping surrounding
    whitespace and lower-casing.

    Args:
        answers: Predicted answers.
        references: Gold answers, aligned position-by-position with ``answers``.

    Returns:
        Fraction of positions that match, in ``[0.0, 1.0]``; ``0.0`` when both
        lists are empty.

    Raises:
        ValueError: If the two lists differ in length. Pairing them anyway
            would silently drop items and skew the score.
    """
    if len(answers) != len(references):
        raise ValueError(
            f"answers and references must have the same length "
            f"(got {len(answers)} and {len(references)})"
        )
    if not answers:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    # Simple exact match
    correct = sum(
        a.strip().lower() == r.strip().lower()
        for a, r in zip(answers, references)
    )
    return correct / len(answers)

# Example evaluation
# Single-item check that scores the answer produced in Step 4.
# NOTE(review): the reference string is identical to the answer recorded
# above, so a score of 1.0 here shows the metric's mechanics rather than
# measuring answer quality.
answers = [answer]
references = ['Retrieval-Augmented Generation (RAG)']
score = evaluate_rag(answers, references)
print(f'Evaluation score: {score}')

Evaluation score: 1.0
