### RAG System
Now that data pre-processing, indexing, and retrieval evaluation are complete, the next step is to put everything together and create a RAG system that works end to end. This notebook covers the following:
1. Read chunks from the directory of the current run (experiment).
2. Create index with the chunks and the chunk ids
3. Link the retriever and the LLM together to create an end-to-end pipeline in which the user asks a question and receives a response from the LLM that includes both the answer and the sources supporting it.
4. Input and output validation and guardrails to prevent the LLM from hallucinating, leaking PII, etc.

In [3]:
import os
from dataclasses import dataclass
from typing import Optional, Dict, Any, List, Annotated

import instructor
from dotenv import load_dotenv
from instructor import openai_moderation
from openai import OpenAI
from pydantic import AfterValidator, BaseModel, Field, field_validator, ValidationInfo

# Load environment variables from .env file
load_dotenv()
# Read the API key from the environment; os.getenv returns None when the
# variable is not set.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Wrap the OpenAI client with instructor. Later cells call this client with a
# `response_model=` argument to get a parsed pydantic object back.
client = instructor.from_openai(OpenAI(api_key=OPENAI_API_KEY))

In [4]:
### Creating a retriever class
import json
from pathlib import Path
from typing import List, Dict
from pylate import indexes, models, retrieve

class Retriever:
    """Load document chunks, index them and build retrieval-augmented prompts.

    Typical flow: construct the object, call :meth:`create_index` once, then
    call :meth:`get_relevant_chunks` or :meth:`create_prompt` per query.
    """

    # Cached ``chunk_id -> chunk`` lookup and the exact list object it was
    # built from. Kept as class-level defaults so the cache is always present;
    # it is rebuilt whenever ``chunks_data`` is rebound to a different list.
    _chunk_map: Optional[Dict[Any, Dict]] = None
    _chunk_map_source: Optional[List[Dict]] = None

    def __init__(
        self,
        experiment_number: str,
        *,
        model_name: str = "shresht8/modernBERT_text_similarity_finetune",
        index_folder: str = "pylate-index",
        index_name: str = "index",
        override: bool = True,
    ):
        """Initialize the Retriever with experiment number.

        Args:
            experiment_number (str): The experiment number (e.g., '001')
            model_name (str): Value passed as ``model_name_or_path`` to
                ``models.ColBERT``.
            index_folder (str): Value passed as ``index_folder`` to
                ``indexes.Voyager``.
            index_name (str): Value passed as ``index_name`` to
                ``indexes.Voyager``.
            override (bool): Value passed as ``override`` to
                ``indexes.Voyager``.
        """
        self.experiment_dir = Path(f"Experiments/{experiment_number}")
        self.model = models.ColBERT(model_name_or_path=model_name)
        self.index = indexes.Voyager(
            index_folder=index_folder,
            index_name=index_name,
            override=override,
        )
        # Both are populated later: the retriever by create_index(), the
        # chunks by read_chunks().
        self.retriever = None
        self.chunks_data = None

    def read_chunks(self) -> List[Dict]:
        """Read document chunks from the experiment directory.

        Returns:
            List[Dict]: The parsed content of ``document_chunks.json``, which
            is also stored on ``self.chunks_data``.
        """
        chunks_path = self.experiment_dir / "document_chunks.json"
        with open(chunks_path, 'r', encoding='utf-8') as f:
            self.chunks_data = json.load(f)
        return self.chunks_data

    def create_index(self):
        """Create index from chunks and initialize retriever.

        Chunks are read from disk first if they have not been loaded yet.

        Raises:
            ValueError: If there are no chunks to index.
        """
        if self.chunks_data is None:
            self.read_chunks()
        if not self.chunks_data:
            # Fail early with a clear message instead of encoding nothing.
            raise ValueError("No chunks available to index.")

        # Parallel lists: position i of each list describes the same chunk.
        all_chunks = [chunk['chunk_content'] for chunk in self.chunks_data]
        chunk_ids = [chunk['chunk_id'] for chunk in self.chunks_data]

        # Encode all chunks as documents (not queries)
        documents_embeddings = self.model.encode(
            all_chunks,
            batch_size=32,
            is_query=False,
            show_progress_bar=True
        )

        # Add documents to index
        self.index.add_documents(
            documents_ids=chunk_ids,
            documents_embeddings=documents_embeddings
        )

        # Initialize retriever
        self.retriever = retrieve.ColBERT(index=self.index)

    def _chunk_lookup(self) -> Dict[Any, Dict]:
        """Return the ``chunk_id -> chunk`` mapping, building it only when needed."""
        if self._chunk_map is None or self._chunk_map_source is not self.chunks_data:
            self._chunk_map = {chunk['chunk_id']: chunk for chunk in self.chunks_data}
            self._chunk_map_source = self.chunks_data
        return self._chunk_map

    def get_relevant_chunks(self, query: str, k: int = 3) -> List[Dict]:
        """Retrieve relevant chunks for a given query.

        Args:
            query (str): The search query
            k (int): Number of chunks to retrieve

        Returns:
            List[Dict]: List of relevant chunks with their metadata, in the
            order returned by the retriever.

        Raises:
            ValueError: If the index has not been created or ``k`` is not
                positive.
            KeyError: If the retriever returns an id that is not present in
                ``chunks_data``.
        """
        if self.retriever is None:
            raise ValueError("Index not created. Call create_index() first.")
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")

        # Encode the query
        query_embeddings = self.model.encode(
            [query],
            batch_size=32,
            is_query=True,
            show_progress_bar=False
        )

        # Get top k retrievals
        scores = self.retriever.retrieve(
            queries_embeddings=query_embeddings,
            k=k
        )

        # Only one query was sent, so only the first result list is relevant.
        retrieved_chunks = scores[0]

        # Map chunk IDs back to full chunk data without rebuilding the lookup
        # on every query.
        lookup = self._chunk_lookup()
        return [lookup[hit['id']] for hit in retrieved_chunks]

    def format_chunks_to_context(self, chunks: List[Dict]) -> str:
        """Format retrieved chunks into a single context string.

        Args:
            chunks (List[Dict]): List of chunk dictionaries

        Returns:
            str: Formatted context string with source information; empty when
            ``chunks`` is empty.
        """
        return "\n".join(
            f"Source: {chunk['document_name']}\n"
            f"Content: {chunk['chunk_content']}\n"
            for chunk in chunks
        )

    def create_prompt(self, query: str, system_prompt: str, k: int = 3) -> List[Dict]:
        """Create a formatted prompt for the OpenAI API with retrieved context.

        Args:
            query (str): User's question
            system_prompt (str): System prompt for the LLM
            k (int): Number of chunks to retrieve

        Returns:
            List[Dict]: Formatted messages for the OpenAI API
        """
        # Get relevant chunks and flatten them into one context string
        relevant_chunks = self.get_relevant_chunks(query, k=k)
        context = self.format_chunks_to_context(relevant_chunks)

        # Create messages array for OpenAI API
        return [
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {query}"
            }
        ]

In [5]:
# Example usage
# Build a retriever for experiment "001"; its chunks live under Experiments/001.
retriever = Retriever(experiment_number="001")
# Encode every chunk and add it to the index. This has to run before any
# query, otherwise get_relevant_chunks() raises ValueError.
retriever.create_index()



PyLate model loaded successfully.
Encoding documents (bs=32): 100%|██████████| 10/10 [01:55<00:00, 11.59s/it]
Adding documents to the index (bs=2000): 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


In [7]:
class AnswerWithCitation(BaseModel):
    """Validates and structures the final response.

    Answers are returned together with a citation so that the response to the
    user query stays grounded in the context provided, harmful responses are
    prevented and the accuracy of the response is maximised.
    """
    # The moderation check must be wrapped in AfterValidator: a bare callable
    # placed in Annotated metadata is not treated as a validator by pydantic,
    # so without the wrapper the answer would never be moderated.
    answer: Annotated[
        str,
        Field(description='Final answer to user. Must be a response that is relevant to the user query if relevant information is available. Otherwise the assistant cannot help user'),
        AfterValidator(openai_moderation(client=client)),
    ] = Field(...)

    # Optional supporting quote; defaults to None when nothing relevant exists.
    citation: Optional[str] = Field(
        None,
        description='Citation from the context provided. Can be a sentence from the context provided or it can be the entire context. If not relevant context is available it must be None'
    )

# Define system prompt
system_prompt = "You are a helpful assistant. Answer the question based on the provided context only. If you cannot find the answer in the context, say so."

# Create formatted prompt
# create_prompt() retrieves the top k=10 chunks for the query and packs them,
# together with the question, into a system + user message pair.
query = "What are the requirements for AI systems?"
messages = retriever.create_prompt(
    query=query,
    system_prompt=system_prompt,
    k=10
)

# Use with OpenAI API
# `client` is the instructor-wrapped client, so `response_model` asks for the
# reply to be returned as an AnswerWithCitation instance.
# NOTE(review): max_retries=3 presumably re-asks the model when validation of
# the response fails -- confirm against the instructor documentation.
response = client.chat.completions.create(
    response_model=AnswerWithCitation,
    model="o1-2024-12-17",
    messages=messages,
    max_retries=3
)

Retrieving documents (bs=50): 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]


In [8]:
# Show the AnswerWithCitation object returned above (its `answer` and
# `citation` fields).
print(response)

answer='Under the proposal reflected in the provided context, all “high-risk” AI systems must satisfy several core requirements intended to safeguard health, safety, and fundamental rights. In particular, providers must: • Establish a risk-management system that identifies and addresses known and foreseeable risks in light of the system’s intended purpose and reasonably predictable misuse; • Use data sets that are sufficiently relevant, representative, accurate, and complete for their intended purpose and that mitigate potential bias; • Maintain up-to-date technical documentation that demonstrates compliance with these regulatory obligations; • Log the system’s operations in order to support traceability and facilitate oversight by regulators; • Provide clear, adequate instructions and transparency information so deployers understand how to use the AI system safely and reliably; • Ensure human oversight mechanisms are in place, so that humans can monitor how the system functions and in