In [1]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai
from vertexai.preview.generative_models import (
    FunctionDeclaration,
    GenerativeModel,
    Tool,
    ToolConfig,
    Part,
    GenerationConfig,
)
# Project used for every Vertex AI call in this notebook. Leave it blank (or on
# one of the placeholder values) to fall back to the GOOGLE_CLOUD_PROJECT
# environment variable instead.
PROJECT_ID = "104916006626"  # @param {type: "string", placeholder: "[your-project-id]" isTemplate: true}
if not PROJECT_ID or PROJECT_ID in ("xyz", "[your-project-id]"):
    # Default to "" rather than str(None): the latter would silently produce
    # the literal project name "None".
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    raise ValueError(
        "No Google Cloud project configured: set PROJECT_ID in this cell or "
        "export GOOGLE_CLOUD_PROJECT."
    )

# Region for Vertex AI requests; overridable through GOOGLE_CLOUD_REGION.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "australia-southeast1")

# Pass the resolved PROJECT_ID (not a second hard-coded literal) so that the
# fallback logic above actually takes effect.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [2]:
import os
from google.oauth2 import service_account

# Path to the service account key file. An already-exported
# GOOGLE_APPLICATION_CREDENTIALS takes precedence so the notebook can run on
# other machines; otherwise fall back to the original local key location.
# The fallback is a raw string: the previous literal contained "\k", which is
# an invalid escape sequence that newer Python versions warn about.
key_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    r"C:\Users\shres\Projects\RAG-case-study\keys\keyproject-401005-6e1cdcbb5996.json",
)

# Create credentials using the service account key file. This also fails fast
# if the key file is missing or malformed.
credentials = service_account.Credentials.from_service_account_file(
    key_path
)

# Set the credentials for the current environment so Google client libraries
# that rely on application-default credentials pick up the same key.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
# auth_request = transport.requests.Request()
# credentials.refresh(auth_request)

In [30]:
# Schema passed to the model as `response_schema`: the reply must be a JSON
# array of question/answer records, each tagged with a difficulty level and
# the IDs of the chunks it was derived from. All four fields are required.
response_schema = dict(
    type="array",
    items=dict(
        type="object",
        properties=dict(
            question=dict(type="string"),
            answer=dict(type="string"),
            difficulty=dict(type="string", enum=["easy", "medium", "hard"]),
            chunk_ids=dict(
                type="array",
                items=dict(type="string"),
                description="List of chunk IDs that the question and answer are based on",
            ),
        ),
        required=["question", "answer", "difficulty", "chunk_ids"],
    ),
)

In [31]:
from prompts import system_prompt_QA_eval_bot
def generate_questions(context, num_questions=10, model_name="gemini-1.5-pro-002"):
    """
    Generate a set of questions and answers from a given context.

    The prompt is built by filling `system_prompt_QA_eval_bot` with the
    context and the requested number of questions, and the model is asked to
    answer in JSON constrained by the module-level `response_schema`.

    Args:
    context: The context to generate questions from.
    num_questions: The number of questions to generate.
    model_name: Name of the generative model to use. Defaults to the model
        this notebook was originally written against.

    Returns:
    The raw response text of the model, i.e. a JSON string (not a parsed
    list). Callers are expected to decode it, e.g. with `json.loads`.
    """
    model = GenerativeModel(model_name)
    prompt = system_prompt_QA_eval_bot.format(
        context=context, num_questions=num_questions
    )

    response = model.generate_content(
        prompt,
        generation_config=GenerationConfig(
            response_mime_type="application/json",
            response_schema=response_schema,
        ),
    )
    return response.text




### Creating a cross-encoder to score the similarity between each eval question and the document chunks
The cross-encoder outputs a score between 0 and 1 for each (question, document chunk) pair.


In [3]:
from sentence_transformers.cross_encoder import CrossEncoder

# Load the cross-encoder checkpoint that will score (question, chunk) pairs.
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
# Smoke test: score two placeholder text pairs; `scores` holds one value per pair.
scores = model.predict([["My first", "sentence pair"], ["Second text", "pair"]])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
# Display the scores computed for the two sample pairs.
scores

array([0.11382268, 0.11522377], dtype=float32)

In [5]:
from pathlib import Path
import logging
from typing import Dict, List
from transformers import AutoTokenizer
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

class DocumentChunker:
    """
    Convert markdown documents with docling and split them into ordered chunks.

    Files are discovered under `base_dir`, converted with a
    `DocumentConverter`, chunked with a `HybridChunker` driven by the
    tokenizer identified by `model_id`, and the serialized chunk texts are
    collected per document in `document_chunks`.
    """

    def __init__(self, base_dir: str = "processed_docs", model_id: str = "answerdotai/ModernBERT-base"):
        """
        Initialize the DocumentChunker with necessary components.

        Args:
            base_dir: Base directory containing markdown files
            model_id: Model ID for the tokenizer
        """
        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        self.base_dir = Path(base_dir)
        self.model_id = model_id

        # Initialize components
        self._setup_components()

        # Store results: document name (file stem) -> ordered chunk texts
        self.document_chunks: Dict[str, List[str]] = {}

    def _setup_components(self) -> None:
        """Initialize tokenizer, chunker and document converter."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.chunker = HybridChunker(
            tokenizer=self.tokenizer,
            merge_peers=True,
        )
        self.doc_converter = DocumentConverter()

    def process_single_document(self, file_path: Path) -> List[str]:
        """
        Process a single markdown file and return its chunks.

        Failures are best-effort: the error is logged together with its
        traceback and whatever chunks were produced before the failure
        (possibly none) are returned, so one bad file does not abort a
        directory run.

        Args:
            file_path: Path to the markdown file

        Returns:
            List of chunks for the document
        """
        chunks: List[str] = []

        try:
            # Convert markdown to docling document
            doc = self.doc_converter.convert(source=str(file_path)).document

            # Generate and store chunks in order
            for chunk in self.chunker.chunk(dl_doc=doc):
                chunks.append(self.chunker.serialize(chunk=chunk))
        except Exception as e:
            # logger.exception keeps the traceback, which a plain error
            # message would discard.
            self.logger.exception("Error processing %s: %s", file_path.name, e)
            return chunks

        self.logger.info(
            "Successfully processed %s - Generated %d chunks", file_path.name, len(chunks)
        )
        return chunks

    def process_directory(self) -> Dict[str, List[str]]:
        """
        Process all markdown files in the directory and its subdirectories.

        Only files matching ``*-with-image-refs.md`` are considered. Files are
        handled in sorted path order so that repeated runs yield the same
        document ordering.

        Returns:
            Dictionary mapping document names to their ordered chunks
        """
        # Find all markdown files; sort because glob order is not guaranteed.
        md_files = sorted(self.base_dir.glob("**/*-with-image-refs.md"))

        if not md_files:
            self.logger.warning("No markdown files found in %s", self.base_dir)
            return self.document_chunks

        self.logger.info("Found %d markdown files to process", len(md_files))

        # Process each file
        seen_keys = set()
        for md_file in md_files:
            relative_path = md_file.relative_to(self.base_dir)
            self.logger.info("Processing %s", relative_path)

            # Store chunks with document name as key
            doc_key = md_file.stem
            if doc_key in seen_keys:
                # Same file name in two subdirectories: the later one wins,
                # exactly as before, but no longer silently.
                self.logger.warning(
                    "Duplicate document name %s: chunks from %s replace the earlier entry",
                    doc_key,
                    relative_path,
                )
            seen_keys.add(doc_key)
            self.document_chunks[doc_key] = self.process_single_document(md_file)

        self.logger.info("Completed processing all documents")
        return self.document_chunks

    def get_document_statistics(self) -> None:
        """Print statistics about processed documents and their chunks."""
        if not self.document_chunks:
            print("No documents have been processed yet.")
            return

        print("\nDocument Processing Statistics:")
        print("-" * 30)
        for doc_name, chunks in self.document_chunks.items():
            print(f"\nDocument: {doc_name}")
            print(f"Number of chunks: {len(chunks)}")
            if chunks:
                # Average length is measured in tokenizer tokens, not characters.
                avg_chunk_length = sum(len(self.tokenizer.tokenize(chunk))
                                       for chunk in chunks) / len(chunks)
                print(f"Average chunk length: {avg_chunk_length:.2f} tokens")



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Build the chunker with its default base directory and tokenizer model.
doc_chunker = DocumentChunker()

# Process all documents; the result maps each document name to its ordered chunks
# and is reused by later cells.
document_chunks = doc_chunker.process_directory()

# Print per-document chunk counts and average chunk length in tokens
doc_chunker.get_document_statistics()

INFO:__main__:Found 3 markdown files to process
INFO:__main__:Processing AI_ACT\AI_ACT-with-image-refs.md
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document AI_ACT-with-image-refs.md
INFO:docling.document_converter:Finished converting document AI_ACT-with-image-refs.md in 293.09 sec.
Token indices sequence length is longer than the specified maximum sequence length for this model (8230 > 8192). Running this sequence through the model will result in indexing errors
INFO:__main__:Successfully processed AI_ACT-with-image-refs.md - Generated 152 chunks
INFO:__main__:Processing Cybersecurity_California_Privacy\Cybersecurity_California_Privacy-with-image-refs.md
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Cybersecurity_California_Privacy-with-image-refs.md
INFO:docling.document_converter:Finished converting document Cybersecurity_California_Pr


Document Processing Statistics:
------------------------------

Document: AI_ACT-with-image-refs
Number of chunks: 152
Average chunk length: 1133.82 tokens

Document: Cybersecurity_California_Privacy-with-image-refs
Number of chunks: 41
Average chunk length: 266.54 tokens

Document: GDPR-with-image-refs
Number of chunks: 122
Average chunk length: 938.01 tokens


In [11]:
import json

# Flatten the per-document chunk lists into one list of records. Each record
# carries its source document and an ID built from the document name and the
# chunk's position within that document.
chunks_data = [
    {
        "document_name": doc_name,
        "chunk_id": f"{doc_name}_chunk_{position}",
        "chunk_content": text,
    }
    for doc_name, doc_chunks in document_chunks.items()
    for position, text in enumerate(doc_chunks)
]

# Persist the records so later cells can reload them without re-chunking
output_path = "document_chunks.json"
with open(output_path, 'w', encoding='utf-8') as out_file:
    json.dump(chunks_data, out_file, indent=2, ensure_ascii=False)

print(f"Saved {len(chunks_data)} chunks to {output_path}")

Saved 315 chunks to document_chunks.json


In [14]:
# Read the saved chunk records back from disk
with open("document_chunks.json", encoding='utf-8') as chunk_file:
    chunks_list = json.load(chunk_file)

# Regroup the flat records into {document name: [chunk text, ...]},
# keeping the chunks in the order they were stored
document_chunks = {}
for record in chunks_list:
    document_chunks.setdefault(record['document_name'], []).append(record['chunk_content'])

print(f"Loaded chunks for {len(document_chunks)} documents")

Loaded chunks for 3 documents


In [18]:
# To verify all chunks are loaded correctly
# len(document_chunks['GDPR-with-image-refs'])

122

In [28]:
def format_document_chunks(chunks_data: List[dict]) -> Dict[str, str]:
    """
    Render chunk records as one text block per document.

    Each document's text starts with a "<document name>:" header followed by
    its chunks in input order; every chunk is introduced by a "----x----"
    separator line and lists its chunk_id and chunk_content.

    Args:
        chunks_data: Chunk records as stored in document_chunks.json; each
            record is read for 'document_name', 'chunk_id' and 'chunk_content'

    Returns:
        Dictionary mapping each document name to its formatted text
    """
    sections: Dict[str, List[str]] = {}

    for record in chunks_data:
        name = record['document_name']
        # The first record of a document also creates its header line.
        pieces = sections.setdefault(name, [f"{name}:\n\n"])
        pieces.append("----x----\n")
        pieces.append(f"chunk_id: {record['chunk_id']}\n")
        pieces.append(f"chunk_content: {record['chunk_content']}\n\n")

    return {name: "".join(pieces) for name, pieces in sections.items()}

# Read the stored chunk records back from disk
chunks_data = json.loads(Path("document_chunks.json").read_text(encoding='utf-8'))

# One formatted text block per document, keyed by document name
formatted_docs = format_document_chunks(chunks_data)



In [32]:
# Trial run on a single document (50 questions) to inspect the raw model output
generate_questions(formatted_docs['AI_ACT-with-image-refs'], num_questions=50)


'[{"question": "What is the aim of the AI Act?", "answer": "The Artificial Intelligence Act aims to improve the functioning of the internal market by creating a uniform legal framework for the development, placing on the market, putting into service, and use of AI systems in the EU.", "difficulty": "medium", "chunk_ids": ["AI_ACT-with-image-refs_chunk_3"]}, {"question": "What values does the AI Act adhere to?", "answer": "The AI Act is applied in accordance with Union values, including the protection of natural persons, undertakings, democracy, the rule of law, and environmental protection.", "difficulty": "medium", "chunk_ids": ["AI_ACT-with-image-refs_chunk_3"]}, {"question": "How does the AI Act address market fragmentation?", "answer": "It ensures a consistent and high level of protection throughout the Union for trustworthy AI, preventing divergences that hinder free circulation, innovation, deployment, and uptake of AI systems in the internal market.", "difficulty": "medium", "ch

In [None]:
# Build one evaluation set per document, keyed by document name.
# Each value is the raw string returned by generate_questions.
eval_sets = {}
for doc_id in formatted_docs:
    print(f"Generating questions for {doc_id}...")
    eval_sets[doc_id] = generate_questions(formatted_docs[doc_id])

In [None]:
import json

# Decode each document's JSON response and flatten everything into a single
# list, tagging every question with the document it was generated from
parsed_eval_sets = [
    {**question, 'document': doc_id}
    for doc_id, eval_set in eval_sets.items()
    for question in json.loads(eval_set)
]

# Write the combined evaluation set to disk
output_path = "evaluation_sets.json"
with open(output_path, 'w', encoding='utf-8') as out_file:
    json.dump(parsed_eval_sets, out_file, indent=2, ensure_ascii=False)

print(f"Saved {len(parsed_eval_sets)} questions to {output_path}")

In [None]:
# Reload the saved evaluation questions when needed.
# NOTE: this rebinds `eval_sets` to the content of evaluation_sets.json, which
# replaces the per-document dict built by the generation cell.
with open("evaluation_sets.json", encoding='utf-8') as eval_file:
    eval_sets = json.load(eval_file)