In [31]:
from dotenv import load_dotenv
import os

# --- DEBUGGING STEP ---
# Print the current working directory to see where Python is looking.
print(f"Current working directory: {os.getcwd()}")

# Load environment variables from the .env file
load_dotenv()

# Get the API key from the environment variables
# The string "GENAI_API_KEY" must match the variable name in your .env file
api_key = os.getenv("GENAI_API_KEY")

# Check if the API key is loaded correctly
if not api_key:
    raise ValueError("No API key found. Please set the GENAI_API_KEY in your .env file.")


Current working directory: /home/lewis/github/rag-strategies


In [13]:
from google import genai
from google.genai import types
import pathlib
import httpx

client = genai.Client(api_key=api_key)

# Retrieve and encode the PDF byte
file = ["firds_reference_data_functional_specifications_v2.10.pdf"]
current_dir = os.getcwd()
doc_path = os.path.join(current_dir, "resources", file[0])
filepath = pathlib.Path(doc_path)

prompt = "Show me the entire table of Annex 1c: Reference Data Content and Consistency Validation Rules"
response = client.models.generate_content(
  model="gemini-2.5-flash",
  contents=[
      types.Part.from_bytes(
        data=filepath.read_bytes(),
        mime_type='application/pdf',
      ),
      prompt])
print(response.text)

Here is the entire table of **Annex 1c: Reference Data Content and Consistency Validation Rules** from page 185 of the document:

**TABLE 33 - REFERENCE DATA CONTENT AND CONSISTENCY VALIDATION RULES**

| Control executed by the system | Error code | Error Message | Concerned Fields |
| :----------------------------- | :--------- | :------------ | :--------------- |
| The value of “Instrument Classification” shall be a valid ISO 10962 code and shall be covered by at least one of the CFI constructs in the CFI-based validation matrix. | INS-101 | The CFI code is not valid against the CFI based validation matrix. | RTS field 3 against the list of valid CFI codes table and against the list of CFI Construct (Primary Key) in the CFI based validation table |
| Check that Mandatory fields are reported according to “CFI-based validations table”. | INS-102 | The following mandatory fields are not reported: “List of RTS23 number Id of missing field(s)”. | RTS field 3 vs all other RTS fields |
| Ch

In [64]:
import os
from fpdf import FPDF

# LangChain components
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# Define constants
file = ["firds_reference_data_functional_specifications_v2.10.pdf"]
current_dir = os.getcwd()
PDF_PATH = os.path.join(current_dir, "resources", file[0])
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
USER_QUERY = "Show me the entire table of Annex 1c: Reference Data Content and Consistency Validation Rules"
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

In [68]:
# ==============================================================================
#  STEP 1: LOAD AND CHUNK THE DOCUMENT
# ==============================================================================
print("\n--- Step 1: Loading and Chunking PDF ---")
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    length_function=len,
)
chunks = text_splitter.split_documents(documents)
print(f"PDF loaded and split into {len(chunks)} chunks.")

# ==============================================================================
#  STEP 2: EMBED THE CHUNKS
# ==============================================================================
print(f"\n--- Step 2: Embedding Chunks using '{EMBEDDING_MODEL_NAME}' ---")
# This will download the model from Hugging Face on its first run.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print("Embedding model loaded.")

# ==============================================================================
#  STEP 3: STORE IN A VECTOR DATABASE
# ==============================================================================
print("\n--- Step 3: Storing chunks in FAISS in-memory vector database ---")
# The from_documents method handles embedding and storing in one step.
vector_store = FAISS.from_documents(chunks, embeddings)
print("Chunks embedded and stored in FAISS.")

# ==============================================================================
#  STEP 4: RETRIEVE RELEVANT CHUNKS
# ==============================================================================
print("\n--- Step 4: Retrieving Top 10 Chunks via Similarity Search ---")
print(f"\nUser Query: \"{USER_QUERY}\"")

# Retrieve the top 5 most similar chunks
base_retriever = vector_store.as_retriever(search_kwargs={"k": 10})

print("\n--- Top 10 Retrieved Chunks ---")
initial_results = base_retriever.get_relevant_documents(USER_QUERY)
print(f"\n--- Top 5 Initial Results (from vector search alone) ---")
for i, chunk in enumerate(initial_results[:5], 1):
    print(f"\n--- Initial Result {i} ---\n")
    print(chunk.page_content)


--- Step 1: Loading and Chunking PDF ---
PDF loaded and split into 625 chunks.

--- Step 2: Embedding Chunks using 'sentence-transformers/all-mpnet-base-v2' ---
Embedding model loaded.

--- Step 3: Storing chunks in FAISS in-memory vector database ---
Chunks embedded and stored in FAISS.

--- Step 4: Retrieving Top 10 Chunks via Similarity Search ---

User Query: "Show me the entire table of Annex 1c: Reference Data Content and Consistency Validation Rules"

--- Top 10 Retrieved Chunks ---

--- Top 5 Initial Results (from vector search alone) ---

--- Initial Result 1 ---

ESMA REGULAR USE 
 
 
33 / 216 
Upcoming RCA The country of the Relevant Competent Authority of that instrument, as last 
determined by the system for the upcoming publication. 
Free-text fields 
used for 
consistency 
checks 
 “Free-text fields used for consistency checks” fields in “RTS23 Fields table” 
as listed in section 6.9 RTS23 Fields table. 
Non-free-text 
fields used for 
consistency 
checks 
 “Non-free-te

In [67]:
# ==============================================================================
#  STEP 5: RERANK RETRIEVED CHUNKS
# ==============================================================================
# The cross-encoder model will be downloaded on the first run.
# It takes the query and a list of documents and returns them, scored and re-ordered.
print(f"\n--- Initializing Reranker with '{RERANKER_MODEL_NAME}' ---")
model = HuggingFaceCrossEncoder(model_name=RERANKER_MODEL_NAME)
reranker = CrossEncoderReranker(model=model, top_n=3)

# 5d. Create the full retrieval pipeline with the reranker
# The ContextualCompressionRetriever uses the base retriever to fetch documents
# and then the reranker to re-order them based on relevance.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=reranker, base_retriever=base_retriever
)
print("Reranking pipeline created.")

# 5e. Perform the final, reranked search
print("\n--- Performing search with reranking... ---")
reranked_chunks = compression_retriever.get_relevant_documents(USER_QUERY)
print("\n\n=========================================================")
print(f"--- Top 3 Reranked & Most Relevant Chunks ---")
print("=========================================================")
for i, chunk in enumerate(reranked_chunks, 1):
    print(f"\n--- Final Result {i} ---\n")
    print(chunk.page_content)


--- Performing search with reranking... ---


--- Top 3 Reranked & Most Relevant Chunks ---

--- Final Result 1 ---

ESMA REGULAR USE 
 
 
183 / 216 
 
9 Annex 1c: Reference Data Content and 
Consistency Validation Rules  
Control executed by the system 
Error 
code 
Error Message 
Concerned 
Fields 
The value of “Instrument Classification” shall 
be a valid ISO 10962 code and shall be 
covered by at least one of the CFI constructs 
in the CFI-based validation matrix. 
INS-101 
The CFI code is not valid 
against the CFI based 
validation matrix. 
RTS field 3 against the list of 
valid CFI codes table and 
against the list of CFI Construct 
(Primary Key) in the CFI based 
validation table 
Check that Mandatory fields are reported 
according to “CFI-based validations table”. 
INS-102 The following mandatory 
fields are not reported: 
“List of RTS23 number Id 
of missing field(s)”.

--- Final Result 2 ---

address errors on previous submission.  
Business 
Rules 
Table 33 - Reference Dat

In [71]:
# Import dependencies
import os
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorOptions, AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
import time
from pathlib import Path

In [72]:
# List of files
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "resources")
files = os.listdir(data_dir)

In [75]:
# Pipeline configs
accelerator_options = AcceleratorOptions(
    num_threads=4, device=AcceleratorDevice.AUTO
)
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.accelerator_options = accelerator_options

In [76]:
# Setup converter
converted = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options={
        InputFormat.PDF: FormatOption(
            pipeline_cls=StandardPdfPipeline,
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend
        )
    }
)

In [78]:
# Begin parsing
for file in files:
    pdf_path = os.path.join(data_dir, file)
    # Check if file exists
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' does not exist.")
        exit(1)
    print(f"Parsing file '{pdf_path}'...")

    start_time = time.time()
    print("Converting PDF to text...")
    conv_res = converted.convert(pdf_path)
    print("Converting done.")
    output_dir = Path("parsed")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem
    # Save markdown
    md_filename = output_dir / f"{doc_filename}.md"
    conv_res.document.save_as_markdown(md_filename)
    end_time = time.time() - start_time
    print(f"Parsing done. Time elapsed: {end_time:.2f} seconds.")

Parsing file '/home/lewis/github/rag-strategies/resources/firds_reference_data_functional_specifications_v2.10.pdf'...
Converting PDF to text...
Converting done.
Parsing done. Time elapsed: 76.35 seconds.


In [79]:
from dotenv import load_dotenv
import os

# --- DEBUGGING STEP ---
# Print the current working directory to see where Python is looking.
print(f"Current working directory: {os.getcwd()}")

# Load environment variables from the .env file
load_dotenv()

# Get the API key from the environment variables
# The string "GENAI_API_KEY" must match the variable name in your .env file
api_key = os.getenv("GENAI_API_KEY")

# Check if the API key is loaded correctly
if not api_key:
    raise ValueError("No API key found. Please set the GENAI_API_KEY in your .env file.")


Current working directory: /home/lewis/github/rag-strategies


In [80]:
def read_pdf_as_bytes(file_path):
    try:
        with open(file_path, "rb") as file:
            pdf_bytes = file.read()
        return pdf_bytes
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None

In [81]:
def extract_text_from_pdf(pdf_path):
    try:
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            from pypdf import PdfReader
        full_text = ""
        page_text = []

        # Open and read PDF
        with open(pdf_path, "rb") as file:
            pdf_reader = PdfReader(file)

            for page_num, page in enumerate(pdf_reader.pages):
                text = page.extract_text()
                page_text.append({"page": page_num + 1, "text": text})
                full_text += f"\n\n--- Page {page_num + 1} ---\n\n{text}"
        full_text_bytes = full_text.encode("utf-8")
        return read_pdf_as_bytes(pdf_path), page_text

    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return "", []

In [90]:
import json


def get_section_map_from_gemini(full_text):
    print("Asking Gemini to identify the document structure...")
    prompt = """
    You are a technical document parser. Your task is to analyse the provided text from a PDF.
    Identify all the file specification sections. A section typically starts with a pattern like "d.dd XXXXX", "d.d XXXXXX", "d XXXXXXX", or "d Annex dd: XXXXXXXX". These section headers are bolded.

    Extract the following for each section found:
    1. The full section title (e.g., '6.11 Rejection statistics table'.
    2. The page number where the section title appears.

    Return the result as a JSON array of objects. Each object should have two keys: 'title' and 'start_page'.
    Ensure the page number is an integer.

    Example of a single JSON object in the array:
    {
        "section_title": "6.11 Rejection statistics table",
        "start_page": 10
    }
    """

    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model="gemini-2.5-pro",
        config={
            'temperature': 0.0,
            'response_mime_type': 'application/json'
        },
        contents=[
            types.Part.from_bytes(
                data=full_text,
                mime_type='application/pdf'
            ),
            prompt
        ]
    )
    try:
        section_map = json.loads(response.text)
        print(f"Gemini successfully identified {len(section_map)} sections.")
        return section_map
    except json.JSONDecodeError:
        print("Error: Gemini did not return a valid JSON response.")
        print(response.text)
        return None

In [91]:
def create_logical_chunks(page_texts, section_map):
    print("Creating logical chunks based on the section map...")
    text_by_page = {p["page"]: p["text"] for p in page_texts}

    chunks = []
    sorted_sections = sorted(section_map, key=lambda x: x["start_page"])

    for i, section in enumerate(sorted_sections):
        start_page = section["start_page"]
        section_title = section["section_title"]

        end_page = None
        if i + 1 < len(sorted_sections):
            end_page = sorted_sections[i + 1]["start_page"] - 1

        if end_page is None or end_page < start_page:
            end_page = len(page_texts)

        chunk_text = ""
        # we use end_page + 2 to overlap with one additional page, to handle the case where a single page has 2 sections
        for page_num in range(start_page, end_page + 2):
            if page_num in text_by_page:
                chunk_text += text_by_page[page_num] + "\n\n"

        # Clean up the chunk: find the start of the current section text
        title_pos = chunk_text.find(section_title)
        if title_pos != -1:
            chunk_text = chunk_text[title_pos:]

        chunks.append({
            "section_title": section_title,
            "text": chunk_text.strip(),
            "start_page": start_page,
            "end_page": end_page
        })

        print(f"Created {len(chunks)} logical chunks.")
        return chunks

In [92]:
# ==============================================================================
#  STEP 1: LOAD AND CHUNK THE DOCUMENT
# ==============================================================================
print("\n--- Step 1: Loading and Chunking PDF ---")
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

parsed_dir = "parsed"
os.makedirs(parsed_dir, exist_ok=True)
section_map_path = os.path.join(parsed_dir, "section_map.json")

full_doc_text, pages = extract_text_from_pdf(PDF_PATH)
if os.path.exists(section_map_path):
    with open(section_map_path, "r") as f:
        section_map = json.load(f)
    print("Loaded existing section map.")
else:
    section_map = get_section_map_from_gemini(full_doc_text)
    if section_map:
        with open(section_map_path, "w") as f:
            json.dump(section_map, f, indent=2)
        print("Saved section map to section_map.json.")

chunks = create_logical_chunks(pages, section_map)

print(f"PDF loaded and split into {len(chunks)} chunks.")


--- Step 1: Loading and Chunking PDF ---
Asking Gemini to identify the document structure...
Gemini successfully identified 171 sections.
Saved section map to section_map.json.
Creating logical chunks based on the section map...
Created 1 logical chunks.
PDF loaded and split into 1 chunks.


In [None]:
# ==============================================================================
#  STEP 2: EMBED THE CHUNKS
# ==============================================================================
print(f"\n--- Step 2: Embedding Chunks using '{EMBEDDING_MODEL_NAME}' ---")
# This will download the model from Hugging Face on its first run.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print("Embedding model loaded.")

# ==============================================================================
#  STEP 3: STORE IN A VECTOR DATABASE
# ==============================================================================
print("\n--- Step 3: Storing chunks in FAISS in-memory vector database ---")
# The from_documents method handles embedding and storing in one step.
vector_store = FAISS.from_documents(chunks, embeddings)
print("Chunks embedded and stored in FAISS.")

# ==============================================================================
#  STEP 4: RETRIEVE RELEVANT CHUNKS
# ==============================================================================
print("\n--- Step 4: Retrieving Top 10 Chunks via Similarity Search ---")
print(f"\nUser Query: \"{USER_QUERY}\"")

# Retrieve the top 5 most similar chunks
base_retriever = vector_store.as_retriever(search_kwargs={"k": 10})

print("\n--- Top 10 Retrieved Chunks ---")
initial_results = base_retriever.get_relevant_documents(USER_QUERY)
print(f"\n--- Top 5 Initial Results (from vector search alone) ---")
for i, chunk in enumerate(initial_results[:5], 1):
    print(f"\n--- Initial Result {i} ---\n")
    print(chunk.page_content)