In [1]:
# Use the environment variable if the user doesn't provide Project ID.
import os
import json
import vertexai
from google import genai
from google.genai.types import GenerateContentConfig, Part
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

# Prefer an explicit PROJECT_ID; fall back to GOOGLE_CLOUD_PROJECT when it is
# unset or still holds the "xyz" placeholder value.
PROJECT_ID = os.getenv("PROJECT_ID")  # @param {type: "string", placeholder: "[your-project-id]" isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "xyz":
    # No str() wrapper here: str(None) would silently produce the literal
    # project id "None" when GOOGLE_CLOUD_PROJECT is also missing.
    # NOTE(review): with None the SDKs are expected to apply their own default
    # project resolution - confirm against your credentials setup.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Region defaults to australia-southeast1 unless GOOGLE_CLOUD_REGION overrides it.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "australia-southeast1")

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Gen AI client routed through Vertex AI for the project/region chosen above.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

In [2]:
from utils import save_pdf_pages
# Copy pages 1 to 20 of the Privacy Act PDF into a smaller "extracted" PDF.
# The call stays the last expression so the cell still displays its return value.
privacy_act_pdf = ".\\processed_docs\\Privacy_Act_AU\\Privacy_Act_AU.pdf"
privacy_act_excerpt_pdf = ".\\processed_docs\\Privacy_Act_AU\\Privacy_Act_AU_extracted.pdf"
save_pdf_pages(privacy_act_pdf, 1, 20, privacy_act_excerpt_pdf)

Extracted 20 pages (1 to 20) to .\processed_docs\Privacy_Act_AU\Privacy_Act_AU_extracted.pdf


'.\\processed_docs\\Privacy_Act_AU\\Privacy_Act_AU_extracted.pdf'

In [2]:
import os
from google.oauth2 import service_account

# Path to your service account key file.
# Every backslash is escaped: the previous literal contained "\k", which is an
# invalid escape sequence (a warning today, an error in future Python versions).
# The resulting path value is unchanged.
key_path = "C:\\Users\\shres\\Projects\\RAG-case-study\\keys\\keyproject-401005-6e1cdcbb5996.json"

# Create credentials using the service account key file
credentials = service_account.Credentials.from_service_account_file(
    key_path
)

# Set the credentials for the current environment
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
# auth_request = transport.requests.Request()
# credentials.refresh(auth_request)

In [3]:
from pydantic import BaseModel, Field

# Gemini model used for every generate_content call in this notebook.
MODEL_ID = "gemini-1.5-pro-002"

# MIME type attached to PDF bytes sent to the model.
PDF_MIME_TYPE = "application/pdf"

# Response MIME types that can be requested from the model.
JSON_MIME_TYPE = "application/json"
ENUM_MIME_TYPE = "text/x.enum"

In [4]:
from typing import List, Optional

# Leaf entry of the table of contents: a single section identified by its
# title, the page it is listed on, and its section number (kept as a string).
class Section(BaseModel):
    title: str = Field(description="The title of the section")
    page_number: int = Field(description="The page number of the section")
    section_number: str = Field(description="The section number of the section")

# A sub-division inside a Division. `sections` has no default, so the key must
# be present, but its value may be None when the sub-division lists no sections.
class SubDivision(BaseModel):
    title: str = Field(description="The title of the sub-division")
    sections: List[Section] | None = Field(description="The sections of the sub-division")

# A division inside a part. It can hold sub-divisions, sections placed directly
# under the division, or both; either list may be None.
class Division(BaseModel):
    title: str = Field(description="The title of the division")
    sub_divisions: List[SubDivision] | None = Field(description="The sub-divisions of the division")
    sections: List[Section] | None = Field(description="The sections of the division")

# One top-level part of the table of contents, holding divisions and/or
# sections placed directly under the part; either list may be None.
# NOTE(review): presumably named Part_TOC rather than Part because `Part` is
# already imported from google.genai.types in this notebook - confirm.
class Part_TOC(BaseModel):
    title: str = Field(description="The title of the part")
    divisions: List[Division] | None = Field(description="The divisions of the part")
    sections: List[Section] | None = Field(description="The sections of the part")

# Whole table of contents as a list of parts.
class TableOfContents(BaseModel):
    parts: List[Part_TOC] = Field(description="The parts of the table of contents")

# Schema for returning the entire table of contents in one response, plus a
# completion flag. The extraction functions visible in this notebook request
# ResponsePart instead, so this model is not referenced by them.
class Response(BaseModel):
    table_of_contents: TableOfContents = Field(description="The table of contents of the document")
    is_finished: bool = Field(description="Whether the table of contents has been extracted fully")

# Schema of a single part-by-part extraction step: the extracted part plus the
# bookkeeping used to decide which part to request next and when to stop.
# Passed as `response_schema` to the model and used to validate its JSON reply.
class ResponsePart(BaseModel):
    part: Part_TOC = Field(description="The part of the table of contents")
    is_part_extracted: bool = Field(description="Whether the part has been extracted fully")
    part_extracted: str = Field(description="Title of the part that has been extracted")
    next_part_title: str | None = Field(description="Title of the next part of the table of contents to be extracted")
    # The two adjacent literals are concatenated; the trailing space after
    # "fully." keeps the sentences from running together in the description.
    is_toc_finished: bool = Field(description="Whether the table of contents has been extracted fully. "
                                  "Yes if the part is the last part of the table of contents, otherwise no")
    




In [1]:
# Flat section entry (title and page number only, no section number).
# Requires BaseModel and Field from pydantic, which are imported in a different
# cell; the recorded output of this cell is a NameError for BaseModel, so it
# was presumably executed before that import cell had run.
class sectionTOC(BaseModel):
    title: str = Field(description="The title of the section")
    page_number: int = Field(description="The page number of the section")

# A main heading together with the sections listed under it; both fields may
# be None. Depends on names imported in other cells: BaseModel/Field from
# pydantic and List from typing.
class ccpaTOC(BaseModel):
    main_heading: str | None = Field(description="The title of the main heading")
    section: List[sectionTOC] | None = Field(description="The sections within the heading")


NameError: name 'BaseModel' is not defined

In [5]:
# System instruction for part-by-part table-of-contents extraction. The field
# names it mentions (is_part_extracted, part_extracted, next_part_title,
# is_toc_finished) are the fields of the ResponsePart schema, and
# extract_full_toc passes this prompt to every extract_next_part call.
SYSTEM_PROMPT_PAU = """
You are an intelligent agent who extracts table of contents from documents part by part.
Your task is to:
1. Extract ONLY ONE PART at a time from the table of contents
2. If no previous extraction exists, start with the first part
3. If previous parts were extracted, extract the next part as indicated
4. Set is_part_extracted to true when the current part is fully extracted
5. Set part_extracted to the title of the current part you've extracted
6. Set next_part_title to the title of the next part to be extracted (null if this is the last part)
7. Set is_toc_finished to true ONLY if this is the last part of the table of contents

Important:
- Extract only ONE part per response
- Be thorough and accurate in extraction
- Maintain proper structure (part -> divisions -> subdivisions -> sections)
- Include all page numbers and section numbers
- If you see the next part title in the document, include it in next_part_title
"""

In [6]:
def initialize_vertex_ai(project_id: str, location: str):
    """
    Configure the Vertex AI SDK and build a Gen AI client for the same target

    Args:
        project_id: Google Cloud project to run against
        location: Google Cloud region to run against

    Returns:
        genai.Client created with vertexai=True for the given project and location
    """
    vertexai.init(location=location, project=project_id)
    vertex_client = genai.Client(vertexai=True, project=project_id, location=location)
    return vertex_client

def extract_next_part(
    client: genai.Client,
    file_path: str,
    current_toc: dict,
    system_prompt: str,
    next_part_title: Optional[str] = None,
    is_markdown: bool = False
) -> ResponsePart:
    """
    Extract the next part of the table of contents from either PDF or markdown
    
    Args:
        client: Initialized Vertex AI client
        file_path: Path to PDF or markdown file
        current_toc: Current state of the table of contents; a dict whose
            "parts" list holds the parts extracted so far (may be empty,
            missing, or None on the first call)
        system_prompt: System instruction sent with the request
        next_part_title: Title of the part to extract (if known)
        is_markdown: Boolean indicating if the input file is markdown
    
    Returns:
        ResponsePart containing the extracted part and metadata

    Raises:
        ValueError: If the model returns no text, or text that is not valid
            JSON (json.JSONDecodeError is a ValueError subclass)
    """
    print("starting extraction...")

    instruction = "Extract the next part of the table of contents."

    # Read file based on type
    if is_markdown:
        # For markdown, add content directly as string
        with open(file_path, "r", encoding='utf-8') as f:
            contents = [instruction, f.read()]
    else:
        # For PDF, send the raw bytes tagged with the PDF MIME type
        with open(file_path, "rb") as f:
            contents = [
                instruction,
                Part.from_bytes(data=f.read(), mime_type=PDF_MIME_TYPE),
            ]

    print("file read...")

    # Both hints are inserted at index 1, so when both are present the final
    # order is: instruction, requested title, previous parts, document.
    # .get() tolerates a TOC dict that has no "parts" key yet.
    if current_toc and current_toc.get("parts"):
        contents.insert(1, f"Previously extracted parts: {json.dumps(current_toc)}")
    if next_part_title:
        contents.insert(1, f"Please extract the part titled: {next_part_title}")

    print("contents prepared...")

    # Generate content using LLM
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=contents,
        config=GenerateContentConfig(
            system_instruction=system_prompt,
            temperature=0,
            response_schema=ResponsePart,
            response_mime_type=JSON_MIME_TYPE,
        ),
    )
    print("response generated...")

    # Fail with a clear message instead of an opaque TypeError from
    # json.loads(None) when the response carries no text.
    if not response.text:
        raise ValueError("Model returned an empty response; no table-of-contents part could be parsed")

    # Parse and validate response
    result = json.loads(response.text)
    return ResponsePart(**result)

def extract_full_toc(
    client: genai.Client,
    file_path: str,
    max_parts: int = 20,
    is_markdown: bool = False,
    intermediate_path: Optional[str] = None
) -> dict:
    """
    Extract complete table of contents part by part
    
    Args:
        client: Initialized Vertex AI client
        file_path: Path to PDF or markdown file
        max_parts: Maximum number of parts to prevent infinite loops
        is_markdown: Boolean indicating if the input file is markdown
        intermediate_path: Where to save the TOC after every extracted part;
            defaults to Experiments/002/toc_data_intermediate.json built with
            the platform's path separator
    
    Returns:
        Complete table of contents as dictionary
    """
    if intermediate_path is None:
        # os.path.join gives the same path as before on Windows and a real
        # nested path (not a file name containing backslashes) elsewhere.
        intermediate_path = os.path.join("Experiments", "002", "toc_data_intermediate.json")

    # Make sure the first intermediate save cannot fail on a missing directory.
    os.makedirs(os.path.dirname(intermediate_path) or ".", exist_ok=True)

    # Initialize empty TOC
    toc = {"parts": []}
    next_part_title = None
    parts_extracted = 0
    print("initializing TOC...")

    while parts_extracted < max_parts:
        print("extracting next part...")
        # Extract next part
        response = extract_next_part(client, file_path, toc, SYSTEM_PROMPT_PAU, next_part_title, is_markdown)

        # Add the extracted part to our TOC
        toc["parts"].append(response.part.model_dump())
        parts_extracted += 1

        # Save intermediate result so a later failure does not lose progress
        with open(intermediate_path, 'w', encoding='utf-8') as f:
            json.dump(toc, f, indent=4)

        print(f"Extracted part: {response.part_extracted}")

        # Check if we're done
        if response.is_toc_finished:
            break

        # Update next part to extract
        next_part_title = response.next_part_title

        if not next_part_title:
            print("Warning: No next part title provided but TOC not marked as finished")
            break
    else:
        # Reached only when the loop ends without a break, i.e. the part limit
        # was hit before the model reported the TOC as finished.
        print(f"Warning: stopped after reaching max_parts={max_parts}; TOC may be incomplete")

    return toc

In [7]:
def main():
    """
    Run the markdown-based table-of-contents extraction and save the result

    Resolves the project and region the same way as the notebook's setup cell,
    extracts the TOC from the OCR markdown file, and writes it as JSON. Any
    exception is reported with a printed message rather than re-raised.
    """
    # Initialize Vertex AI, falling back to GOOGLE_CLOUD_PROJECT when
    # PROJECT_ID is unset or still the "xyz" placeholder, and honouring
    # GOOGLE_CLOUD_REGION when it is set.
    project_id = os.getenv("PROJECT_ID")
    if not project_id or project_id == "xyz":
        project_id = os.environ.get("GOOGLE_CLOUD_PROJECT")
    location = os.environ.get("GOOGLE_CLOUD_REGION", "australia-southeast1")
    client = initialize_vertex_ai(project_id, location)

    # Path to the markdown table of contents produced by the OCR step
    file_path = ".\\Experiments\\002\\toc_data_full.md"
    output_file = "Experiments\\002\\toc_data_full_mistral.json"

    try:
        # Ensure output directory exists before extraction starts, because
        # extract_full_toc saves intermediate results into it after each part
        os.makedirs("Experiments\\002", exist_ok=True)

        # Extract table of contents
        toc = extract_full_toc(client, file_path, is_markdown=True)

        # Write final result to file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(toc, f, indent=4)

        print(f"Complete table of contents has been written to {output_file}")

    except Exception as e:
        print(f"Error occurred: {str(e)}")

if __name__ == "__main__":
    main()

initializing TOC...
extracting next part...
starting extraction...
file read...
contents prepared...
response generated...
Extracted part: Part I—Preliminary
extracting next part...
starting extraction...
file read...
contents prepared...
response generated...
Extracted part: Part II—Interpretation
extracting next part...
starting extraction...
file read...
contents prepared...
response generated...
Extracted part: Part III—Information privacy
extracting next part...
starting extraction...
file read...
contents prepared...
response generated...
Extracted part: Part IIIA—Credit reporting
extracting next part...
starting extraction...
file read...
contents prepared...
response generated...
Extracted part: Part IIIB—Privacy codes
extracting next part...
starting extraction...
file read...
contents prepared...
response generated...
Extracted part: Part IIIC-Notification of eligible data breaches
extracting next part...
starting extraction...
file read...
contents prepared...
response gener

In [1]:
import json
from utils import fix_unicode_characters
# Post-process the final TOC JSON (the file main() writes) with the
# project-local helper fix_unicode_characters. The helper lives in utils and
# its exact behaviour is not visible in this notebook.
json_path = "Experiments/002/toc_data_full_mistral.json"
fix_unicode_characters(json_path)

Successfully processed and saved Experiments/002/toc_data_full_mistral.json


### Experiments
- Experiment 1: Extracting a structured JSON table of contents directly from Privacy_Act_AU.pdf with Gemini
- Experiment 2: Using Mistral OCR to convert the table of contents to markdown, then using that markdown as input to extract structured JSON with Gemini

### Observations:
- Experiment 1 has drawbacks when the Parts to extract are long and convoluted
- Experiment 2 solves those problems by using a better OCR-to-markdown converter. Gemini then accurately extracts JSON from the markdown of the table of contents
- Markdown-to-JSON extraction is more accurate than PDF-to-JSON extraction

In [2]:


## Extracting structured output using Mistral OCR

from mistralai import Mistral
import os

# Read the Mistral API key from the environment (raises KeyError if unset).
api_key = os.environ["Mistral_API_KEY"]

# NOTE: this rebinds the notebook-level name `client`, which the setup cell
# used for the Gemini client.
client = Mistral(api_key=api_key)

# Upload the PDF for OCR. The context manager closes the file handle once the
# upload call returns; the previous inline open() left it open.
with open(".\\processed_docs\\Cybersecurity_California_Privacy\\Cybersecurity_California_Privacy.pdf", "rb") as pdf_file:
    uploaded_pdf = client.files.upload(
        file={
            "file_name": "Cybersecurity_California_Privacy.pdf",
            "content": pdf_file,
        },
        purpose="ocr"
    )

In [3]:
import os
from mistralai import Mistral
# Build the Mistral client before using it, so this cell does not depend on
# which object the shared name `client` was last bound to (the setup cell
# binds it to a Gemini client).
api_key = os.environ["Mistral_API_KEY"]
client = Mistral(api_key=api_key)

# Get a signed URL for the previously uploaded PDF and run OCR on it.
signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        "document_url": signed_url.url,
    }
)

In [4]:
def combine_markdown_pages(ocr_response):
    """
    Join the markdown of every OCR page into one string
    
    Args:
        ocr_response: Object exposing a `pages` iterable whose items each
            have a `markdown` string attribute
        
    Returns:
        str: The pages' markdown separated by blank lines, with leading and
            trailing whitespace stripped from the combined result
    """
    page_texts = [page.markdown for page in ocr_response.pages]

    # A blank line between pages; strip() trims both ends of the joined text
    return "\n\n".join(page_texts).strip()

# Example usage:
# combined_content = combine_markdown_pages(ocr_response)
# 
# # Write to file if needed
# with open("output.md", "w", encoding="utf-8") as f:
#     f.write(combined_content)

In [34]:
# Save the combined OCR markdown as the table-of-contents file that main() reads
toc_markdown_path = "Experiments\\002\\toc_data_full.md"
combined_content = combine_markdown_pages(ocr_response)

with open(toc_markdown_path, "w", encoding="utf-8") as markdown_file:
    markdown_file.write(combined_content)

In [5]:
# Save the combined OCR markdown of the whole document next to the source PDF
document_markdown_path = ".\\processed_docs\\Cybersecurity_California_Privacy\\Cybersecurity_California_Privacy_extracted.md"
combined_content = combine_markdown_pages(ocr_response)

with open(document_markdown_path, "w", encoding="utf-8") as markdown_file:
    markdown_file.write(combined_content)