In [None]:
import os
import sys
from pathlib import Path

backend_dir = os.path.dirname(os.path.abspath(""))
if backend_dir not in sys.path:
    sys.path.insert(0, backend_dir)

from app.config import settings

print("API Key loaded:", "✓" if settings.GOOGLECLOUD_API_KEY else "✗")
print("Test PDF exists:", os.path.exists("test_dataset/test_pdf.pdf"))

API Key loaded: ✓
Test PDF exists: True
PDF file size: 208,261 bytes (203.4 KB)


In [2]:
from google.cloud import documentai_v1 as documentai
from google.api_core.client_options import ClientOptions
import json

project_id = "92552972525"
location = "us"
processor_id = "e481a89b861ab094"

opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

try:
    from google.oauth2 import service_account
    credentials_dict = json.loads(settings.GOOGLECLOUD_API_KEY)
    credentials = service_account.Credentials.from_service_account_info(credentials_dict)
    client = documentai.DocumentProcessorServiceClient(client_options=opts, credentials=credentials)
    print("✓ Using service account credentials")
except:
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    print("✓ Using default application credentials")

print(f"Project: {project_id}")
print(f"Location: {location}")
print(f"Processor: {processor_id}")

✓ Using default application credentials
Project: 92552972525
Location: us
Processor: e481a89b861ab094


E0000 00:00:1760388940.561747 4158257 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [3]:
pdf_path = "test_dataset/test_pdf.pdf"

with open(pdf_path, "rb") as f:
    pdf_content = f.read()

processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

print(f"Processing with: {processor_name}")

raw_document = documentai.RawDocument(content=pdf_content, mime_type="application/pdf")

request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)

result = client.process_document(request=request)
document = result.document

print(f"✓ Processed {pdf_path}")
print(f"Document has {len(document.pages)} page(s)")
print(f"Full text length: {len(document.text)} characters")


Processing with: projects/92552972525/locations/us/processors/e481a89b861ab094
✓ Processed test_dataset/test_pdf.pdf
Document has 0 page(s)
Full text length: 0 characters


In [4]:
print("=" * 80)
print("FULL EXTRACTED TEXT (first 1000 chars):")
print("=" * 80)
print(document.text[:1000])
print("...")
print("=" * 80)


FULL EXTRACTED TEXT (first 1000 chars):

...


In [5]:
print("\n" + "=" * 80)
print("CHUNKING DEMONSTRATION")
print("=" * 80)

def chunk_by_paragraphs(text, min_chunk_size=50):
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    
    chunks = []
    for idx, para in enumerate(paragraphs):
        if len(para) >= min_chunk_size:
            chunks.append({
                'index': idx,
                'text': para,
                'length': len(para)
            })
    
    return chunks

chunks = chunk_by_paragraphs(document.text, min_chunk_size=100)

print(f"\nCreated {len(chunks)} chunks from document")
print(f"\nFirst 3 chunks:")
for i, chunk in enumerate(chunks[:3]):
    print(f"\n--- Chunk {i} (length: {chunk['length']}) ---")
    print(chunk['text'][:200] + "..." if len(chunk['text']) > 200 else chunk['text'])



CHUNKING DEMONSTRATION

Created 0 chunks from document

First 3 chunks:


In [6]:
print("\n" + "=" * 80)
print("PAGE-LEVEL INFORMATION")
print("=" * 80)

for page_num, page in enumerate(document.pages, start=1):
    print(f"\nPage {page_num}:")
    print(f"  Paragraphs: {len(page.paragraphs)}")
    print(f"  Lines: {len(page.lines)}")
    print(f"  Tokens: {len(page.tokens)}")
    print(f"  Tables: {len(page.tables)}")
    
    if page.tables:
        for table_idx, table in enumerate(page.tables):
            print(f"\n  Table {table_idx + 1}:")
            print(f"    Rows: {len(table.body_rows)}")
            print(f"    Header Rows: {len(table.header_rows)}")



PAGE-LEVEL INFORMATION


In [7]:
print("\n" + "=" * 80)
print("PARAGRAPH EXTRACTION WITH PAGE NUMBERS (for provenance)")
print("=" * 80)

def extract_paragraphs_with_pages(document):
    paragraphs_with_pages = []
    
    for page_num, page in enumerate(document.pages, start=1):
        for para_idx, paragraph in enumerate(page.paragraphs):
            text_anchor = paragraph.layout.text_anchor
            
            if text_anchor.text_segments:
                text = ""
                for segment in text_anchor.text_segments:
                    start_index = int(segment.start_index) if segment.start_index else 0
                    end_index = int(segment.end_index) if segment.end_index else 0
                    text += document.text[start_index:end_index]
                
                if text.strip() and len(text.strip()) > 50:
                    paragraphs_with_pages.append({
                        'page_number': page_num,
                        'paragraph_index': para_idx,
                        'text': text.strip()
                    })
    
    return paragraphs_with_pages

paras = extract_paragraphs_with_pages(document)

print(f"\nExtracted {len(paras)} paragraphs with page numbers")
print(f"\nFirst 3 paragraphs:")
for para in paras[:3]:
    print(f"\n[Page {para['page_number']}]")
    print(para['text'][:200] + "..." if len(para['text']) > 200 else para['text'])



PARAGRAPH EXTRACTION WITH PAGE NUMBERS (for provenance)

Extracted 0 paragraphs with page numbers

First 3 paragraphs:



DEBUGGING: Check processor status

✓ Processor found!
Name: layout-processor
Type: LAYOUT_PARSER_PROCESSOR
State: 1

✓ Processor is ENABLED and ready
