From a844ea9c31c7cfb03071f66918ce0cd9ed3f4566 Mon Sep 17 00:00:00 2001 From: Sami Rusani Date: Mon, 16 Mar 2026 09:49:07 +0100 Subject: [PATCH 1/2] Sprint 5L: PDF artifact parsing packet --- .ai/active/SPRINT_PACKET.md | 132 ++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 58 deletions(-) diff --git a/.ai/active/SPRINT_PACKET.md b/.ai/active/SPRINT_PACKET.md index 6d35e69..c42b6a1 100644 --- a/.ai/active/SPRINT_PACKET.md +++ b/.ai/active/SPRINT_PACKET.md @@ -2,111 +2,127 @@ ## Sprint Title -Sprint 5K: Project Truth Synchronization After Hybrid Artifact Compile +Sprint 5L: PDF Artifact Parsing V0 ## Sprint Type -refactor +feature ## Sprint Reason -Sprint 5J is implemented and the feature path is still correct, but the live truth artifacts are materially stale. `ARCHITECTURE.md` still describes the repo as current through Sprint 5H, and `ROADMAP.md` still says current through Sprint 5A. Before opening richer document parsing, read-only connectors, or any UI work, Control Tower needs the architecture and roadmap truth reset to the accepted repo state. +Sprint 5K re-synchronized project truth through Sprint 5J, and the agreed next delivery focus is richer document parsing on top of the shipped rooted workspace, durable chunk, and hybrid artifact compile baseline. The narrowest safe next slice is PDF ingestion only, not a broad “rich documents” sprint that mixes PDF, DOCX, OCR, connectors, or UI. ## Sprint Intent -Synchronize the live truth artifacts with the implemented and review-passed repo state through Sprint 5J, so future planning, handoff, and review work all start from accurate architecture, roadmap, and current-state documents. +Extend the existing artifact-ingestion seam so registered PDF artifacts can be ingested into the existing durable `task_artifact_chunks` substrate through deterministic text extraction, without changing retrieval contracts, compile contracts, connectors, or UI. ## Git Instructions -- Branch Name: `codex/sprint-5k-project-truth-sync` +- Branch Name: `codex/sprint-5l-pdf-artifact-parsing-v0` - Base Branch: `main` - PR Strategy: one sprint branch, one PR, no stacked PRs unless Control Tower explicitly opens a follow-up sprint - Merge Policy: squash merge only after reviewer `PASS` and explicit Control Tower merge approval ## Why This Sprint -- Sprint 5I shipped compile-path semantic artifact retrieval. -- Sprint 5J shipped deterministic hybrid lexical-plus-semantic artifact merge in compile. -- `ARCHITECTURE.md` is still describing the accepted repo slice through Sprint 5H and still treats compile-path semantic artifact use and hybrid artifact retrieval as deferred. -- `ROADMAP.md` still says the accepted repo state is current through Sprint 5A. -- Planning from stale truth at this point would increase scope drift risk just before richer document parsing and connector work. +- Sprint 5A shipped deterministic rooted task-workspace provisioning. +- Sprint 5C shipped explicit task-artifact registration. +- Sprint 5D shipped deterministic local text-artifact ingestion into durable chunk rows. +- Sprint 5E through 5J shipped lexical retrieval, semantic retrieval, and hybrid compile-path artifact retrieval on top of those persisted chunk rows. +- Sprint 5K re-synchronized the truth artifacts and explicitly set richer document parsing as the next narrow move. +- The safest next step is PDF extraction only, feeding the already-shipped chunk and retrieval seams without expanding into OCR, DOCX, connectors, or UI. ## In Scope -- Audit the accepted implemented slice from the repo and passed sprint reports through Sprint 5J. -- Update `ARCHITECTURE.md` so it accurately describes the implemented seams through: - - compile-path semantic artifact retrieval - - deterministic hybrid lexical-plus-semantic artifact merge in compile - - current artifact chunk contracts and retrieval boundaries -- Update `ROADMAP.md` so: - - completed/current milestone state reflects the accepted repo state through Sprint 5J - - the next delivery focus is framed from the actual shipped artifact retrieval baseline - - stale “current position” language is corrected -- Update `.ai/handoff/CURRENT_STATE.md` so: - - implemented areas and risks reflect the repo through Sprint 5J - - the current milestone position is correct - - the immediate next move matches the next narrow sprint boundary after truth sync -- Update `BUILD_REPORT.md` with the truth-sync evidence and exact files corrected. +- Extend schema and contracts only as narrowly needed to support PDF ingestion metadata, for example: + - `task_artifacts.ingestion_status` reuse if no new status is required + - optional deterministic extraction metadata on artifact detail or ingestion responses if needed +- Define typed contracts for: + - PDF artifact-ingestion requests if they differ from the current generic artifact-ingestion path + - artifact-ingestion responses updated for PDF extraction metadata if needed + - artifact detail or chunk summary metadata updated for PDF ingestion if needed +- Extend the existing ingestion seam so it: + - accepts already-registered visible PDF artifacts + - resolves rooted local file paths from persisted workspace plus artifact relative path + - supports one explicit PDF extraction path only + - extracts deterministic text from PDFs without OCR + - normalizes extracted text before chunking + - persists ordered chunk rows into the existing `task_artifact_chunks` table + - updates artifact ingestion status deterministically +- Add unit and integration tests for: + - supported PDF ingestion + - deterministic chunk ordering and chunk boundaries from extracted PDF text + - rooted path enforcement during PDF ingestion + - rejection of scanned-image or textless PDFs when no extractable text is present + - per-user isolation + - stable response shape ## Out of Scope -- No schema changes. -- No API changes. -- No runtime code changes. -- No richer document parsing. +- No DOCX ingestion. +- No OCR. +- No image extraction. +- No changes to lexical retrieval contracts. +- No changes to semantic retrieval contracts. +- No compile contract changes. - No Gmail or Calendar connector scope. - No runner-style orchestration. - No UI work. ## Required Deliverables -- Updated `ARCHITECTURE.md` aligned to the implemented repo state through Sprint 5J. -- Updated `ROADMAP.md` with correct completed/current/next milestone sequencing. -- Updated `.ai/handoff/CURRENT_STATE.md` reflecting the actual shipped state and immediate next move. -- Updated `BUILD_REPORT.md` describing exactly which truth artifacts were synchronized and what evidence was used. +- Narrow ingestion support for visible PDF artifacts using the existing artifact and chunk seams. +- Stable contract updates only where PDF extraction metadata is necessary. +- Unit and integration coverage for PDF extraction, rooted-path safety, deterministic chunk persistence, and isolation. +- Updated `BUILD_REPORT.md` with exact verification results and explicit deferred scope. ## Acceptance Criteria -- `ARCHITECTURE.md` describes compile-path semantic artifact retrieval and hybrid artifact compile merge as implemented behavior, not deferred work. -- `ROADMAP.md` no longer claims the repo is current only through Sprint 5A. -- `.ai/handoff/CURRENT_STATE.md` no longer describes the repo as current only through Sprint 5D. -- Truth artifacts clearly distinguish between implemented behavior and later planned work. -- No runtime, schema, API, connector, runner, or UI changes appear in the sprint diff. +- A client can ingest one supported visible PDF artifact into durable ordered chunk rows using the existing artifact-ingestion seam. +- PDF ingestion reads only files rooted under the persisted task workspace boundary. +- Extracted text is normalized and chunked deterministically into the existing `task_artifact_chunks` contract. +- Textless or unsupported PDFs are rejected deterministically rather than silently producing misleading chunks. +- Existing lexical, semantic, and hybrid artifact retrieval contracts continue to operate over the persisted chunk rows without contract changes. +- `./.venv/bin/python -m pytest tests/unit` passes. +- `./.venv/bin/python -m pytest tests/integration` passes. +- No DOCX, OCR, connector, runner, compile-contract, or UI scope enters the sprint. ## Implementation Constraints -- Keep this sprint documentation-only and boring. -- Use accepted repo state and passed sprint reports as evidence, not aspiration. -- Prefer explicit “implemented now” versus “planned later” boundaries. -- If a truth artifact cannot be updated confidently from accepted evidence, narrow the statement rather than guessing. -- Do not widen into product changes just because the architecture text is stale. +- Keep richer parsing narrow and boring. +- Reuse the existing rooted `task_workspaces`, `task_artifacts`, and `task_artifact_chunks` seams rather than creating a parallel document store. +- Support PDF text extraction only; do not introduce OCR fallback in the same sprint. +- Preserve existing retrieval and compile contracts by feeding the already-shipped chunk substrate. +- Keep extraction and chunking deterministic and testable from local files alone. ## Suggested Work Breakdown -1. Audit the implemented repo state and accepted sprint reports through Sprint 5J. -2. Update `ARCHITECTURE.md` to reflect the current shipped seams and boundaries. -3. Update `ROADMAP.md` to reflect actual completed and current milestone state. -4. Update `.ai/handoff/CURRENT_STATE.md` to reflect actual current state and the immediate next move. -5. Update `BUILD_REPORT.md` with exact truth-sync evidence and scope confirmation. +1. Define any minimal PDF-ingestion contract updates needed. +2. Implement deterministic rooted PDF text extraction in the existing artifact-ingestion seam. +3. Normalize extracted text and persist ordered chunk rows into the existing chunk store. +4. Add deterministic failure behavior for textless PDFs. +5. Add unit and integration tests. +6. Update `BUILD_REPORT.md` with executed verification. ## Build Report Requirements `BUILD_REPORT.md` must include: -- exactly which truth artifacts were updated -- which accepted reports or repo evidence were used -- the specific stale statements that were corrected -- confirmation that no runtime or schema changes were made -- what remains intentionally deferred after truth synchronization +- the exact PDF-ingestion contract changes introduced, if any +- the PDF extraction path and chunking rule used +- exact commands run +- unit and integration test results +- one example PDF artifact-ingestion response +- one example chunk list response produced from a PDF artifact +- what remains intentionally deferred to later milestones ## Review Focus `REVIEW_REPORT.md` should verify: -- the sprint stayed documentation-only -- `ARCHITECTURE.md`, `ROADMAP.md`, and `.ai/handoff/CURRENT_STATE.md` now match the implemented repo state through Sprint 5J -- compile-path semantic artifact retrieval and hybrid artifact merge are documented accurately -- milestone sequencing is truthful and current -- no hidden runtime, schema, API, connector, runner, or UI scope entered the sprint +- the sprint stayed limited to PDF artifact parsing through the existing ingestion seam +- PDF ingestion reuses the existing rooted workspace, artifact, and chunk contracts +- extraction determinism, chunk ordering, rooted-path safety, and isolation are test-backed +- no hidden DOCX, OCR, connector, runner, compile-contract, or UI scope entered the sprint ## Exit Condition -This sprint is complete when the project truth artifacts accurately describe the implemented repo state through Sprint 5J and future planning can proceed from synchronized architecture, roadmap, and current-state documents. +This sprint is complete when the repo can ingest supported visible PDF artifacts into deterministic durable chunk rows through the existing artifact-ingestion seam, verify the full path with Postgres-backed tests, and still defer broader document parsing, connectors, and UI. From a854e46f85bdf2c28f9d108f892786c669ee5c7f Mon Sep 17 00:00:00 2001 From: Sami Rusani Date: Mon, 16 Mar 2026 09:55:47 +0100 Subject: [PATCH 2/2] Sprint 5L: PDF artifact parsing v0 --- BUILD_REPORT.md | 196 ++++-- REVIEW_REPORT.md | 35 +- apps/api/src/alicebot_api/artifacts.py | 601 ++++++++++++++++++- tests/integration/test_task_artifacts_api.py | 283 ++++++++- tests/unit/test_artifacts.py | 216 ++++++- tests/unit/test_artifacts_main.py | 8 +- 6 files changed, 1238 insertions(+), 101 deletions(-) diff --git a/BUILD_REPORT.md b/BUILD_REPORT.md index f85f207..d3fcf57 100644 --- a/BUILD_REPORT.md +++ b/BUILD_REPORT.md @@ -2,76 +2,168 @@ ## sprint objective -Synchronize the live truth artifacts with the accepted implemented repo state through Sprint 5J so architecture, roadmap, and current-state planning all start from the shipped compile and artifact-retrieval baseline. +Implement narrow PDF artifact parsing on the existing artifact-ingestion seam so already-registered visible PDF artifacts can be ingested into durable `task_artifact_chunks` rows without changing retrieval contracts, compile contracts, connectors, or UI. ## completed work -- Updated `ARCHITECTURE.md` to state that the accepted repo slice is current through Sprint 5J instead of Sprint 5H. -- Corrected `ARCHITECTURE.md` so compile-path semantic artifact retrieval and deterministic hybrid lexical-plus-semantic artifact merge are documented as implemented behavior, not deferred work. -- Updated `ARCHITECTURE.md` retrieval and testing sections to reflect the current artifact chunk contracts, hybrid compile ordering, provenance, and test coverage through Sprints 5I and 5J. -- Updated `ROADMAP.md` to state that the accepted repo state is current through Sprint 5J instead of Sprint 5A. -- Reframed `ROADMAP.md` so the next narrow sprint is richer document parsing on top of the shipped rooted workspace, durable chunk, and hybrid artifact compile baseline. -- Updated `.ai/handoff/CURRENT_STATE.md` to state that the working repo is current through Sprint 5J instead of Sprint 5D. -- Corrected `.ai/handoff/CURRENT_STATE.md` so artifact retrieval, artifact embeddings, compile-path semantic artifact retrieval, and the hybrid compile merge are described as shipped. -- Kept the sprint documentation-only: no runtime, schema, API, connector, runner, or UI files were changed. - -## truth-sync evidence used - -- Repo implementation evidence: - - `apps/api/src/alicebot_api/compiler.py` - - `apps/api/src/alicebot_api/contracts.py` - - `apps/api/src/alicebot_api/main.py` -- Accepted repo-test evidence: - - `tests/integration/test_context_compile.py` - - `tests/unit/test_compiler.py` - - `tests/unit/test_main.py` - - `tests/unit/test_response_generation.py` -- Accepted sprint evidence already present in the repo: - - `docs/archive/sprints/2026-03-13-sprint-5a-build-report.md` - - `docs/archive/sprints/2026-03-13-sprint-5a-review-report.md` -- Sprint packet requirements: - - `.ai/active/SPRINT_PACKET.md` - -## specific stale statements corrected - -- `ARCHITECTURE.md` no longer says the accepted repo slice is current only through Sprint 5H. -- `ARCHITECTURE.md` no longer says compile-path semantic artifact use and hybrid artifact retrieval are planned later. -- `ROADMAP.md` no longer says the accepted repo state is current only through Sprint 5A. -- `.ai/handoff/CURRENT_STATE.md` no longer says the working repo state is current only through Sprint 5D. -- `.ai/handoff/CURRENT_STATE.md` no longer says retrieval, ranking, or embeddings over artifact chunks are not implemented. +- Extended artifact media-type inference and validation to accept `application/pdf` and `.pdf` artifacts on the existing ingestion path. +- Implemented deterministic local PDF text extraction in `apps/api/src/alicebot_api/artifacts.py` by: + - parsing the PDF object graph from local bytes only + - walking the catalog/pages tree in page order + - reading `/Contents` streams only + - supporting unfiltered streams and `/FlateDecode` streams only + - extracting text from PDF text-show operators (`Tj`, `TJ`, `'`, `"`) only + - rejecting PDFs that are invalid, textless, or use unsupported stream filters/structures +- Kept ingestion status behavior unchanged: successful PDF ingestion updates `task_artifacts.ingestion_status` to `ingested`; rejected PDFs remain on the existing pending path and do not introduce a new status. +- Reused the existing normalization and chunking seam after extraction: + - line endings normalize through `normalize_artifact_text()` + - chunk persistence still uses `task_artifact_chunks` + - chunk rule remains `normalized_utf8_text_fixed_window_1000_chars_v1` +- Added unit coverage for: + - deterministic PDF chunk persistence + - textless PDF rejection + - PDF-rooted path enforcement + - updated unsupported-media validation +- Added integration coverage for: + - supported PDF ingestion with stable response shape + - deterministic PDF chunk ordering and boundaries + - PDF per-user isolation + - textless PDF rejection + - rooted-path enforcement during PDF ingestion + +## exact PDF-ingestion contract changes introduced + +- No request contract changes. +- No response contract shape changes. +- No schema changes. +- The only contract-level behavior change is that the existing artifact-ingestion seam now accepts `application/pdf` as a supported artifact media type and continues returning the existing `TaskArtifactIngestionResponse` and `TaskArtifactChunkListResponse` shapes. + +## PDF extraction path and chunking rule used + +- Extraction path: + - existing `POST /v0/task-artifacts/{task_artifact_id}/ingest` + - resolve persisted workspace `local_path` plus persisted artifact `relative_path` + - enforce rooted workspace boundary before any read + - for PDFs, parse local file bytes, walk the PDF page tree, decode supported content streams, and extract text-show operations in deterministic stream order + - reject if no extractable text is found +- Chunking rule: + - normalize extracted text with CRLF/CR to LF conversion + - split into fixed windows of 1000 characters + - persist ordered rows with `sequence_no`, `char_start`, `char_end_exclusive`, and `text` + - reported chunking rule string: `normalized_utf8_text_fixed_window_1000_chars_v1` ## incomplete work -- None within Sprint 5K scope. +- None within Sprint 5L scope. ## files changed -- `ARCHITECTURE.md` -- `ROADMAP.md` -- `.ai/handoff/CURRENT_STATE.md` +- `apps/api/src/alicebot_api/artifacts.py` +- `tests/unit/test_artifacts.py` +- `tests/unit/test_artifacts_main.py` +- `tests/integration/test_task_artifacts_api.py` - `BUILD_REPORT.md` ## tests run -- Documentation-scope verification: - - `rg -n "current through Sprint 5A|current through Sprint 5D|current through Sprint 5H|compile-path semantic artifact use|hybrid artifact retrieval are planned later|Retrieval, ranking, or embeddings over artifact chunks" ARCHITECTURE.md ROADMAP.md .ai/handoff/CURRENT_STATE.md` -> no matches - - `git diff --name-only` - - `git diff --stat -- ARCHITECTURE.md ROADMAP.md .ai/handoff/CURRENT_STATE.md BUILD_REPORT.md` -- Runtime tests were not run because Sprint 5K is documentation-only and made no schema, API, or runtime code changes. +- `./.venv/bin/python -m pytest tests/unit/test_artifacts.py tests/unit/test_artifacts_main.py` + - Result: `40 passed in 0.45s` +- `./.venv/bin/python -m pytest tests/integration/test_task_artifacts_api.py` + - First sandboxed attempt failed because the suite could not reach the local Postgres instance (`psycopg.OperationalError: ... Operation not permitted`). +- `./.venv/bin/python -m pytest tests/unit` + - Result: `382 passed in 0.73s` +- `./.venv/bin/python -m pytest tests/integration` + - Result: `120 passed in 34.65s` + +## unit and integration test results + +- Unit suite passed in full. +- Integration suite passed in full against Postgres-backed tests. +- The new PDF-specific integration coverage is included in the passing `tests/integration/test_task_artifacts_api.py` module. + +## one example PDF artifact-ingestion response + +Example verified by the PDF integration assertion: + +```json +{ + "artifact": { + "id": "", + "task_id": "", + "task_workspace_id": "", + "status": "registered", + "ingestion_status": "ingested", + "relative_path": "docs/spec.pdf", + "media_type_hint": "application/pdf", + "created_at": "", + "updated_at": "" + }, + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "application/pdf", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"] + } +} +``` + +## one example chunk list response produced from a PDF artifact + +Example verified by the PDF integration assertion: + +```json +{ + "items": [ + { + "id": "", + "task_artifact_id": "", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 1000, + "text": "<998 times 'A'>\nB", + "created_at": "", + "updated_at": "" + }, + { + "id": "", + "task_artifact_id": "", + "sequence_no": 2, + "char_start": 1000, + "char_end_exclusive": 1006, + "text": "BBBB\nC", + "created_at": "", + "updated_at": "" + } + ], + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "application/pdf", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"] + } +} +``` ## blockers/issues -- No implementation blockers. -- The repo does not currently contain archived Sprint 5I or Sprint 5J reports under `docs/archive/sprints`, so the durable in-repo evidence for those seams is the shipped code and test suite rather than archived sprint-report files. +- No repo-level implementation blockers remained. +- The integration suite required elevated access to reach the local Postgres test instance from this environment; after rerunning with that access, the full integration suite passed. -## intentionally deferred after truth synchronization +## what remains intentionally deferred to later milestones -- Rich document parsing beyond the current `text/plain` and `text/markdown` ingestion seam. -- Read-only Gmail and Calendar connectors. -- Runner-style orchestration and automatic multi-step progression. -- Artifact reranking or weighted fusion beyond the current lexical-first hybrid compile merge. -- UI work. +- DOCX ingestion +- OCR +- image extraction from PDFs +- connector work +- runner-style orchestration +- retrieval-contract changes +- semantic-contract changes +- compile-contract changes +- UI work +- broader PDF compatibility beyond the current narrow text-only local content-stream extraction path ## recommended next step -Open one narrow sprint for richer document parsing that preserves the existing rooted workspace, durable chunk, and shipped hybrid artifact compile contracts. +Open a follow-up sprint only if broader document coverage is needed, starting with an explicit decision on whether to extend PDF compatibility further or add a separate DOCX ingestion seam without changing the current retrieval and compile contracts. diff --git a/REVIEW_REPORT.md b/REVIEW_REPORT.md index f2086f3..0fa3d0c 100644 --- a/REVIEW_REPORT.md +++ b/REVIEW_REPORT.md @@ -1,32 +1,39 @@ verdict: PASS criteria met -- The sprint stayed documentation-only. `git diff --name-only` shows changes only to `.ai/handoff/CURRENT_STATE.md`, `ARCHITECTURE.md`, `BUILD_REPORT.md`, `ROADMAP.md`, and this review report. -- `ARCHITECTURE.md` describes compile-path semantic artifact retrieval and deterministic hybrid lexical-plus-semantic artifact merge as implemented behavior, not deferred work, and that matches the shipped compile contract and tests. -- `ROADMAP.md` no longer claims the repo is current only through Sprint 5A and now frames the next delivery focus from the actual shipped Sprint 5J artifact-retrieval baseline. -- `.ai/handoff/CURRENT_STATE.md` no longer claims the repo is current only through Sprint 5D and now reflects the shipped Sprint 5J seams accurately. -- The truth artifacts distinguish implemented behavior from deferred work. Rich document parsing, connectors, runner orchestration, UI work, and artifact reranking beyond the shipped lexical-first merge remain clearly deferred. -- No runtime, schema, API, connector, runner, or UI changes appear in the sprint diff. -- `BUILD_REPORT.md` now uses durable in-repo evidence correctly and no longer relies on the overwritten active Sprint 5J build report as a cited source. +- The sprint stayed within the existing artifact-ingestion seam. The runtime diff is limited to [artifacts.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/artifacts.py), the artifact-focused tests, and report files; no connector, runner, compile-contract, or UI code entered scope. +- PDF ingestion reuses the rooted `task_workspaces`, `task_artifacts`, and `task_artifact_chunks` seams without schema or response-shape changes. `application/pdf` is accepted on the existing ingest path and the existing `TaskArtifactIngestionResponse` / chunk-list shapes are preserved. +- Rooted-path safety is enforced during PDF ingestion. Both unit and Postgres-backed integration coverage verify that a persisted relative-path escape is rejected deterministically. +- Extracted PDF text is normalized and chunked deterministically into ordered `task_artifact_chunks` rows. The new tests verify stable 1000-character boundaries, `sequence_no` ordering, and stable chunk-list summary metadata. +- Textless PDFs are rejected deterministically instead of producing misleading chunks. Unit and integration tests both assert the explicit `does not contain extractable PDF text` failure. +- Per-user isolation remains intact for PDF artifacts. The integration suite verifies that another user cannot ingest or list chunks for the owner’s registered PDF artifact. +- Acceptance-suite verification was rerun during review: +- `./.venv/bin/python -m pytest tests/unit/test_artifacts.py tests/unit/test_artifacts_main.py` -> `40 passed` +- `./.venv/bin/python -m pytest tests/integration/test_task_artifacts_api.py` -> `8 passed` +- `./.venv/bin/python -m pytest tests/unit` -> `382 passed` +- `./.venv/bin/python -m pytest tests/integration` -> `120 passed` criteria missed - None. quality issues -- No blocking quality issues found in the final documentation set. +- No blocking implementation defects were found in the Sprint 5L scope. +- The PDF parser is intentionally narrow. It only covers direct local content-stream text extraction with unfiltered and `/FlateDecode` streams, and broader compatibility remains outside this sprint. That matches the packet’s narrow-slice intent and is documented as deferred, so it is not a blocker. regression risks -- No runtime or schema regression risk identified because the sprint remains documentation-only. -- Residual process risk remains that future truth-sync sprints could repeat the same provenance mistake if active artifacts cite other active files that are being replaced in the same sprint. +- Retrieval, semantic retrieval, and hybrid compile behavior remain on the unchanged chunk substrate and the full integration suite still passes, but there is no new PDF-specific end-to-end retrieval or compile assertion. That is a residual regression risk, not a current failure. +- Future widening of PDF support should treat parser compatibility as explicit scope. The implementation is deliberately not a general-purpose PDF engine. docs issues -- None blocking. +- `BUILD_REPORT.md` is complete enough for the sprint packet and matches the shipped diff. +- Optional follow-up only: if the team wants archival clarity, spell out in future milestone docs that “supported PDF” currently means the narrow text-only extraction path implemented in [artifacts.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/artifacts.py), not general PDF compatibility. should anything be added to RULES.md? -- Yes. Add a short rule that active truth artifacts should cite durable in-repo evidence only, not active files being overwritten in the same sprint. +- No. should anything update ARCHITECTURE.md? -- No. +- No. The sprint does not change the core architecture boundaries; it extends the existing artifact-ingestion seam without altering the workspace, artifact, chunk, retrieval, or compile contracts. recommended next action -- Accept Sprint 5K as complete and merge after normal approval flow. +- Accept Sprint 5L as complete and merge after normal approval flow. +- If the next milestone stays on richer document parsing, add one PDF-backed retrieval or compile regression test before widening into broader PDF compatibility, DOCX, or OCR. diff --git a/apps/api/src/alicebot_api/artifacts.py b/apps/api/src/alicebot_api/artifacts.py index d3b794f..3e12638 100644 --- a/apps/api/src/alicebot_api/artifacts.py +++ b/apps/api/src/alicebot_api/artifacts.py @@ -1,6 +1,8 @@ from __future__ import annotations import re +import zlib +from dataclasses import dataclass from pathlib import Path from typing import cast from uuid import UUID @@ -37,11 +39,17 @@ from alicebot_api.workspaces import TaskWorkspaceNotFoundError SUPPORTED_TEXT_ARTIFACT_MEDIA_TYPES = ("text/plain", "text/markdown") -SUPPORTED_TEXT_ARTIFACT_EXTENSIONS = { +SUPPORTED_PDF_ARTIFACT_MEDIA_TYPE = "application/pdf" +SUPPORTED_ARTIFACT_MEDIA_TYPES = ( + *SUPPORTED_TEXT_ARTIFACT_MEDIA_TYPES, + SUPPORTED_PDF_ARTIFACT_MEDIA_TYPE, +) +SUPPORTED_ARTIFACT_EXTENSIONS = { ".txt": "text/plain", ".text": "text/plain", ".md": "text/markdown", ".markdown": "text/markdown", + ".pdf": SUPPORTED_PDF_ARTIFACT_MEDIA_TYPE, } TASK_ARTIFACT_CHUNK_MAX_CHARS = 1000 TASK_ARTIFACT_CHUNKING_RULE = "normalized_utf8_text_fixed_window_1000_chars_v1" @@ -49,6 +57,90 @@ "casefolded_unicode_word_overlap_unique_query_terms_v1" ) _LEXICAL_TERM_PATTERN = re.compile(r"\w+") +_PDF_INDIRECT_OBJECT_PATTERN = re.compile(rb"(?s)(\d+)\s+(\d+)\s+obj\b(.*?)\bendobj\b") +_PDF_REFERENCE_PATTERN = re.compile(rb"(\d+)\s+(\d+)\s+R") +_PDF_NUMERIC_TOKEN_PATTERN = re.compile(rb"[+-]?(?:\d+(?:\.\d+)?|\.\d+)") +_PDF_CONTENT_OPERATORS = { + b'"', + b"'", + b"*", + b"B", + b"BT", + b"BX", + b"B*", + b"BI", + b"BMC", + b"BDC", + b"b", + b"b*", + b"cm", + b"CS", + b"cs", + b"Do", + b"DP", + b"EI", + b"EMC", + b"ET", + b"EX", + b"f", + b"F", + b"f*", + b"G", + b"g", + b"gs", + b"h", + b"i", + b"ID", + b"j", + b"J", + b"K", + b"k", + b"l", + b"M", + b"m", + b"MP", + b"n", + b"q", + b"Q", + b"re", + b"RG", + b"rg", + b"ri", + b"s", + b"S", + b"SC", + b"sc", + b"SCN", + b"scn", + b"sh", + b"T*", + b"Tc", + b"Td", + b"TD", + b"Tf", + b"TJ", + b"Tj", + b"TL", + b"Tm", + b"Tr", + b"Ts", + b"Tw", + b"Tz", + b"v", + b"w", + b"W", + b"W*", + b"y", +} + + +@dataclass(frozen=True, slots=True) +class _PdfObject: + object_id: int + generation: int + dictionary: bytes + stream: bytes | None + raw_content: bytes class TaskArtifactNotFoundError(LookupError): @@ -136,15 +228,15 @@ def infer_task_artifact_media_type(row: TaskArtifactRow) -> str | None: return row["media_type_hint"] artifact_path = Path(row["relative_path"]) - return SUPPORTED_TEXT_ARTIFACT_EXTENSIONS.get(artifact_path.suffix.lower()) + return SUPPORTED_ARTIFACT_EXTENSIONS.get(artifact_path.suffix.lower()) def resolve_supported_task_artifact_media_type(row: TaskArtifactRow) -> str: media_type = infer_task_artifact_media_type(row) - if media_type in SUPPORTED_TEXT_ARTIFACT_MEDIA_TYPES: + if media_type in SUPPORTED_ARTIFACT_MEDIA_TYPES: return cast(str, media_type) - supported_types = ", ".join(SUPPORTED_TEXT_ARTIFACT_MEDIA_TYPES) + supported_types = ", ".join(SUPPORTED_ARTIFACT_MEDIA_TYPES) raise TaskArtifactValidationError( f"artifact {row['relative_path']} has unsupported media type " f"{media_type or 'unknown'}; supported types: {supported_types}" @@ -167,6 +259,494 @@ def chunk_normalized_artifact_text( return chunks +def _extract_text_from_utf8_artifact_bytes(*, relative_path: str, payload: bytes) -> str: + try: + return payload.decode("utf-8") + except UnicodeDecodeError as exc: + raise TaskArtifactValidationError( + f"artifact {relative_path} is not valid UTF-8 text" + ) from exc + + +def _extract_pdf_name(dictionary: bytes, key: bytes) -> bytes | None: + match = re.search(rb"/" + re.escape(key) + rb"\s*/([A-Za-z0-9_.#-]+)", dictionary) + if match is None: + return None + return match.group(1) + + +def _extract_pdf_reference(dictionary: bytes, key: bytes) -> tuple[int, int] | None: + match = re.search(rb"/" + re.escape(key) + rb"\s+(\d+)\s+(\d+)\s+R", dictionary) + if match is None: + return None + return int(match.group(1)), int(match.group(2)) + + +def _extract_pdf_reference_array(dictionary: bytes, key: bytes) -> list[tuple[int, int]]: + match = re.search(rb"/" + re.escape(key) + rb"\s*\[(.*?)\]", dictionary, re.DOTALL) + if match is None: + return [] + return [ + (int(ref_match.group(1)), int(ref_match.group(2))) + for ref_match in _PDF_REFERENCE_PATTERN.finditer(match.group(1)) + ] + + +def _extract_pdf_filter_names(dictionary: bytes) -> list[bytes]: + array_match = re.search(rb"/Filter\s*\[(.*?)\]", dictionary, re.DOTALL) + if array_match is not None: + return re.findall(rb"/([A-Za-z0-9_.#-]+)", array_match.group(1)) + + filter_name = _extract_pdf_name(dictionary, b"Filter") + if filter_name is None: + return [] + return [filter_name] + + +def _extract_pdf_stream_payload( + *, + relative_path: str, + dictionary: bytes, + body: bytes, + stream_start: int, +) -> bytes: + length_match = re.search(rb"/Length\s+(\d+)", dictionary) + if length_match is not None: + stream_length = int(length_match.group(1)) + stream_end = stream_start + stream_length + if stream_end <= len(body): + return body[stream_start:stream_end] + + stream_end = body.rfind(b"endstream") + if stream_end == -1 or stream_end < stream_start: + raise TaskArtifactValidationError( + f"artifact {relative_path} contains an unreadable PDF stream" + ) + + payload = body[stream_start:stream_end] + if payload.endswith(b"\r\n"): + return payload[:-2] + if payload.endswith((b"\n", b"\r")): + return payload[:-1] + return payload + + +def _parse_pdf_objects(*, relative_path: str, payload: bytes) -> dict[tuple[int, int], _PdfObject]: + objects: dict[tuple[int, int], _PdfObject] = {} + for match in _PDF_INDIRECT_OBJECT_PATTERN.finditer(payload): + object_id = int(match.group(1)) + generation = int(match.group(2)) + body = match.group(3).strip() + dictionary = body + stream: bytes | None = None + stream_index = body.find(b"stream") + if stream_index != -1: + dictionary = body[:stream_index].rstrip() + stream_start = stream_index + len(b"stream") + if body[stream_start : stream_start + 2] == b"\r\n": + stream_start += 2 + elif body[stream_start : stream_start + 1] in (b"\r", b"\n"): + stream_start += 1 + stream = _extract_pdf_stream_payload( + relative_path=relative_path, + dictionary=dictionary, + body=body, + stream_start=stream_start, + ) + objects[(object_id, generation)] = _PdfObject( + object_id=object_id, + generation=generation, + dictionary=dictionary, + stream=stream, + raw_content=body, + ) + + if not objects: + raise TaskArtifactValidationError(f"artifact {relative_path} is not a valid PDF") + return objects + + +def _read_pdf_literal_string(payload: bytes, start: int) -> tuple[bytes, int]: + cursor = start + 1 + depth = 1 + result = bytearray() + while cursor < len(payload): + current = payload[cursor] + if current == ord("\\"): + cursor += 1 + if cursor >= len(payload): + break + escaped = payload[cursor] + if escaped in b"nrtbf()\\": + result.extend( + { + ord("n"): b"\n", + ord("r"): b"\r", + ord("t"): b"\t", + ord("b"): b"\b", + ord("f"): b"\f", + ord("("): b"(", + ord(")"): b")", + ord("\\"): b"\\", + }[escaped] + ) + cursor += 1 + continue + if escaped in b"\r\n": + if escaped == ord("\r") and payload[cursor : cursor + 2] == b"\r\n": + cursor += 2 + else: + cursor += 1 + continue + if chr(escaped).isdigit(): + octal_digits = bytes([escaped]) + cursor += 1 + while cursor < len(payload) and len(octal_digits) < 3 and chr(payload[cursor]).isdigit(): + octal_digits += bytes([payload[cursor]]) + cursor += 1 + result.append(int(octal_digits, 8)) + continue + result.append(escaped) + cursor += 1 + continue + if current == ord("("): + depth += 1 + result.append(current) + cursor += 1 + continue + if current == ord(")"): + depth -= 1 + cursor += 1 + if depth == 0: + return bytes(result), cursor + result.append(current) + continue + result.append(current) + cursor += 1 + + raise TaskArtifactValidationError("PDF literal string terminated unexpectedly") + + +def _read_pdf_hex_string(payload: bytes, start: int) -> tuple[bytes, int]: + cursor = start + 1 + hex_digits = bytearray() + while cursor < len(payload): + current = payload[cursor] + if current == ord(">"): + cursor += 1 + break + if chr(current).isspace(): + cursor += 1 + continue + hex_digits.append(current) + cursor += 1 + + if len(hex_digits) % 2 == 1: + hex_digits.append(ord("0")) + return bytes.fromhex(hex_digits.decode("ascii")), cursor + + +def _skip_pdf_whitespace_and_comments(payload: bytes, start: int) -> int: + cursor = start + while cursor < len(payload): + current = payload[cursor] + if chr(current).isspace(): + cursor += 1 + continue + if current == ord("%"): + while cursor < len(payload) and payload[cursor] not in b"\r\n": + cursor += 1 + continue + break + return cursor + + +def _read_pdf_content_token(payload: bytes, start: int) -> tuple[object | None, int]: + cursor = _skip_pdf_whitespace_and_comments(payload, start) + if cursor >= len(payload): + return None, cursor + + current = payload[cursor] + if current == ord("("): + return _read_pdf_literal_string(payload, cursor) + if current == ord("<") and payload[cursor : cursor + 2] != b"<<": + return _read_pdf_hex_string(payload, cursor) + if current == ord("["): + items: list[object] = [] + cursor += 1 + while True: + cursor = _skip_pdf_whitespace_and_comments(payload, cursor) + if cursor >= len(payload): + raise TaskArtifactValidationError("PDF array terminated unexpectedly") + if payload[cursor] == ord("]"): + return items, cursor + 1 + item, cursor = _read_pdf_content_token(payload, cursor) + if item is None: + raise TaskArtifactValidationError("PDF array terminated unexpectedly") + items.append(item) + if current == ord("/"): + cursor += 1 + token_start = cursor + while cursor < len(payload) and not chr(payload[cursor]).isspace() and payload[cursor] not in b"()<>[]{}/%": + cursor += 1 + return payload[token_start - 1 : cursor], cursor + + token_start = cursor + while cursor < len(payload) and not chr(payload[cursor]).isspace() and payload[cursor] not in b"()<>[]{}/%": + cursor += 1 + return payload[token_start:cursor], cursor + + +def _decode_pdf_text_bytes(raw: bytes) -> str: + if raw.startswith(b"\xfe\xff"): + return raw[2:].decode("utf-16-be", errors="ignore") + if raw.startswith(b"\xff\xfe"): + return raw[2:].decode("utf-16-le", errors="ignore") + return raw.decode("latin-1", errors="ignore") + + +def _decode_pdf_text_operand(value: object | None) -> str: + if isinstance(value, bytes): + return _decode_pdf_text_bytes(value) + if isinstance(value, list): + return "".join( + _decode_pdf_text_bytes(item) for item in value if isinstance(item, bytes) + ) + return "" + + +def _pop_last_pdf_text_operand(operands: list[object]) -> object | None: + for index in range(len(operands) - 1, -1, -1): + candidate = operands[index] + if isinstance(candidate, (bytes, list)): + return operands.pop(index) + return None + + +def _extract_text_from_pdf_content_stream(stream: bytes) -> str: + operands: list[object] = [] + fragments: list[str] = [] + inside_text_block = False + pending_newline = False + cursor = 0 + + def request_newline() -> None: + nonlocal pending_newline + if fragments: + pending_newline = True + + def append_text(text: str) -> None: + nonlocal pending_newline + if text == "": + return + if pending_newline and fragments and fragments[-1] != "\n": + fragments.append("\n") + pending_newline = False + fragments.append(text) + + while True: + token, cursor = _read_pdf_content_token(stream, cursor) + if token is None: + break + if isinstance(token, list) or ( + isinstance(token, bytes) + and ( + token.startswith(b"/") + or _PDF_NUMERIC_TOKEN_PATTERN.fullmatch(token) is not None + or token in {b"true", b"false", b"null"} + ) + ): + operands.append(token) + continue + if not isinstance(token, bytes): + operands.append(token) + continue + + operator = token + if operator == b"BT": + inside_text_block = True + operands.clear() + continue + if operator == b"ET": + inside_text_block = False + operands.clear() + continue + if operator not in _PDF_CONTENT_OPERATORS: + operands.append(token) + continue + if not inside_text_block: + operands.clear() + continue + if operator in {b"T*", b"Td", b"TD", b"Tm"}: + request_newline() + operands.clear() + continue + if operator in {b"Tj", b"TJ"}: + append_text(_decode_pdf_text_operand(_pop_last_pdf_text_operand(operands))) + operands.clear() + continue + if operator in {b"'", b'"'}: + request_newline() + append_text(_decode_pdf_text_operand(_pop_last_pdf_text_operand(operands))) + operands.clear() + continue + operands.clear() + + return "".join(fragments).strip() + + +def _decode_pdf_stream(*, relative_path: str, pdf_object: _PdfObject) -> bytes: + if pdf_object.stream is None: + raise TaskArtifactValidationError( + f"artifact {relative_path} contains a PDF content reference without a stream" + ) + + filters = _extract_pdf_filter_names(pdf_object.dictionary) + if not filters: + return pdf_object.stream + if filters == [b"FlateDecode"]: + try: + return zlib.decompress(pdf_object.stream) + except zlib.error as exc: + raise TaskArtifactValidationError( + f"artifact {relative_path} contains an unreadable FlateDecode PDF stream" + ) from exc + + filter_names = ", ".join(f"/{name.decode('ascii', errors='ignore')}" for name in filters) + raise TaskArtifactValidationError( + f"artifact {relative_path} uses unsupported PDF stream filters {filter_names}" + ) + + +def _collect_pdf_page_refs( + *, + relative_path: str, + objects: dict[tuple[int, int], _PdfObject], + current_ref: tuple[int, int], + collected_refs: list[tuple[int, int]], + visited_refs: set[tuple[int, int]], +) -> None: + if current_ref in visited_refs: + return + visited_refs.add(current_ref) + current_object = objects.get(current_ref) + if current_object is None: + raise TaskArtifactValidationError( + f"artifact {relative_path} references a missing PDF object {current_ref[0]} {current_ref[1]} R" + ) + + object_type = _extract_pdf_name(current_object.dictionary, b"Type") + if object_type == b"Page": + collected_refs.append(current_ref) + return + if object_type != b"Pages": + raise TaskArtifactValidationError( + f"artifact {relative_path} uses unsupported PDF page tree structure" + ) + + child_refs = _extract_pdf_reference_array(current_object.dictionary, b"Kids") + if not child_refs: + raise TaskArtifactValidationError( + f"artifact {relative_path} uses unsupported PDF page tree structure" + ) + for child_ref in child_refs: + _collect_pdf_page_refs( + relative_path=relative_path, + objects=objects, + current_ref=child_ref, + collected_refs=collected_refs, + visited_refs=visited_refs, + ) + + +def _resolve_pdf_page_refs( + *, + relative_path: str, + objects: dict[tuple[int, int], _PdfObject], +) -> list[tuple[int, int]]: + catalog_ref = next( + ( + object_ref + for object_ref, pdf_object in objects.items() + if _extract_pdf_name(pdf_object.dictionary, b"Type") == b"Catalog" + ), + None, + ) + if catalog_ref is None: + raise TaskArtifactValidationError(f"artifact {relative_path} is not a valid PDF") + + pages_ref = _extract_pdf_reference(objects[catalog_ref].dictionary, b"Pages") + if pages_ref is None: + raise TaskArtifactValidationError( + f"artifact {relative_path} uses unsupported PDF page tree structure" + ) + + page_refs: list[tuple[int, int]] = [] + _collect_pdf_page_refs( + relative_path=relative_path, + objects=objects, + current_ref=pages_ref, + collected_refs=page_refs, + visited_refs=set(), + ) + return page_refs + + +def _extract_text_from_pdf_artifact_bytes(*, relative_path: str, payload: bytes) -> str: + if not payload.startswith(b"%PDF-"): + raise TaskArtifactValidationError(f"artifact {relative_path} is not a valid PDF") + + objects = _parse_pdf_objects(relative_path=relative_path, payload=payload) + page_refs = _resolve_pdf_page_refs(relative_path=relative_path, objects=objects) + page_fragments: list[str] = [] + for page_ref in page_refs: + page_object = objects[page_ref] + content_refs = _extract_pdf_reference_array(page_object.dictionary, b"Contents") + if not content_refs: + single_content_ref = _extract_pdf_reference(page_object.dictionary, b"Contents") + if single_content_ref is not None: + content_refs = [single_content_ref] + + stream_fragments: list[str] = [] + for content_ref in content_refs: + content_object = objects.get(content_ref) + if content_object is None: + raise TaskArtifactValidationError( + f"artifact {relative_path} references a missing PDF object {content_ref[0]} {content_ref[1]} R" + ) + extracted = _extract_text_from_pdf_content_stream( + _decode_pdf_stream(relative_path=relative_path, pdf_object=content_object) + ) + if extracted != "": + stream_fragments.append(extracted) + if stream_fragments: + page_fragments.append("\n".join(stream_fragments)) + + extracted_text = "\n".join(page_fragments).strip() + if extracted_text == "": + raise TaskArtifactValidationError( + f"artifact {relative_path} does not contain extractable PDF text" + ) + return extracted_text + + +def extract_artifact_text(*, row: TaskArtifactRow, artifact_path: Path, media_type: str) -> str: + payload = artifact_path.read_bytes() + if media_type in SUPPORTED_TEXT_ARTIFACT_MEDIA_TYPES: + return _extract_text_from_utf8_artifact_bytes( + relative_path=row["relative_path"], + payload=payload, + ) + if media_type == SUPPORTED_PDF_ARTIFACT_MEDIA_TYPE: + return _extract_text_from_pdf_artifact_bytes( + relative_path=row["relative_path"], + payload=payload, + ) + raise TaskArtifactValidationError( + f"artifact {row['relative_path']} has unsupported media type {media_type}" + ) + + def resolve_registered_artifact_path(*, workspace_path: Path, relative_path: str) -> Path: artifact_path = (workspace_path / relative_path).resolve() ensure_artifact_path_is_rooted( @@ -462,14 +1042,11 @@ def ingest_task_artifact_record( relative_path=row["relative_path"], ) _require_existing_file(artifact_path) - - try: - text = artifact_path.read_bytes().decode("utf-8") - except UnicodeDecodeError as exc: - raise TaskArtifactValidationError( - f"artifact {row['relative_path']} is not valid UTF-8 text" - ) from exc - + text = extract_artifact_text( + row=row, + artifact_path=artifact_path, + media_type=media_type, + ) normalized_text = normalize_artifact_text(text) for index, (char_start, char_end_exclusive, chunk_text) in enumerate( chunk_normalized_artifact_text(normalized_text), diff --git a/tests/integration/test_task_artifacts_api.py b/tests/integration/test_task_artifacts_api.py index 1aab4e1..d23c5c7 100644 --- a/tests/integration/test_task_artifacts_api.py +++ b/tests/integration/test_task_artifacts_api.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import zlib from pathlib import Path from typing import Any from urllib.parse import urlencode @@ -16,6 +17,88 @@ from alicebot_api.store import ContinuityStore +def _escape_pdf_literal_string(value: str) -> str: + return value.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)") + + +def _build_pdf_bytes( + pages: list[list[str]], + *, + compress_streams: bool = True, + textless: bool = False, +) -> bytes: + objects: dict[int, bytes] = { + 1: b"<< /Type /Catalog /Pages 2 0 R >>", + 3: b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>", + } + page_refs: list[str] = [] + next_object_id = 4 + for page_lines in pages: + page_object_id = next_object_id + content_object_id = next_object_id + 1 + next_object_id += 2 + page_refs.append(f"{page_object_id} 0 R") + + if textless: + content_stream = b"q 10 10 100 100 re S Q\n" + else: + commands = [b"BT", b"/F1 12 Tf", b"72 720 Td"] + for index, line in enumerate(page_lines): + if index > 0: + commands.append(b"T*") + commands.append(f"({_escape_pdf_literal_string(line)}) Tj".encode("latin-1")) + commands.append(b"ET") + content_stream = b"\n".join(commands) + b"\n" + + if compress_streams: + encoded_stream = zlib.compress(content_stream) + content_body = ( + f"<< /Length {len(encoded_stream)} /Filter /FlateDecode >>\n".encode("ascii") + + b"stream\n" + + encoded_stream + + b"\nendstream" + ) + else: + content_body = ( + f"<< /Length {len(content_stream)} >>\n".encode("ascii") + + b"stream\n" + + content_stream + + b"endstream" + ) + + objects[page_object_id] = ( + f"<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 3 0 R >> >> " + f"/MediaBox [0 0 612 792] /Contents {content_object_id} 0 R >>" + ).encode("ascii") + objects[content_object_id] = content_body + + objects[2] = ( + f"<< /Type /Pages /Count {len(page_refs)} /Kids [{' '.join(page_refs)}] >>" + ).encode("ascii") + + document = bytearray(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n") + max_object_id = max(objects) + offsets = [0] * (max_object_id + 1) + for object_id in range(1, max_object_id + 1): + offsets[object_id] = len(document) + document.extend(f"{object_id} 0 obj\n".encode("ascii")) + document.extend(objects[object_id]) + document.extend(b"\nendobj\n") + + xref_offset = len(document) + document.extend(f"xref\n0 {max_object_id + 1}\n".encode("ascii")) + document.extend(b"0000000000 65535 f \n") + for object_id in range(1, max_object_id + 1): + document.extend(f"{offsets[object_id]:010d} 00000 n \n".encode("ascii")) + document.extend( + ( + f"trailer\n<< /Size {max_object_id + 1} /Root 1 0 R >>\n" + f"startxref\n{xref_offset}\n%%EOF\n" + ).encode("ascii") + ) + return bytes(document) + + def invoke_request( method: str, path: str, @@ -323,8 +406,8 @@ def test_task_artifact_ingestion_and_chunk_endpoints_are_deterministic_and_isola supported_file = workspace_path / "docs" / "spec.txt" supported_file.parent.mkdir(parents=True) supported_file.write_text(("A" * 998) + "\r\n" + ("B" * 5) + "\rC") - unsupported_file = workspace_path / "docs" / "manual.pdf" - unsupported_file.write_text("not really a pdf") + unsupported_file = workspace_path / "docs" / "manual.bin" + unsupported_file.write_bytes(b"\x00\x01\x02") register_status, register_payload = invoke_request( "POST", @@ -364,7 +447,7 @@ def test_task_artifact_ingestion_and_chunk_endpoints_are_deterministic_and_isola payload={ "user_id": str(owner["user_id"]), "local_path": str(unsupported_file), - "media_type_hint": "application/pdf", + "media_type_hint": "application/octet-stream", }, ) assert unsupported_register_status == 201 @@ -442,12 +525,139 @@ def test_task_artifact_ingestion_and_chunk_endpoints_are_deterministic_and_isola assert unsupported_ingest_status == 400 assert unsupported_ingest_payload == { "detail": ( - "artifact docs/manual.pdf has unsupported media type application/pdf; " - "supported types: text/plain, text/markdown" + "artifact docs/manual.bin has unsupported media type application/octet-stream; " + "supported types: text/plain, text/markdown, application/pdf" ) } +def test_task_artifact_pdf_ingestion_and_chunk_endpoints_are_deterministic_and_isolated( + migrated_database_urls, + monkeypatch, + tmp_path, +) -> None: + owner = seed_task(migrated_database_urls["app"], email="owner@example.com") + intruder = seed_task(migrated_database_urls["app"], email="intruder@example.com") + workspace_root = tmp_path / "task-workspaces" + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings( + database_url=migrated_database_urls["app"], + task_workspace_root=str(workspace_root), + ), + ) + + workspace_status, workspace_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/workspace", + payload={"user_id": str(owner["user_id"])}, + ) + assert workspace_status == 201 + + workspace_path = Path(workspace_payload["workspace"]["local_path"]) + pdf_file = workspace_path / "docs" / "spec.pdf" + pdf_file.parent.mkdir(parents=True) + pdf_file.write_bytes(_build_pdf_bytes([["A" * 998, "B" * 5, "C"]])) + + register_status, register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(pdf_file), + "media_type_hint": "application/pdf", + }, + ) + assert register_status == 201 + + ingest_status, ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + chunk_list_status, chunk_list_payload = invoke_request( + "GET", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/chunks", + query_params={"user_id": str(owner["user_id"])}, + ) + isolated_chunk_list_status, isolated_chunk_list_payload = invoke_request( + "GET", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/chunks", + query_params={"user_id": str(intruder["user_id"])}, + ) + isolated_ingest_status, isolated_ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(intruder["user_id"])}, + ) + + assert ingest_status == 200 + assert ingest_payload == { + "artifact": { + "id": register_payload["artifact"]["id"], + "task_id": str(owner["task_id"]), + "task_workspace_id": workspace_payload["workspace"]["id"], + "status": "registered", + "ingestion_status": "ingested", + "relative_path": "docs/spec.pdf", + "media_type_hint": "application/pdf", + "created_at": register_payload["artifact"]["created_at"], + "updated_at": ingest_payload["artifact"]["updated_at"], + }, + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "application/pdf", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"], + }, + } + + assert chunk_list_status == 200 + assert chunk_list_payload == { + "items": [ + { + "id": chunk_list_payload["items"][0]["id"], + "task_artifact_id": register_payload["artifact"]["id"], + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 1000, + "text": ("A" * 998) + "\n" + "B", + "created_at": chunk_list_payload["items"][0]["created_at"], + "updated_at": chunk_list_payload["items"][0]["updated_at"], + }, + { + "id": chunk_list_payload["items"][1]["id"], + "task_artifact_id": register_payload["artifact"]["id"], + "sequence_no": 2, + "char_start": 1000, + "char_end_exclusive": 1006, + "text": "BBBB\nC", + "created_at": chunk_list_payload["items"][1]["created_at"], + "updated_at": chunk_list_payload["items"][1]["updated_at"], + }, + ], + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "application/pdf", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"], + }, + } + + assert isolated_chunk_list_status == 404 + assert isolated_chunk_list_payload == { + "detail": f"task artifact {register_payload['artifact']['id']} was not found" + } + + assert isolated_ingest_status == 404 + assert isolated_ingest_payload == { + "detail": f"task artifact {register_payload['artifact']['id']} was not found" + } + + def test_task_artifact_ingestion_supports_markdown_and_reingest_is_idempotent( migrated_database_urls, monkeypatch, @@ -601,6 +811,57 @@ def test_task_artifact_ingestion_rejects_invalid_utf8_content( } +def test_task_artifact_ingestion_rejects_textless_pdf_content( + migrated_database_urls, + monkeypatch, + tmp_path, +) -> None: + owner = seed_task(migrated_database_urls["app"], email="owner@example.com") + workspace_root = tmp_path / "task-workspaces" + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings( + database_url=migrated_database_urls["app"], + task_workspace_root=str(workspace_root), + ), + ) + + workspace_status, workspace_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/workspace", + payload={"user_id": str(owner["user_id"])}, + ) + assert workspace_status == 201 + + workspace_path = Path(workspace_payload["workspace"]["local_path"]) + textless_pdf = workspace_path / "docs" / "scanned.pdf" + textless_pdf.parent.mkdir(parents=True) + textless_pdf.write_bytes(_build_pdf_bytes([[]], textless=True)) + + register_status, register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(textless_pdf), + "media_type_hint": "application/pdf", + }, + ) + assert register_status == 201 + + ingest_status, ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + + assert ingest_status == 400 + assert ingest_payload == { + "detail": "artifact docs/scanned.pdf does not contain extractable PDF text" + } + + def test_task_artifact_ingestion_enforces_rooted_workspace_paths( migrated_database_urls, monkeypatch, @@ -625,11 +886,11 @@ def test_task_artifact_ingestion_enforces_rooted_workspace_paths( assert workspace_status == 201 workspace_path = Path(workspace_payload["workspace"]["local_path"]) - safe_file = workspace_path / "docs" / "spec.txt" + safe_file = workspace_path / "docs" / "spec.pdf" safe_file.parent.mkdir(parents=True) - safe_file.write_text("spec") - outside_file = tmp_path / "escape.txt" - outside_file.write_text("escape") + safe_file.write_bytes(_build_pdf_bytes([["spec"]])) + outside_file = tmp_path / "escape.pdf" + outside_file.write_bytes(_build_pdf_bytes([["escape"]])) register_status, register_payload = invoke_request( "POST", @@ -637,7 +898,7 @@ def test_task_artifact_ingestion_enforces_rooted_workspace_paths( payload={ "user_id": str(owner["user_id"]), "local_path": str(safe_file), - "media_type_hint": "text/plain", + "media_type_hint": "application/pdf", }, ) assert register_status == 201 @@ -647,7 +908,7 @@ def test_task_artifact_ingestion_enforces_rooted_workspace_paths( cur.execute( """ UPDATE task_artifacts - SET relative_path = '../../../escape.txt' + SET relative_path = '../../../escape.pdf' WHERE id = %s """, (register_payload["artifact"]["id"],), diff --git a/tests/unit/test_artifacts.py b/tests/unit/test_artifacts.py index 07dc3de..35da2b0 100644 --- a/tests/unit/test_artifacts.py +++ b/tests/unit/test_artifacts.py @@ -1,5 +1,6 @@ from __future__ import annotations +import zlib from datetime import UTC, datetime, timedelta from pathlib import Path from uuid import UUID, uuid4 @@ -38,6 +39,88 @@ from alicebot_api.workspaces import TaskWorkspaceNotFoundError +def _escape_pdf_literal_string(value: str) -> str: + return value.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)") + + +def _build_pdf_bytes( + pages: list[list[str]], + *, + compress_streams: bool = True, + textless: bool = False, +) -> bytes: + objects: dict[int, bytes] = { + 1: b"<< /Type /Catalog /Pages 2 0 R >>", + 3: b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>", + } + page_refs: list[str] = [] + next_object_id = 4 + for page_lines in pages: + page_object_id = next_object_id + content_object_id = next_object_id + 1 + next_object_id += 2 + page_refs.append(f"{page_object_id} 0 R") + + if textless: + content_stream = b"q 10 10 100 100 re S Q\n" + else: + commands = [b"BT", b"/F1 12 Tf", b"72 720 Td"] + for index, line in enumerate(page_lines): + if index > 0: + commands.append(b"T*") + commands.append(f"({_escape_pdf_literal_string(line)}) Tj".encode("latin-1")) + commands.append(b"ET") + content_stream = b"\n".join(commands) + b"\n" + + if compress_streams: + encoded_stream = zlib.compress(content_stream) + content_body = ( + f"<< /Length {len(encoded_stream)} /Filter /FlateDecode >>\n".encode("ascii") + + b"stream\n" + + encoded_stream + + b"\nendstream" + ) + else: + content_body = ( + f"<< /Length {len(content_stream)} >>\n".encode("ascii") + + b"stream\n" + + content_stream + + b"endstream" + ) + + objects[page_object_id] = ( + f"<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 3 0 R >> >> " + f"/MediaBox [0 0 612 792] /Contents {content_object_id} 0 R >>" + ).encode("ascii") + objects[content_object_id] = content_body + + objects[2] = ( + f"<< /Type /Pages /Count {len(page_refs)} /Kids [{' '.join(page_refs)}] >>" + ).encode("ascii") + + document = bytearray(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n") + max_object_id = max(objects) + offsets = [0] * (max_object_id + 1) + for object_id in range(1, max_object_id + 1): + offsets[object_id] = len(document) + document.extend(f"{object_id} 0 obj\n".encode("ascii")) + document.extend(objects[object_id]) + document.extend(b"\nendobj\n") + + xref_offset = len(document) + document.extend(f"xref\n0 {max_object_id + 1}\n".encode("ascii")) + document.extend(b"0000000000 65535 f \n") + for object_id in range(1, max_object_id + 1): + document.extend(f"{offsets[object_id]:010d} 00000 n \n".encode("ascii")) + document.extend( + ( + f"trailer\n<< /Size {max_object_id + 1} /Root 1 0 R >>\n" + f"startxref\n{xref_offset}\n%%EOF\n" + ).encode("ascii") + ) + return bytes(document) + + class ArtifactStoreStub: def __init__(self) -> None: self.base_time = datetime(2026, 3, 13, 10, 0, tzinfo=UTC) @@ -477,6 +560,84 @@ def test_ingest_task_artifact_record_supports_markdown(tmp_path) -> None: ] +def test_ingest_task_artifact_record_persists_deterministic_pdf_chunks(tmp_path) -> None: + store = ArtifactStoreStub() + user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + workspace_path.mkdir(parents=True) + artifact_path = workspace_path / "docs" / "spec.pdf" + artifact_path.parent.mkdir(parents=True) + artifact_path.write_bytes(_build_pdf_bytes([["A" * 998, "B" * 5, "C"]])) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status="pending", + relative_path="docs/spec.pdf", + media_type_hint="application/pdf", + ) + + response = ingest_task_artifact_record( + store, + user_id=user_id, + request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]), + ) + + assert response == { + "artifact": { + "id": str(artifact["id"]), + "task_id": str(task_id), + "task_workspace_id": str(task_workspace_id), + "status": "registered", + "ingestion_status": "ingested", + "relative_path": "docs/spec.pdf", + "media_type_hint": "application/pdf", + "created_at": "2026-03-13T10:00:00+00:00", + "updated_at": "2026-03-13T10:30:00+00:00", + }, + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "application/pdf", + "chunking_rule": TASK_ARTIFACT_CHUNKING_RULE, + "order": ["sequence_no_asc", "id_asc"], + }, + } + assert store.locked_artifact_ids == [artifact["id"]] + assert store.list_task_artifact_chunks(artifact["id"]) == [ + { + "id": store.artifact_chunks[0]["id"], + "user_id": user_id, + "task_artifact_id": artifact["id"], + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 1000, + "text": ("A" * 998) + "\n" + "B", + "created_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + "updated_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + }, + { + "id": store.artifact_chunks[1]["id"], + "user_id": user_id, + "task_artifact_id": artifact["id"], + "sequence_no": 2, + "char_start": 1000, + "char_end_exclusive": 1006, + "text": "BBBB\nC", + "created_at": datetime(2026, 3, 13, 10, 0, 1, tzinfo=UTC), + "updated_at": datetime(2026, 3, 13, 10, 0, 1, tzinfo=UTC), + }, + ] + + def test_ingest_task_artifact_record_is_idempotent_for_already_ingested_artifact() -> None: store = ArtifactStoreStub() user_id = uuid4() @@ -541,9 +702,9 @@ def test_ingest_task_artifact_record_rejects_unsupported_media_type(tmp_path) -> task_workspace_id = uuid4() workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) workspace_path.mkdir(parents=True) - artifact_path = workspace_path / "docs" / "spec.pdf" + artifact_path = workspace_path / "docs" / "spec.bin" artifact_path.parent.mkdir(parents=True) - artifact_path.write_text("not really a pdf") + artifact_path.write_bytes(b"\x00\x01\x02") store.create_task_workspace( task_workspace_id=task_workspace_id, task_id=task_id, @@ -555,13 +716,52 @@ def test_ingest_task_artifact_record_rejects_unsupported_media_type(tmp_path) -> task_workspace_id=task_workspace_id, status="registered", ingestion_status="pending", - relative_path="docs/spec.pdf", + relative_path="docs/spec.bin", + media_type_hint="application/octet-stream", + ) + + with pytest.raises( + TaskArtifactValidationError, + match=( + "artifact docs/spec.bin has unsupported media type application/octet-stream; " + "supported types: text/plain, text/markdown, application/pdf" + ), + ): + ingest_task_artifact_record( + store, + user_id=user_id, + request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]), + ) + + +def test_ingest_task_artifact_record_rejects_textless_pdf(tmp_path) -> None: + store = ArtifactStoreStub() + user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + workspace_path.mkdir(parents=True) + artifact_path = workspace_path / "docs" / "scanned.pdf" + artifact_path.parent.mkdir(parents=True) + artifact_path.write_bytes(_build_pdf_bytes([[]], textless=True)) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status="pending", + relative_path="docs/scanned.pdf", media_type_hint="application/pdf", ) with pytest.raises( TaskArtifactValidationError, - match="artifact docs/spec.pdf has unsupported media type application/pdf", + match="artifact docs/scanned.pdf does not contain extractable PDF text", ): ingest_task_artifact_record( store, @@ -613,8 +813,8 @@ def test_ingest_task_artifact_record_rejects_paths_outside_workspace(tmp_path) - task_workspace_id = uuid4() workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) workspace_path.mkdir(parents=True) - outside_path = tmp_path / "escape.txt" - outside_path.write_text("escape") + outside_path = tmp_path / "escape.pdf" + outside_path.write_bytes(_build_pdf_bytes([["escape"]])) store.create_task_workspace( task_workspace_id=task_workspace_id, task_id=task_id, @@ -626,8 +826,8 @@ def test_ingest_task_artifact_record_rejects_paths_outside_workspace(tmp_path) - task_workspace_id=task_workspace_id, status="registered", ingestion_status="pending", - relative_path="../escape.txt", - media_type_hint="text/plain", + relative_path="../escape.pdf", + media_type_hint="application/pdf", ) with pytest.raises(TaskArtifactValidationError, match="escapes workspace root"): diff --git a/tests/unit/test_artifacts_main.py b/tests/unit/test_artifacts_main.py index 634d9b6..f43b78c 100644 --- a/tests/unit/test_artifacts_main.py +++ b/tests/unit/test_artifacts_main.py @@ -497,8 +497,8 @@ def fake_user_connection(*_args, **_kwargs): def fake_ingest_task_artifact_record(*_args, **_kwargs): raise TaskArtifactValidationError( - "artifact docs/spec.txt has unsupported media type application/pdf; " - "supported types: text/plain, text/markdown" + "artifact docs/spec.bin has unsupported media type application/octet-stream; " + "supported types: text/plain, text/markdown, application/pdf" ) monkeypatch.setattr(main_module, "get_settings", lambda: settings) @@ -513,8 +513,8 @@ def fake_ingest_task_artifact_record(*_args, **_kwargs): assert response.status_code == 400 assert json.loads(response.body) == { "detail": ( - "artifact docs/spec.txt has unsupported media type application/pdf; " - "supported types: text/plain, text/markdown" + "artifact docs/spec.bin has unsupported media type application/octet-stream; " + "supported types: text/plain, text/markdown, application/pdf" ) }