In [None]:
# Reinstall the package
%cd /Users/robertford/Repos/fade
!source .venv/bin/activate
%uv pip install -e .
# or to be more specific
# %uv pip install matplotlib scikit-image notebook numpy pandas pillow scipy tqdm ipykernel jupyter paddlepaddle


In [None]:
# Test cell to verify installations
import sys
print(f"Python version: {sys.version}")

# Test core dependencies
try:
    import matplotlib
    print(f"matplotlib version: {matplotlib.__version__}")
except ImportError as e:
    print(f"matplotlib import error: {e}")

try:
    import numpy
    print(f"numpy version: {numpy.__version__}")
except ImportError as e:
    print(f"numpy import error: {e}")

try:
    import pandas
    print(f"pandas version: {pandas.__version__}")
except ImportError as e:
    print(f"pandas import error: {e}")

try:
    import PIL
    print(f"Pillow version: {PIL.__version__}")
except ImportError as e:
    print(f"Pillow import error: {e}")

try:
    import skimage
    print(f"scikit-image version: {skimage.__version__}")
except ImportError as e:
    print(f"scikit-image import error: {e}")

try:
    import scipy
    print(f"scipy version: {scipy.__version__}")
except ImportError as e:
    print(f"scipy import error: {e}")

try:
    import paddle
    print(f"paddle version: {paddle.__version__}")
except ImportError as e:
    print(f"paddle import error: {e}")

# Test FADE package
try:
    from fade.pipeline import PipelineState
    print("FADE package imported successfully")
except ImportError as e:
    print(f"FADE package import error: {e}")

In [1]:
# Test the pipeline with a sample document
from fade.pipeline import (
    PipelineState,
    setup_working_directory,
    extract_document_pages,
    detect_entities,
    classify_entities,
    report_unclassified_entities,
    process_entities,
    log_process
)

import os

# Get absolute path to the PDF using the correct base path
base_path = "/Users/robertford/Repos/fade"  # Your project root
pdf_path = os.path.join(base_path, "data", "f1040.pdf")
print(f"PDF path: {pdf_path}")
print(f"File exists: {os.path.exists(pdf_path)}")

# Initialize a test state with absolute path
initial_state = PipelineState(
    document_id=pdf_path,  # Using absolute path
    working_dir="",
    images=[],
    entities={},
    unclassified_entities={},
    logs=[],
    error=None
)


[2025/03/19 16:52:59] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/robertford/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/robertford/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec

### FADE Pipeline Steps

#### 1. Setup Working Directory
Creates a working directory for processing the document, copying the source file to a dedicated workspace.

In [2]:
# Setup working directory
print("\nTesting pipeline setup...")
state = setup_working_directory(initial_state)
print("\nSetup Working Directory Logs:")
for log in state.logs:
    print(f"- {log['message']}")
if state.error:
    print(f"Error: {state.error}")


Testing pipeline setup...

Setup Working Directory Logs:
- Created working directory: /Users/robertford/Repos/fade/data/f1040.pdf_working


#### 2. Extract Document Pages
Converts each page of the PDF into an image file. Also extracts and processes any embedded PDF attachments.

In [None]:
# Test extract document pages
print("\nTesting document page extraction...")
state = extract_document_pages(state)
print("\nExtract Document Pages Logs:")
for log in state.logs:
    print(f"- {log['message']}")
print(f"\nNumber of images extracted: {len(state.images)}")
if state.error:
    print(f"Error: {state.error}")


Testing document page extraction...
Error processing attachments in f1040.pdf: 'Document' object has no attribute 'embfile_get_names'

Extract Document Pages Logs:
- Created working directory: /Users/robertford/Repos/fade/data/f1040.pdf_working
- Extracted 2 page images

Number of images extracted: 2


#### 3. Detect Entities
Uses PaddleOCR to detect and extract text elements from each page image. Creates visualizations showing detected text regions.
Key optimizations:
- Multiprocessing enabled
- Image size limited to 960px
- Fast recognition model (SVTR_LCNet)## 3. Detect Entities

In [None]:
# Test detect entities
if len(state.images) > 0:
    print("\nTesting entity detection...")
    state = detect_entities(state)
    print("\nDetect Entities Logs:")
    for log in state.logs:
        print(f"- {log['message']}")
    print(f"\nNumber of entities detected: {len(state.entities)}")
    if state.error:
        print(f"Error: {state.error}")
else:
    print("\nSkipping entity detection since no images were extracted.")


Testing entity detection...


Processing pages:   0%|          | 0/2 [00:00<?, ?it/s]

#### 4. Classify Entities
Categorizes detected entities into types (text, data, image, etc.) using layout detection and OCR confidence scores.

In [None]:
# Test classify_entities
print("\nTesting entity classification...")
state = classify_entities(state)
print("\nClassify Entities Logs:")
for log in state.logs:
    print(f"- {log['message']}")
print(f"\nClassified entities: {len(state.entities) - len(state.unclassified_entities)}")
print(f"Unclassified entities: {len(state.unclassified_entities)}")
if state.error:
    print(f"Error: {state.error}")

#### 5. Report Unclassified Entities
Generates a report of any entities that couldn't be automatically classified, allowing for manual review if needed.

In [None]:
# Test report_unclassified_entities
print("\nTesting unclassified entities report...")
state = report_unclassified_entities(state)
print("\nUnclassified Entities Report Logs:")
for log in state.logs:
    print(f"- {log['message']}")
if state.error:
    print(f"Error: {state.error}")

#### 6. Process Entities
Processes each classified entity according to its type:
- Text: Extracts content using OCR
- Data: Converts tables to CSV
- Images: Saves as PNG files
Creates a JSON structure documenting all entities and their relationships.

In [None]:
# Test process_entities
print("\nTesting entity processing...")
state = process_entities(state)
print("\nProcess Entities Logs:")
for log in state.logs:
    print(f"- {log['message']}")
if state.error:
    print(f"Error: {state.error}")

#### 7. Log Process
Records the entire processing pipeline's results, including:
- Processing times
- Success/failure rates
- Entity counts
- Error logs
Saves this information for analysis and debugging.

In [None]:
# Test log_process
print("\nTesting process logging...")
state = log_process(state)
print("\nLog Process Logs:")
for log in state.logs:
    print(f"- {log['message']}")
if state.error:
    print(f"Error: {state.error}")

#### Final State Summary
Shows the complete processing results:
- Document information
- Entity counts
- Output files generated
- Any errors encountered

In [None]:
# Print final state summary
print("\nFinal State Summary:")
print(f"Document ID: {state.document_id}")
print(f"Working Directory: {state.working_dir}")
print(f"Number of Images: {len(state.images)}")
print(f"Number of Entities: {len(state.entities)}")
print(f"Number of Unclassified Entities: {len(state.unclassified_entities)}")
print(f"Error: {state.error}")

# Print output directory contents
import os
output_dir = os.path.join(state.working_dir, "output")
if os.path.exists(output_dir):
    print("\nOutput Directory Contents:")
    for item in os.listdir(output_dir):
        print(f"- {item}")