In [None]:
class PipelineStage:
    """Base class for one step of the document pipeline.

    A stage is constructed with its own configuration object and is
    expected to override :meth:`run`.
    """

    def __init__(self, config):
        # Kept verbatim; subclasses read their settings from ``self.config``.
        self.config = config

    def run(self, data):
        """Process the current document state.

        Args:
            data: dict containing the current document state.

        Returns:
            dict: the (possibly augmented) document state.

        Raises:
            NotImplementedError: always, in this base class; concrete
                stages must provide their own implementation.
        """
        raise NotImplementedError


In [2]:
class DocumentPipeline:
    """Runs a sequence of stages, feeding each stage's output to the next."""

    def __init__(self, stages):
        # list of PipelineStage objects, executed in list order
        self.stages = stages

    def run(self, data):
        """Thread ``data`` through every stage and return the final result.

        A progress line naming each stage's class is printed before that
        stage runs.
        """
        current = data
        for step in self.stages:
            step_name = step.__class__.__name__
            print(f"Running {step_name}...")
            current = step.run(current)
        return current


In [6]:
import numpy as np
class PDFIngestor(PipelineStage):
    """Rasterize a PDF and seed the document state with page images.

    Recognised configuration keys (all optional):
        pdf_path: fallback PDF location used when ``data`` has no
            ``"pdf_path"`` entry.
        dpi: rasterization resolution passed to pdf2image (default 300).
    """

    def run(self, data):
        """Convert the PDF to numpy arrays.

        Args:
            data: document state dict. ``data["pdf_path"]`` (per-document)
                takes precedence over ``config["pdf_path"]``.

        Returns:
            The same dict with:
              * ``"images"`` - list with one array per page;
              * ``"image"``  - the first page, set only when the PDF
                produced at least one page. The other stages in this
                notebook read the single ``"image"`` key, so without it
                the pipeline stops with ``KeyError: 'image'``.

        Raises:
            KeyError: if no PDF path is available in ``data`` or config.
        """
        from pdf2image import convert_from_path

        # An empty YAML section loads as None; treat it like an empty dict.
        cfg = self.config or {}
        pdf_path = data.get("pdf_path") or cfg.get("pdf_path")
        if not pdf_path:
            raise KeyError("pdf_path")

        pages = convert_from_path(pdf_path, dpi=cfg.get("dpi", 300))
        # NOTE(review): pdf2image hands back PIL images, so these arrays are
        # presumably RGB (not OpenCV's BGR) - confirm before relying on
        # channel order downstream.
        data["images"] = [np.array(page) for page in pages]
        if data["images"]:
            # Only the first page is exposed under "image"; multi-page
            # documents need the caller to iterate over "images".
            data["image"] = data["images"][0]
        return data


In [7]:
class LayoutDetector(PipelineStage):
    """Run a layout-detection model over the current page image."""

    def run(self, data):
        """Detect layout regions in ``data["image"]``.

        The model is created on the first call and cached on the instance,
        so running the same stage over many documents does not reload it
        each time.

        Args:
            data: document state dict; must contain ``"image"``.

        Returns:
            The same dict with the model's output stored under ``"layout"``.
        """
        import deepdoctection as dd

        layout_model = getattr(self, "_layout_model", None)
        if layout_model is None:
            layout_model = dd.get_model("layout")  # pseudo-code
            self._layout_model = layout_model

        data["layout"] = layout_model(data["image"])
        return data


In [8]:
class MaskRegions(PipelineStage):
    """Paint over unwanted layout regions with white rectangles.

    Required configuration key:
        remove_labels: container of region labels that should be masked.
    """

    def run(self, data):
        """Mask every region in ``data["layout"]`` whose label is listed.

        Args:
            data: document state dict with ``"image"`` (an array offering
                ``.copy()``) and ``"layout"`` (iterable of mappings with
                ``"label"`` and ``"bbox"`` = x1, y1, x2, y2).

        Returns:
            The same dict; ``"image"`` is replaced by a masked copy, the
            array that was passed in is left untouched.
        """
        import cv2

        image = data["image"].copy()
        for reg in data["layout"]:
            if reg["label"] not in self.config["remove_labels"]:
                continue
            # cv2.rectangle only accepts integer points, while detectors
            # frequently report float boxes; round to the nearest pixel.
            x1, y1, x2, y2 = (int(round(float(v))) for v in reg["bbox"])
            # thickness -1 => filled rectangle
            cv2.rectangle(image, (x1, y1), (x2, y2), (255, 255, 255), -1)
        data["image"] = image
        return data


In [9]:
class LocalEnhancer(PipelineStage):
    """Convert the page to grayscale and apply CLAHE contrast enhancement.

    Optional configuration keys (used only when the config is a dict):
        clip_limit: CLAHE clip limit (default 2.0).
        tile_grid_size: two-element grid size (default (8, 8)).
        channel_order: "BGR" (default) or "RGB" - channel layout of colour
            input, selecting the matching grayscale conversion.
    """

    def run(self, data):
        """Enhance ``data["image"]`` and store the grayscale result.

        Args:
            data: document state dict; ``"image"`` must be a numpy array,
                either colour or already grayscale (2-D).

        Returns:
            The same dict with ``"image"`` replaced by the CLAHE output.
        """
        import cv2

        cfg = self.config if isinstance(self.config, dict) else {}
        img = data["image"]
        clahe = cv2.createCLAHE(
            clipLimit=cfg.get("clip_limit", 2.0),
            # YAML yields a list; OpenCV expects a tuple-like size.
            tileGridSize=tuple(cfg.get("tile_grid_size", (8, 8))),
        )

        if img.ndim == 2:
            # Already single-channel (e.g. the stage was run twice on the
            # same state); cvtColor would reject it, so use it as-is.
            gray = img
        else:
            # NOTE(review): PDFIngestor builds arrays from pdf2image pages,
            # which are presumably RGB; set channel_order: RGB in the
            # config if so - the default keeps the original BGR assumption.
            is_rgb = str(cfg.get("channel_order", "BGR")).upper() == "RGB"
            code = cv2.COLOR_RGB2GRAY if is_rgb else cv2.COLOR_BGR2GRAY
            gray = cv2.cvtColor(img, code)

        data["image"] = clahe.apply(gray)
        return data


In [11]:
class TesseractOCR(PipelineStage):
    """Final stage: extract text from the page image with Tesseract."""

    def run(self, data):
        """Run OCR on ``data["image"]``.

        Returns:
            The same dict with the recognised text stored under
            ``"ocr_text"``.
        """
        import pytesseract

        page_image = data["image"]
        data["ocr_text"] = pytesseract.image_to_string(page_image)
        return data


In [16]:
from pathlib import Path
import yaml, cv2
# load configuration; the context manager closes the file handle as soon as
# parsing finishes instead of leaving it open for the kernel's lifetime
with open("config.yml") as config_file:
    config = yaml.safe_load(config_file)
# define stages - each one receives its own section of the YAML config,
# and they execute in exactly this order
stages = [
    PDFIngestor(config["ingest"]),
    LayoutDetector(config["layout"]),
    MaskRegions(config["mask"]),
    LocalEnhancer(config["enhance"]),
    TesseractOCR(config["ocr"]),
]
# create pipeline
pipe = DocumentPipeline(stages)
# run on one document
doc_data = {"pdf_path": "sample_invoice.pdf"}
result = pipe.run(doc_data)
print(result["ocr_text"])

ImportError: libGL.so.1: cannot open shared object file: No such file or directory