diff --git a/Makefile b/Makefile index 55e065b8..231b4025 100644 --- a/Makefile +++ b/Makefile @@ -15,4 +15,12 @@ prospector: build-test .PHONY: pytest pytest: build-test - docker run --rm --env=PYTHONPATH=/opt/ --volume=$$(pwd)/results:/results --volume=$$(pwd)/tests:/tests tests bash -c 'cd /tests && pytest --durations=0 --verbose --color=yes' + docker run --rm --env=PYTHONPATH=/opt/ --volume=$$(pwd)/results:/results --volume=$$(pwd)/tests:/tests --volume=$$(pwd)/scan_to_paperless:/opt/scan_to_paperless tests bash -c 'cd /tests && pytest --durations=0 --verbose --color=yes' + +.PHONY: pytest-last-failed +pytest-last-failed: + docker run --rm --env=PYTHONPATH=/opt/ --volume=$$(pwd)/results:/results --volume=$$(pwd)/tests:/tests --volume=$$(pwd)/scan_to_paperless:/opt/scan_to_paperless tests bash -c 'cd /tests && pytest --durations=0 --verbose --color=yes --last-failed' + +.PHONY: pytest-exitfirst +pytest-exitfirst: + docker run --rm --env=PYTHONPATH=/opt/ --volume=$$(pwd)/results:/results --volume=$$(pwd)/tests:/tests --volume=$$(pwd)/scan_to_paperless:/opt/scan_to_paperless tests bash -c 'cd /tests && pytest --durations=0 --verbose --color=yes --exitfirst' diff --git a/config.md b/config.md index a8f207c2..a2017c63 100644 --- a/config.md +++ b/config.md @@ -30,5 +30,6 @@ - **`min_box_black_crop`** *(number)*: The minimum black in a box on content find on witch one we will crop [%]. Default: `2`. - **`min_box_black_limit`** *(number)*: The minimum black in a box on content find the limits based on content [%]. Default: `2`. - **`min_box_black_empty`** *(number)*: The minimum black in a box on content find to determine if the page is empty [%]. Default: `2`. + - **`box_kernel_size`** *(number)*: The size of the dilation kernel used in a box on content find [mm]. Default: `1.5`. - **`box_block_size`** *(number)*: The block size used in a box on content find [mm]. Default: `1.5`. 
- - **`box_threshold_value_c`** *(number)*: A variable of double type representing the constant used in the both methods (subtracted from the mean or weighted mean, used in a box on content find. Default: `25`. + - **`box_threshold_value_c`** *(number)*: The constant subtracted from the mean in the adaptive threshold, should be low on low contrast images, used in a box on content find. Default: `70`. diff --git a/process.md b/process.md index acf3459e..1b95efda 100644 --- a/process.md +++ b/process.md @@ -57,5 +57,6 @@ - **`min_box_black_crop`** *(number)*: The minimum black in a box on content find on witch one we will crop [%]. Default: `2`. - **`min_box_black_limit`** *(number)*: The minimum black in a box on content find the limits based on content [%]. Default: `2`. - **`min_box_black_empty`** *(number)*: The minimum black in a box on content find to determine if the page is empty [%]. Default: `2`. - - **`box_block_size`** *(number)*: The block size used in a box on content find [mm]. Default: `1.5`. - - **`box_threshold_value_c`** *(number)*: A variable of double type representing the constant used in the both methods (subtracted from the mean or weighted mean, used in a box on content find. Default: `25`. + - **`box_kernel_size`** *(number)*: The size of the dilation kernel used in a box on content find [mm]. Default: `1.5`. + - **`box_block_size`** *(number)*: The block size used in a box on threshold for content find [mm]. Default: `1.5`. + - **`box_threshold_value_c`** *(number)*: The constant subtracted from the mean in the adaptive threshold, should be low on low contrast images, used in a box on content find. Default: `70`. 
diff --git a/scan_to_paperless/config.py b/scan_to_paperless/config.py index c3a6aa88..772de929 100644 --- a/scan_to_paperless/config.py +++ b/scan_to_paperless/config.py @@ -81,10 +81,14 @@ # The block size used in a box on content find [mm] # # default: 1.5 + "box_kernel_size": Union[int, float], + # The block size used in a box on content find [mm] + # + # default: 1.5 "box_block_size": Union[int, float], - # A variable of double type representing the constant used in the both methods (subtracted from the mean or weighted mean, used in a box on content find + # A variable used on threshold, should be low on low contrast image, used in a box on content find # - # default: 25 + # default: 70 "box_threshold_value_c": Union[int, float], }, total=False, diff --git a/scan_to_paperless/config_schema.json b/scan_to_paperless/config_schema.json index 792c48db..c374048d 100644 --- a/scan_to_paperless/config_schema.json +++ b/scan_to_paperless/config_schema.json @@ -104,6 +104,11 @@ "default": 2, "description": "The minimum black in a box on content find to determine if the page is empty [%]" }, + "box_kernel_size": { + "type": "number", + "default": 1.5, + "description": "The block size used in a box on content find [mm]" + }, "box_block_size": { "type": "number", "default": 1.5, @@ -111,8 +116,8 @@ }, "box_threshold_value_c": { "type": "number", - "default": 25, - "description": "A variable of double type representing the constant used in the both methods (subtracted from the mean or weighted mean, used in a box on content find" + "default": 70, + "description": "A variable used on threshold, should be low on low contrast image, used in a box on content find" } } } diff --git a/scan_to_paperless/process.py b/scan_to_paperless/process.py index 5fea1872..d418bfae 100755 --- a/scan_to_paperless/process.py +++ b/scan_to_paperless/process.py @@ -326,12 +326,15 @@ def crop(context: Context, margin_horizontal: int, margin_vertical: int) -> None Margin in px """ image = 
context.get_masked() + process_count = context.get_process_count() contours = find_contours( image, + f"{process_count}-crop", context.get_px_value("min_box_size_crop", 3), context.config["args"].get("min_box_black_crop", 2), + context.get_px_value("box_kernel_size", 1.5), context.get_px_value("box_block_size", 1.5), - context.config["args"].get("box_threshold_value_c", 25), + context.config["args"].get("box_threshold_value_c", 70), ) if contours: @@ -341,7 +344,7 @@ def crop(context: Context, margin_horizontal: int, margin_vertical: int) -> None save_image( image, context.root_folder, - "{}-crop".format(context.get_process_count()), + "{}-crop".format(process_count), context.image_name, True, ) @@ -547,13 +550,17 @@ def zero_ranges(values: np_ndarray_int) -> np_ndarray_int: def find_limit_contour( image: np_ndarray_int, + name: str, vertical: bool, min_box_size: float, min_box_black: Union[int, float], - block_size: Union[float, int] = 17, - threshold_value_c: Union[float, int] = 25, + kernel_size: Union[float, int] = 16, + block_size: Union[float, int] = 16, + threshold_value_c: Union[float, int] = 100, ) -> Tuple[List[int], List[Tuple[int, int, int, int]]]: - contours = find_contours(image, min_box_size, min_box_black, block_size, threshold_value_c) + contours = find_contours( + image, name, min_box_size, min_box_black, kernel_size, block_size, threshold_value_c + ) image_size = image.shape[1 if vertical else 0] values = np.zeros(image_size) @@ -578,11 +585,13 @@ def fill_limits( peaks, properties = find_lines(image, vertical) contours_limits, contours = find_limit_contour( image, + f"{context.get_process_count()}-limits", vertical, context.get_px_value("min_box_size_limit", 10), context.config["args"].get("min_box_black_limit", 2), + context.get_px_value("box_kernel_size", 1.5), context.get_px_value("box_block_size", 1.5), - context.config["args"].get("box_threshold_value_c", 25), + context.config["args"].get("box_threshold_value_c", 70), ) for contour_limit in 
contours: draw_rectangle(image, contour_limit) @@ -613,21 +622,26 @@ def fill_limits( def find_contours( image: np_ndarray_int, + name: str, min_size: Union[float, int], min_black: Union[float, int], + kernel_size: Union[float, int] = 16, block_size: Union[float, int] = 16, - threshold_value_c: Union[float, int] = 25, + threshold_value_c: Union[float, int] = 100, ) -> List[Tuple[int, int, int, int]]: gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) block_size = int(round(block_size / 2) * 2) + kernel_size = int(round(kernel_size / 2)) # Clean the image using otsu method with the inversed binarized image thresh = cv2.adaptiveThreshold( gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, block_size + 1, threshold_value_c ) + if os.environ.get("PROGRESS", "FALSE") == "TRUE": + cv2.imwrite(os.path.join(name, "threshold.png"), thresh) # Assign a rectangle kernel size - kernel = np.ones((5, 5), "uint8") + kernel = np.ones((kernel_size, kernel_size), "uint8") par_img = cv2.dilate(thresh, kernel, iterations=5) contours, _ = cv2.findContours(par_img.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) @@ -640,7 +654,12 @@ def find_contours( contour_image = rgb2gray(contour_image) if ((1 - np.mean(contour_image)) * 100) > min_black: result.append( - (x + block_size / 2, y + block_size / 2, width - block_size, height - block_size) + ( + x + kernel_size * 2, + y + kernel_size * 2, + width - kernel_size * 4, + height - kernel_size * 4, + ) ) return result @@ -665,7 +684,7 @@ def transform( images = [] process_count = 0 - if config["args"]["assisted_split"]: + if config["args"].get("assisted_split", False): config["assisted_split"] = [] for index, img in enumerate(step["sources"]): @@ -691,10 +710,12 @@ def transform( # Is empty ? 
contours = find_contours( context.get_masked(), + f"{context.get_process_count()}-is-empty", context.get_px_value("min_box_size_empty", 20), context.config["args"].get("min_box_black_crop", 2), + context.get_px_value("box_kernel_size", 1.5), context.get_px_value("box_block_size", 1.5), - context.config["args"].get("box_threshold_value_c", 25), + context.config["args"].get("box_threshold_value_c", 70), ) if not contours: print("Ignore image with no content: {}".format(img)) @@ -702,7 +723,7 @@ def transform( tesseract(context) - if config["args"]["assisted_split"]: + if config["args"].get("assisted_split", False): assisted_split: scan_to_paperless.process_schema.AssistedSplit = {} name = os.path.join(root_folder, context.image_name) assert context.image is not None @@ -739,7 +760,7 @@ def transform( return { "sources": images, - "name": "split" if config["args"]["assisted_split"] else "finalise", + "name": "split" if config["args"].get("assisted_split", False) else "finalise", "process_count": process_count, } @@ -925,7 +946,7 @@ def finalise( images = step["sources"] - if config["args"]["append_credit_card"]: + if config["args"].get("append_credit_card", False): images2 = [] for img in images: if os.path.exists(img): diff --git a/scan_to_paperless/process_schema.json b/scan_to_paperless/process_schema.json index 0be13fd1..3ffcfaca 100644 --- a/scan_to_paperless/process_schema.json +++ b/scan_to_paperless/process_schema.json @@ -108,15 +108,20 @@ "default": 2, "description": "The minimum black in a box on content find to determine if the page is empty [%]" }, - "box_block_size": { + "box_kernel_size": { "type": "number", "default": 1.5, "description": "The block size used in a box on content find [mm]" }, + "box_block_size": { + "type": "number", + "default": 1.5, + "description": "The block size used in a box on threshold for content find [mm]" + }, "box_threshold_value_c": { "type": "number", - "default": 25, - "description": "A variable of double type 
representing the constant used in the both methods (subtracted from the mean or weighted mean, used in a box on content find" + "default": 70, + "description": "A variable used on threshold, should be low on low contrast image, used in a box on content find" } } } diff --git a/scan_to_paperless/process_schema.py b/scan_to_paperless/process_schema.py index 66299a73..cbebed36 100644 --- a/scan_to_paperless/process_schema.py +++ b/scan_to_paperless/process_schema.py @@ -85,10 +85,14 @@ # The block size used in a box on content find [mm] # # default: 1.5 + "box_kernel_size": Union[int, float], + # The block size used in a box on threshold for content find [mm] + # + # default: 1.5 "box_block_size": Union[int, float], - # A variable of double type representing the constant used in the both methods (subtracted from the mean or weighted mean, used in a box on content find + # A variable used on threshold, should be low on low contrast image, used in a box on content find # - # default: 25 + # default: 70 "box_threshold_value_c": Union[int, float], }, total=False, diff --git a/tests/600.expected.png b/tests/600.expected.png new file mode 100644 index 00000000..1356834c Binary files /dev/null and b/tests/600.expected.png differ diff --git a/tests/600.png b/tests/600.png new file mode 100644 index 00000000..98f6eec3 Binary files /dev/null and b/tests/600.png differ diff --git a/tests/assisted-split-contour-1.expected.png b/tests/assisted-split-contour-1.expected.png index 44ded90e..53d01855 100644 Binary files a/tests/assisted-split-contour-1.expected.png and b/tests/assisted-split-contour-1.expected.png differ diff --git a/tests/assisted-split-contour-3.expected.png b/tests/assisted-split-contour-3.expected.png index 6cba37cf..65d4a078 100644 Binary files a/tests/assisted-split-contour-3.expected.png and b/tests/assisted-split-contour-3.expected.png differ diff --git a/tests/assisted-split-contour-5.expected.png b/tests/assisted-split-contour-5.expected.png index 
34d141ee..f324730d 100644 Binary files a/tests/assisted-split-contour-5.expected.png and b/tests/assisted-split-contour-5.expected.png differ diff --git a/tests/assisted-split-join-1.expected.png b/tests/assisted-split-join-1.expected.png index f2ae38c5..7afd7303 100644 Binary files a/tests/assisted-split-join-1.expected.png and b/tests/assisted-split-join-1.expected.png differ diff --git a/tests/assisted-split-join-2.expected.png b/tests/assisted-split-join-2.expected.png index 45ff08c1..f360e0f7 100644 Binary files a/tests/assisted-split-join-2.expected.png and b/tests/assisted-split-join-2.expected.png differ diff --git a/tests/assisted-split-lines-1.expected.png b/tests/assisted-split-lines-1.expected.png index beee2e0b..39c6ae2d 100644 Binary files a/tests/assisted-split-lines-1.expected.png and b/tests/assisted-split-lines-1.expected.png differ diff --git a/tests/assisted-split-lines-3.expected.png b/tests/assisted-split-lines-3.expected.png index 09b63cb3..c4333ded 100644 Binary files a/tests/assisted-split-lines-3.expected.png and b/tests/assisted-split-lines-3.expected.png differ diff --git a/tests/assisted-split-lines-4.expected.png b/tests/assisted-split-lines-4.expected.png index ebd35b68..c3676c60 100644 Binary files a/tests/assisted-split-lines-4.expected.png and b/tests/assisted-split-lines-4.expected.png differ diff --git a/tests/assisted-split-lines-5.expected.png b/tests/assisted-split-lines-5.expected.png index 1aaeb992..fba27c75 100644 Binary files a/tests/assisted-split-lines-5.expected.png and b/tests/assisted-split-lines-5.expected.png differ diff --git a/tests/credit-card-1.expected.png b/tests/credit-card-1.expected.png index 967894d5..d236de0f 100644 Binary files a/tests/credit-card-1.expected.png and b/tests/credit-card-1.expected.png differ diff --git a/tests/test_process.py b/tests/test_process.py index e21c1e6a..67f3fa70 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -19,8 +19,8 @@ def test_find_lines(): def 
test_find_limit_contour(): - limits, _ = process.find_limit_contour(load_image("limit-contour-1.png"), True, 40, 2) - assert limits == [1588] + limits, _ = process.find_limit_contour(load_image("limit-contour-1.png"), "test", True, 40, 2) + assert limits == [1592] def check_image_file(root_folder, image, name, level=0.9): @@ -58,6 +58,7 @@ def test_crop(): check_image(root_folder, process.crop_image(image, -100, 100, 200, 100, (255, 255, 255)), "crop-4") check_image(root_folder, process.crop_image(image, 100, 200, 100, 200, (255, 255, 255)), "crop-5") check_image(root_folder, process.crop_image(image, 200, 100, 200, 100, (255, 255, 255)), "crop-6") + shutil.rmtree(root_folder) def test_rotate(): @@ -72,6 +73,7 @@ def test_rotate(): check_image(root_folder, process.rotate_image(image, -90, (255, 255, 255)), "rotate-4") check_image(root_folder, process.rotate_image(image, 270, (255, 255, 255)), "rotate-4") check_image(root_folder, process.rotate_image(image, 180, (255, 255, 255)), "rotate-5") + shutil.rmtree(root_folder) def init_test(): @@ -86,10 +88,10 @@ def init_test(): @pytest.mark.parametrize( "type_,limit", [ - ("lines", {"name": "VL0", "type": "line detection", "value": 1812, "vertical": True, "margin": 0}), + ("lines", {"name": "VL1", "type": "line detection", "value": 1812, "vertical": True, "margin": 0}), ( "contour", - {"name": "VC0", "type": "contour detection", "value": 1617, "vertical": True, "margin": 0}, + {"name": "VC0", "type": "contour detection", "value": 1616, "vertical": True, "margin": 0}, ), ], ) @@ -108,7 +110,6 @@ def test_assisted_split_full(type_, limit): "args": { "assisted_split": True, "level": True, - "append_credit_card": False, "tesseract": False, "sharpen": True, }, @@ -153,6 +154,7 @@ def test_assisted_split_full(type_, limit): ] ) check_image_file(root_folder, os.path.join(root_folder, "final.png"), f"assisted-split-{type_}-5") + shutil.rmtree(root_folder) # @pytest.mark.skip(reason='for test') @@ -173,7 +175,6 @@ def 
test_assisted_split_join_full(): "args": { "assisted_split": True, "level": True, - "append_credit_card": False, "tesseract": False, }, "destination": os.path.join(root_folder, "final.pdf"), @@ -217,6 +218,7 @@ def test_assisted_split_join_full(): ] ) check_image_file(root_folder, os.path.join(root_folder, "final.png"), "assisted-split-join-2") + shutil.rmtree(root_folder) # @pytest.mark.skip(reason='for test') @@ -236,7 +238,6 @@ def test_assisted_split_booth(): "args": { "assisted_split": True, "level": False, - "append_credit_card": False, "nocrop": True, "tesseract": False, "margin_horizontal": 0, @@ -266,6 +267,7 @@ def test_assisted_split_booth(): check_image_file(root_folder, step["sources"][1], "assisted-split-booth-2") check_image_file(root_folder, step["sources"][2], "assisted-split-booth-3") check_image_file(root_folder, step["sources"][3], "assisted-split-booth-4") + shutil.rmtree(root_folder) # @pytest.mark.skip(reason='for test') @@ -280,9 +282,7 @@ def test_full(progress, experimental): os.makedirs(root_folder) config = { "args": { - "assisted_split": False, "level": True, - "append_credit_card": False, "tesseract": False, }, "destination": os.path.join(root_folder, "final.pdf"), @@ -318,6 +318,7 @@ def test_full(progress, experimental): ] ) check_image_file(root_folder, os.path.join(root_folder, "final.png"), "all-2") + shutil.rmtree(root_folder) # @pytest.mark.skip(reason='for test') @@ -329,7 +330,6 @@ def test_credit_card_full(): os.makedirs(root_folder) config = { "args": { - "assisted_split": False, "level": True, "append_credit_card": True, }, @@ -360,6 +360,7 @@ def test_credit_card_full(): ] ) check_image_file(root_folder, os.path.join(root_folder, "final.png"), "credit-card-1") + shutil.rmtree(root_folder) # @pytest.mark.skip(reason='for test') @@ -371,9 +372,7 @@ def test_empty(): os.makedirs(root_folder) config = { "args": { - "assisted_split": False, "level": True, - "append_credit_card": False, }, "destination": 
os.path.join(root_folder, "final.pdf"), } @@ -385,3 +384,21 @@ def test_empty(): step = process.transform(config, step, "/tmp/test-config.yaml", root_folder) assert len(step["sources"]) == 0 assert step["name"] == "finalise" + shutil.rmtree(root_folder) + + +# @pytest.mark.skip(reason='for test') +@pytest.mark.parametrize("test,args", [("600", {"dpi": 600})]) +def test_custom_process(test, args): + init_test() + root_folder = f"/results/600" + if not os.path.exists(root_folder): + os.makedirs(root_folder) + config = { + "args": args, + } + step = {"sources": [os.path.join(os.path.dirname(__file__), f"{test}.png")]} + step = process.transform(config, step, "/tmp/test-config.yaml", root_folder) + assert len(step["sources"]) == 1 + check_image_file(root_folder, step["sources"][0], test) + shutil.rmtree(root_folder)