From 5ba6025cfe95e257f0930f4e811a91a999d2490f Mon Sep 17 00:00:00 2001 From: Juan Madurga Date: Wed, 20 Nov 2019 12:55:48 +0100 Subject: [PATCH] feature.force_jpg_conversion_for_image_ocr: convert all images format to jpg not only not RPG mode ones. Google vision do not accept all formarts i.e. pbm or ppm --- document_clipper/pdf.py | 12 ++++-------- tests/new_pdf.pdf | Bin 3488 -> 3488 bytes 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/document_clipper/pdf.py b/document_clipper/pdf.py index e5b3796..10b4be9 100644 --- a/document_clipper/pdf.py +++ b/document_clipper/pdf.py @@ -150,14 +150,10 @@ def get_page_max_dimensions(self, page_tag_node): return self._str_list_to_int_list(params) def _convert_to_jpg(self, image_path): - img = Image.open(image_path) - if img.mode != "RGB": - img = img.convert("RGB") - base = os.path.splitext(image_path)[0] - new_image_path = base + '.jpg' - img.save(new_image_path, 'JPEG') - image_path = new_image_path - return image_path + base = os.path.splitext(image_path)[0] + new_image_path = base + '.jpg' + save_image(Image.open(image_path), new_image_path, 'JPEG') + return new_image_path def _pdf_page_to_text(self, page): pdftotext_cmd = PDFToTextCommand() diff --git a/tests/new_pdf.pdf b/tests/new_pdf.pdf index ed9894ad63949eddc01556de133da9bd428489bc..935ecc91865cc2cb39c97ec02bcb5ce2f4efe946 100644 GIT binary patch delta 659 zcmb7Cze>YE7^hj3Q`c6=@*#H6B6nw+G=-9-Z9ot#sW=L0dySEhljI8O;Na+F$phFy z7gwLcr|?z0Bu#582;OqX_viQLzHg?^)aQq?!-Xpt95o5T=XW;@aL@JF^l9&)%|pRL z5rY8)9E2Sf^GP&hF*Nc(&y8dr==4ji6fo3W$EhuFC|L)hbQLMNEUCSeKZbcGm(+HJ z8*mR<7GmLu53W(FDZN`Qgl#_%ECOr?(n*I6`G_HW%0f?E0VOnX&||5Qyp+hMnbBER zf|bf)e=-o6;go|S+zn-?A9~QtMdj4C{Dp*U?d~tR(|p057Zt4^9p!VwSXO3^i_$-3 zYI^d%^{P;s>b%HxV!LhtJ)Ud+VLpo$u>Z56Nb~mgTTM6R5Yq!8pzJ}2q3ge5jk=bU zcdD#u%HgfnYs8R4DqGeY1FFfD;aq?OP1oln9CKKhiG>@9+lY;kMsC+_jF}tH*l^Q7?)qKz*9> zSu&z2^tG=9E+Ro;EyMt|I1_3#h2h)6UgP+iIRvkQNRTHHM_ya@aW>kDL^S0wFkRBD zA!s%MWn(_P1HSAQ@WH9UDRpURNe3DYc~aavLv1$V;|YslkX@b8JE}cEgV|7+|J5SN zL3B2Qd|F#y+Zbh2zpPv`{7VAVr7RY=kYFMgpe(