From f42fa4eba3c343cd3b9bcdfdf93bcd98353eec9a Mon Sep 17 00:00:00 2001 From: rogmann Date: Wed, 28 Sep 2022 00:12:57 +0200 Subject: [PATCH 1/3] BUG: Operator Td involves a matrix multiplication. --- PyPDF2/_page.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 459dae05c..66848bdb7 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1490,8 +1490,13 @@ def process_operation(operator: bytes, operands: List) -> None: # Table 5.5 page 406 elif operator == b"Td": check_crlf_space = True - tm_matrix[4] += float(operands[0]) - tm_matrix[5] += float(operands[1]) + # A special case is a translating only tm: + # tm[0..5] = 1 0 0 1 e f, + # i.e. tm[4] += tx, tm[5] += ty. + tx = float(operands[0]) + ty = float(operands[1]) + tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] + tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] elif operator == b"Tm": check_crlf_space = True tm_matrix = [ From 84a6f5748a5967487e3af9db6a721ea8d633636b Mon Sep 17 00:00:00 2001 From: rogmann Date: Wed, 28 Sep 2022 00:16:07 +0200 Subject: [PATCH 2/3] TST: Added test of Td-operation, adjusted test iss_1142. --- resources/Sample_Td-matrix.pdf | Bin 0 -> 1304 bytes tests/test_page.py | 26 +++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 resources/Sample_Td-matrix.pdf diff --git a/resources/Sample_Td-matrix.pdf b/resources/Sample_Td-matrix.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b3ea5de26930bd815fe88f75eaf34f929efc7b98 GIT binary patch literal 1304 zcmaJ>QE%EX5PtWsxILh@njm%{g-sO?v}K#NY88$4vhBeT1F1q>Ifl~y`h90e$!OI? z4(4+HzPs=4(z?AGw^gSnT5ViHkt9347Z(>4-Rgy@s0+(P5K)vqJs65EwbzARQ1r{V zDz_y?I8DQ$FlELk;|9W|Gup%FN_#`6SD#}U$8r#NardPBDB}|;PtL&PY_^^mN2i;7 zb=Cl5@;aPLyDmK`ioWMrMZc-XfzjVOu3fZ;IQaG9Xz5Mq`BYM$x-nUkN(H1L3juK9KhfPlgl>;el#TKEOt(k2 z$i!jFw$982IE8$<=sZcf3C-!nrqJkoQF*WpP@*H7N(QfNu|m>Ee%%4Y_}_+cKx(zk vJDnH!n(&a`<LfdJ^eRD4JNy9}?b&QPa+cJd_;tMXPoFb1cL^id;cf literal 0 HcmV?d00001 diff --git a/tests/test_page.py b/tests/test_page.py index 5c78b14bb..6dbee6833 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -297,7 +297,14 @@ def test_iss_1142(): name = "st2019.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) txt = reader.pages[3].extract_text() - assert txt.find("有限公司郑州分公司") > 0 + # The following text is contained in two different cells: + # assert txt.find("有限公司郑州分公司") > 0 + # 有限公司 = limited company + # 郑州分公司 = branch office in Zhengzhou + # First cell (see page 4/254): + assert txt.find("郑州药素电子商务有限公司") > 0 + # Next cell (first cell in next line): + assert txt.find("郑州分公司") > 0 @pytest.mark.parametrize( @@ -604,6 +611,23 @@ def filter_first_table(r): assert texts.font_dict["/Encoding"] == "/WinAnsiEncoding" assert text_dat_of_date.font_size == 9.96 + # Test 3: Read a table in a document using a non-translating + # but scaling Tm-operand + reader = PdfReader(RESOURCE_ROOT / "Sample_Td-matrix.pdf") + page_td_model = reader.pages[0] + # We store the translations of the Td-executions. + list_Td = [] + def visitor_td(op, args, cm, tm): + if op == b"Td": + list_Td.append((tm[4], tm[5])) + page_td_model.extract_text(visitor_operand_after=visitor_td) + assert len(list_Td) == 4 + # Check the translations of the four Td-executions. + assert list_Td[0] == (210.0, 110.0) + assert list_Td[1] == (410.0, 110.0) + assert list_Td[2] == (210.0, 210.0) + assert list_Td[3] == (410.0, 210.0) + @pytest.mark.parametrize( ("pdf_path", "password", "embedded", "unembedded"), From e5be25b9016f7b398e8a678dffcaa9f43f49116e Mon Sep 17 00:00:00 2001 From: rogmann Date: Wed, 28 Sep 2022 13:01:39 +0200 Subject: [PATCH 3/3] STY: Executed black. --- tests/test_page.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_page.py b/tests/test_page.py index 6dbee6833..8516b666c 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -617,9 +617,11 @@ def filter_first_table(r): page_td_model = reader.pages[0] # We store the translations of the Td-executions. list_Td = [] + def visitor_td(op, args, cm, tm): if op == b"Td": list_Td.append((tm[4], tm[5])) + page_td_model.extract_text(visitor_operand_after=visitor_td) assert len(list_Td) == 4 # Check the translations of the four Td-executions.