Skip to content

Commit

Permalink
consider scaling by fontSize(Tf, Tfs) and text matrix (Tm) (#559)
Browse files Browse the repository at this point in the history
* consider scaling of text space by fontSize(Tf, Tfs) and text matrix (Tm) to calculate accurate x,y-coordinates

* Update src/Smalot/PdfParser/Page.php

Co-authored-by: oliver681 <o_fink01@uni-muenster.de>
Co-authored-by: Konrad Abicht <hi@inspirito.de>
  • Loading branch information
3 people committed Dec 21, 2022
1 parent 6470cc9 commit 70433d1
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 22 deletions.
43 changes: 27 additions & 16 deletions src/Smalot/PdfParser/Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -612,9 +612,7 @@ public function getDataCommands(array $extractedDecodedRawData = null): array

case 'Tf':
case 'TF':
if ($this->config->getDataTmFontInfoHasToBeIncluded()) {
$extractedData[] = $command;
}
$extractedData[] = $command;
break;

/*
Expand Down Expand Up @@ -673,20 +671,32 @@ public function getDataTm(array $dataCommands = null): array
* Set default values for font data
*/
$defaultFontId = -1;
$defaultFontSize = 0;
$defaultFontSize = 1;

/*
* Setting where are the X and Y coordinates in the matrix (Tm)
* Indexes of horizontal/vertical scaling and X,Y-coordinates in the matrix (Tm)
*/
$hSc = 0; // horizontal scaling
/**
* index of vertical scaling in the array that encodes the text matrix.
* for more information: https://github.com/smalot/pdfparser/pull/559#discussion_r1053415500
*/
$vSc = 3;
$x = 4;
$y = 5;

/*
* x,y-coordinates of text space origin in user units
*
* These will be assigned the value of the currently printed string
*/
$Tx = 0;
$Ty = 0;

$Tm = $defaultTm;
$Tl = $defaultTl;
$fontId = $defaultFontId;
$fontSize = $defaultFontSize;
$fontSize = $defaultFontSize; // reflects fontSize set by Tf or Tfs

$extractedTexts = $this->getTextArray();
$extractedData = [];
Expand All @@ -695,11 +705,11 @@ public function getDataTm(array $dataCommands = null): array
switch ($command['o']) {
/*
* BT
* Begin a text object, inicializind the Tm and Tlm to identity matrix
* Begin a text object, initializing the Tm and Tlm to identity matrix
*/
case 'BT':
$Tm = $defaultTm;
$Tl = $defaultTl; // review this.
$Tl = $defaultTl;
$Tx = 0;
$Ty = 0;
$fontId = $defaultFontId;
Expand All @@ -712,20 +722,21 @@ public function getDataTm(array $dataCommands = null): array
*/
case 'ET':
$Tm = $defaultTm;
$Tl = $defaultTl; // review this
$Tl = $defaultTl;
$Tx = 0;
$Ty = 0;
$fontId = $defaultFontId;
$fontSize = $defaultFontSize;
break;

/*
* leading TL
* text leading TL
* Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
* Initial value: 0
*/
case 'TL':
$Tl = (float) $command['c'];
// scaled text leading
$Tl = (float) $command['c'] * (float) $Tm[$vSc];
break;

/*
Expand All @@ -735,8 +746,8 @@ public function getDataTm(array $dataCommands = null): array
*/
case 'Td':
$coord = explode(' ', $command['c']);
$Tx += (float) $coord[0];
$Ty += (float) $coord[1];
$Tx += (float) $coord[0] * (float) $Tm[$hSc];
$Ty += (float) $coord[1] * (float) $Tm[$vSc];
$Tm[$x] = (string) $Tx;
$Tm[$y] = (string) $Ty;
break;
Expand All @@ -752,9 +763,9 @@ public function getDataTm(array $dataCommands = null): array
*/
case 'TD':
$coord = explode(' ', $command['c']);
$Tl = (float) $coord[1];
$Tx += (float) $coord[0];
$Ty -= (float) $coord[1];
$Tl = -((float) $coord[1] * (float) $Tm[$vSc]);
$Tx += (float) $coord[0] * (float) $Tm[$hSc];
$Ty += (float) $coord[1] * (float) $Tm[$vSc];
$Tm[$x] = (string) $Tx;
$Tm[$y] = (string) $Ty;
break;
Expand Down
83 changes: 77 additions & 6 deletions tests/Integration/PageTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,9 @@ public function testGetDataCommands(): void
$pages = $document->getPages();
$page = $pages[0];
$dataCommands = $page->getDataCommands();
$this->assertCount(168, $dataCommands);
$this->assertCount(174, $dataCommands);

$tmItem = $dataCommands[1];
$tmItem = $dataCommands[2];
$this->assertCount(3, $tmItem);
$this->assertArrayHasKey('t', $tmItem);
$this->assertArrayHasKey('o', $tmItem);
Expand All @@ -267,7 +267,7 @@ public function testGetDataCommands(): void
$this->assertStringContainsString('Tm', $tmItem['o']);
$this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tmItem['c']);

$tjItem = $dataCommands[2];
$tjItem = $dataCommands[3];
$this->assertCount(3, $tjItem);
$this->assertArrayHasKey('t', $tjItem);
$this->assertArrayHasKey('o', $tjItem);
Expand All @@ -292,6 +292,7 @@ public function testGetDataTm(): void
$page = $pages[0];

$dataTm = $page->getDataTm();

$this->assertCount(81, $dataTm);

$item = $dataTm[0];
Expand All @@ -308,8 +309,8 @@ public function testGetDataTm(): void
],
$item[0]
);

$this->assertStringContainsString('Document title', $item[1]);

$item = $dataTm[2];
$this->assertEquals(
[
Expand All @@ -322,7 +323,6 @@ public function testGetDataTm(): void
],
$item[0]
);

$this->assertStringContainsString('Calibri : Lorem ipsum dolor sit amet, consectetur a', $item[1]);

$item = $dataTm[80];
Expand All @@ -332,7 +332,7 @@ public function testGetDataTm(): void
'0',
'0',
'1',
'343.003',
'342.840222606',
'81.44',
],
$item[0]
Expand Down Expand Up @@ -443,6 +443,77 @@ public function testGetDataTm(): void
$item[0]
);
$this->assertStringContainsString('Purchase 2', $item[1]);

// test if scaling by fontSize (Tf, Tfs) and text matrix (Tm) are taken into account
$dataCommands = [
['t' => '', 'o' => 'BT', 'c' => ''], // begin text
['t' => '/', 'o' => 'Tf', 'c' => 'TT0 1'], // set font and scale font by 1 pt
['t' => '', 'o' => 'Tm', 'c' => '7.5 -0 0 8.5 45.36 791.52'], // additionally scale by 7.5 pt
['t' => '', 'o' => 'Td', 'c' => '0.568 0'], // move 0.568 * 7.5 pts (7.5 is horizontal scaling) to the right
['t' => '(', 'o' => 'Tj', 'c' => 'test'], // print "test"
['t' => '', 'o' => 'TD', 'c' => '-3.5 -1.291'], // move 3.5 * 7.5 pts left, 1.291 * 8.5 (vertical scaling) pts down and set text leading to 10.9735 (= 1.291 * 8.5)
['t' => '(', 'o' => 'Tj', 'c' => 'another test'], // print "another test"
['t' => '', 'o' => '\'', 'c' => 'again a test'], // go to next line and print "again a test"
['t' => '', 'o' => 'TL', 'c' => '5'], // set text leading by TL
['t' => '', 'o' => '\'', 'c' => 'the next line'], // go to next line and print "the next line"
];

// verify scaling is taken into account for Td
$dataTm = $page->getDataTm($dataCommands);
$item = $dataTm[0];
$this->assertEquals(
[
'7.5',
'-0',
'0',
'8.5',
'49.62',
'791.52',
],
$item[0]
);

// verify scaling is taken into account for TD
$item = $dataTm[1];
$this->assertEquals(
[
'7.5',
'-0',
'0',
'8.5',
'23.37',
'780.5465',
],
$item[0]
);

// verify scaling is taken into account for text leading set by TD
$item = $dataTm[2];
$this->assertEquals(
[
'7.5',
'-0',
'0',
'8.5',
'23.37',
'769.573',
],
$item[0]
);

// verify scaling is taken into account for text leading set by TL
$item = $dataTm[3];
$this->assertEquals(
[
'7.5',
'-0',
'0',
'8.5',
'23.37',
'727.073',
],
$item[0]
);
}

public function testDataTmFontInfoHasToBeIncluded(): void
Expand Down

0 comments on commit 70433d1

Please sign in to comment.