Skip to content

Commit

Permalink
Add support for UTF-16BE strings
Browse files Browse the repository at this point in the history
Also renames `unescape` to `unescape_ascii_string` in order to make a
distinction between working with ASCII text strings and the Unicode
UTF-16BE hex strings.

Fixes <#7>.
  • Loading branch information
zmughal committed Jan 8, 2018
1 parent c294f50 commit 9bdc0b8
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 14 deletions.
1 change: 1 addition & 0 deletions dist.ini
Expand Up @@ -14,6 +14,7 @@ AutoPrereqs.skip[0] = ^DateTime$
Test::PodSpelling.stopwords[ 0] = PDF
Test::PodSpelling.stopwords[ 1] = PNG
Test::PodSpelling.stopwords[ 2] = initialises
Test::PodSpelling.stopwords[ 3] = ASCIIHexDecode

[Prereqs / Recommends]
DateTime = 0
81 changes: 69 additions & 12 deletions lib/Renard/Incunabula/MuPDF/mutool/ObjectParser.pm
Expand Up @@ -5,10 +5,14 @@ package Renard::Incunabula::MuPDF::mutool::ObjectParser;
use Moo;
use Renard::Incunabula::Common::Types qw(Str Bool File InstanceOf);
use Renard::Incunabula::MuPDF::mutool::DateObject;
use Encode qw(decode encode_utf8);
use utf8;

=head1 Types
TypeString
TypeStringASCII
TypeStringUTF16BE
TypeNumber
TypeBoolean
TypeReference
Expand All @@ -21,13 +25,15 @@ attribute.
=cut
# Integer tags naming the kind of value a parsed object holds. The parser
# stores one of these through $self->type(...) next to the decoded data.
use constant {
# NOTE(review): the next seven entries look like the pre-change numbering
# left in by the diff rendering. In a hash constructor a later duplicate key
# replaces an earlier one, so the second group below is what would take
# effect -- confirm against the real file.
TypeString => 1,
TypeNumber => 2,
TypeBoolean => 3,
TypeReference => 4,
TypeDictionary => 5,
TypeDate => 6,
TypeArray => 7,
# Numbering that includes the two string flavours set by the parser:
# TypeStringASCII for unescaped literal strings and TypeStringUTF16BE for
# hex strings decoded with decode_hex_utf16be.
TypeString => 1,
TypeStringASCII => 2,
TypeStringUTF16BE => 3,
TypeNumber => 4,
TypeBoolean => 5,
TypeReference => 6,
TypeDictionary => 7,
TypeDate => 8,
TypeArray => 9,
};

=attr filename
Expand Down Expand Up @@ -132,9 +138,12 @@ method _parse() {
);
$self->type($self->TypeDate);
} else {
$self->data($self->unescape($string));
$self->type($self->TypeString);
$self->data($self->unescape_ascii_string($string));
$self->type($self->TypeStringASCII);
}
} elsif( $scalar =~ /^<(?<String>\s*FE\s*FF[^>]*)>/ ) {
$self->data( $self->decode_hex_utf16be( $+{String} ) );
$self->type($self->TypeStringUTF16BE);
} elsif( $scalar =~ /^\[/ ) {
$self->data('NOT PARSED');
$self->type($self->TypeArray);
Expand All @@ -144,16 +153,16 @@ method _parse() {
}
}

=classmethod unescape
=classmethod unescape_ascii_string
classmethod unescape((Str) $pdf_string )
classmethod unescape_ascii_string((Str) $pdf_string )
A class method that unescapes the escape sequences in a PDF string.
Returns a C<Str>.
=cut
classmethod unescape((Str) $pdf_string ) {
classmethod unescape_ascii_string((Str) $pdf_string ) {
my $new_string = $pdf_string;
# TABLE 3.2 Escape sequences in literal strings (pg. 54)
my %map = (
Expand Down Expand Up @@ -181,6 +190,54 @@ classmethod unescape((Str) $pdf_string ) {
$new_string;
}

=classmethod decode_hex_utf16be

  classmethod decode_hex_utf16be( (Str) $pdf_string )

A class method that decodes data stored in angle brackets.

Currently only implements Unicode character encoding for what is called a
I<UTF-16BE encoded string with a leading byte order marker> using
B<ASCIIHexDecode>:

=for :list
* first two bytes must be the Unicode byte order marker (C<U+FEFF>),
* one byte per pair of hex characters (C<< /[0-9A-Fa-f]{2}/ >>),
* whitespace is ignored, including any that precedes the byte order marker.

See the following parts of PDF Reference 1.7:

=for :list
* Section 3.3.1 ASCIIHexDecode Filter (pg. 69) and
* Section 3.8.1 Text String Type (pg. 158)

Returns a C<Str>.

Dies with C<Not a UTF-16BE hex string> when the data does not begin with the
byte order marker.

=cut

classmethod decode_hex_utf16be( (Str) $pdf_string ) {
    # Whitespace has no meaning inside a hex string, so drop it before looking
    # at the content. The parser's own pattern lets whitespace precede the
    # marker, which an anchored check on the raw text would reject.
    my $hex = $pdf_string =~ s/\s+//gr;

    # Hex digits may be written in either case.
    if( $hex !~ /\AFEFF/i ) {
        # Possibly PDFDocEncoded string type?
        die "Not a UTF-16BE hex string";
    }

    # It is a UTF-16BE string: turn the hex digits into bytes and let the
    # 'UTF-16' decoder pick the byte order from the leading marker.
    my $string = decode( 'UTF-16', pack( 'H*', $hex ) );

    # This is a text string, so we can enable the UTF8 flag.
    utf8::upgrade($string);

    return $string;
}

=attr data
A C<Str> containing the parsed data.
Expand Down
29 changes: 27 additions & 2 deletions t/Renard/Incunabula/MuPDF/mutool/ObjectParser.t
@@ -1,6 +1,7 @@
#!/usr/bin/env perl

use Test::Most tests => 2;
use utf8;
use Test::Most tests => 3;

use Renard::Incunabula::Common::Setup;
use Renard::Incunabula::MuPDF::mutool::ObjectParser;
Expand All @@ -16,7 +17,7 @@ subtest "Unsecape" => sub {

for my $test (@tests) {
is(
Renard::Incunabula::MuPDF::mutool::ObjectParser->unescape( $test->{input} ),
Renard::Incunabula::MuPDF::mutool::ObjectParser->unescape_ascii_string( $test->{input} ),
$test->{output},
"unescape @{[ $test->{input} ]}"
);
Expand All @@ -43,4 +44,28 @@ subtest "Boolean" => sub {
}
};

# Checks that hex strings carrying a UTF-16BE byte order marker decode to the
# expected text, both with and without whitespace between the hex digits.
subtest "Decode hex UTF-16BE" => sub {
    my @tests = (
        {
            input => 'FEFF004D006900630072006F0073006F0066007400AE00200050006F0077006500720050006F0069006E007400AE00200032003000310030',
            output => 'Microsoft® PowerPoint® 2010',
        },
        {
            # with spaces
            input => 'FE FF 00 4D 006900630072006F0073006F0066007400AE00200050006F0077006500720050006F0069006E007400AE00200032003000310030',
            output => 'Microsoft® PowerPoint® 2010',
        },
    );

    # Set the output encoding once, ahead of the loop. Each binmode call with
    # an ':encoding(...)' layer pushes a further layer onto the handle, so
    # repeating it for every test case would stack them.
    binmode STDOUT, ':encoding(UTF-8)';
    binmode STDERR, ':encoding(UTF-8)';

    for my $test (@tests) {
        is(
            Renard::Incunabula::MuPDF::mutool::ObjectParser->decode_hex_utf16be( $test->{input} ),
            $test->{output},
            "UTF-16BE decode @{[ $test->{input} ]}"
        );
    }
};

done_testing;

0 comments on commit 9bdc0b8

Please sign in to comment.