From 9bdc0b8f429b3f23611a9f5efe7b4270d2f21856 Mon Sep 17 00:00:00 2001
From: Zakariyya Mughal
Date: Sun, 7 Jan 2018 23:31:24 -0600
Subject: [PATCH] Add support for UTF-16BE strings

Also renames `unescape` to `unescape_ascii_string` in order to make a
distinction between working with ASCII text strings and the Unicode
UTF-16BE hex strings.

Fixes .
---
 dist.ini                                      |  1 +
 .../Incunabula/MuPDF/mutool/ObjectParser.pm   | 81 ++++++++++++++++---
 .../Incunabula/MuPDF/mutool/ObjectParser.t    | 29 ++++++-
 3 files changed, 97 insertions(+), 14 deletions(-)

diff --git a/dist.ini b/dist.ini
index e6de9ea..9e3f8b3 100644
--- a/dist.ini
+++ b/dist.ini
@@ -14,6 +14,7 @@ AutoPrereqs.skip[0] = ^DateTime$
 Test::PodSpelling.stopwords[ 0] = PDF
 Test::PodSpelling.stopwords[ 1] = PNG
 Test::PodSpelling.stopwords[ 2] = initialises
+Test::PodSpelling.stopwords[ 3] = ASCIIHexDecode
 
 [Prereqs / Recommends]
 DateTime = 0
diff --git a/lib/Renard/Incunabula/MuPDF/mutool/ObjectParser.pm b/lib/Renard/Incunabula/MuPDF/mutool/ObjectParser.pm
index 7a856f1..06e5d82 100644
--- a/lib/Renard/Incunabula/MuPDF/mutool/ObjectParser.pm
+++ b/lib/Renard/Incunabula/MuPDF/mutool/ObjectParser.pm
@@ -5,10 +5,14 @@ package Renard::Incunabula::MuPDF::mutool::ObjectParser;
 use Moo;
 use Renard::Incunabula::Common::Types qw(Str Bool File InstanceOf);
 use Renard::Incunabula::MuPDF::mutool::DateObject;
+use Encode qw(decode encode_utf8);
+use utf8;
 
 =head1 Types
 
   TypeString
+  TypeStringASCII
+  TypeStringUTF16BE
   TypeNumber
   TypeBoolean
   TypeReference
@@ -21,13 +25,15 @@ attribute.
 =cut
 
 use constant {
-	TypeString     => 1,
-	TypeNumber     => 2,
-	TypeBoolean    => 3,
-	TypeReference  => 4,
-	TypeDictionary => 5,
-	TypeDate       => 6,
-	TypeArray      => 7,
+	TypeString        => 1,
+	TypeStringASCII   => 2,
+	TypeStringUTF16BE => 3,
+	TypeNumber        => 4,
+	TypeBoolean       => 5,
+	TypeReference     => 6,
+	TypeDictionary    => 7,
+	TypeDate          => 8,
+	TypeArray         => 9,
 };
 
 =attr filename
@@ -132,9 +138,12 @@ method _parse() {
 				);
 				$self->type($self->TypeDate);
 			} else {
-				$self->data($self->unescape($string));
-				$self->type($self->TypeString);
+				$self->data($self->unescape_ascii_string($string));
+				$self->type($self->TypeStringASCII);
 			}
+		} elsif( $scalar =~ /^<(?<String>\s*F\s*E\s*F\s*F[^>]*)>/i ) {
+			$self->data( $self->decode_hex_utf16be( $+{String} ) );
+			$self->type($self->TypeStringUTF16BE);
 		} elsif( $scalar =~ /^\[/ ) {
 			$self->data('NOT PARSED');
 			$self->type($self->TypeArray);
@@ -144,16 +153,16 @@ method _parse() {
 	}
 }
 
-=classmethod unescape
+=classmethod unescape_ascii_string
 
-  classmethod unescape((Str) $pdf_string )
+  classmethod unescape_ascii_string((Str) $pdf_string )
 
 A class method that unescapes the escape sequences in a PDF string.
 
 Returns a C<Str>.
 
 =cut
-classmethod unescape((Str) $pdf_string ) {
+classmethod unescape_ascii_string((Str) $pdf_string ) {
 	my $new_string = $pdf_string;
 	# TABLE 3.2 Escape sequences in literal strings (pg. 54)
 	my %map = (
@@ -181,6 +190,54 @@ classmethod unescape((Str) $pdf_string ) {
 	$new_string;
 }
 
+=classmethod decode_hex_utf16be
+
+  classmethod decode_hex_utf16be( (Str) $pdf_string )
+
+A class method that decodes data stored in angle brackets.
+
+Currently only implements Unicode character encoding for what is called a
+I<UTF-16BE encoded text string> using
+B<ASCIIHexDecode>:
+
+=for :list
+* first two bytes must be the Unicode byte order marker (C<FEFF>),
+* one byte per each pair of hex characters (C<< /[0-9A-Fa-f]{2}/ >>),
+* whitespace is ignored
+
+
+See the following parts of PDF Reference 1.7:
+
+
+=for :list
+* Section 3.3.1 ASCIIHexDecode Filter (pg. 69) and
+* Section 3.8.1 Text String Type (pg. 158)
+
+
+Returns a C<Str>. Dies if the string does not start with the byte order marker.
+
+=cut
+classmethod decode_hex_utf16be( (Str) $pdf_string ) {
+	# Whitespace carries no meaning inside a hex string, so drop all of
+	# it up front; this also tolerates padding before the marker.
+	my $hex = $pdf_string =~ s/\s+//gr;
+
+	# Hex digits may be written in either case.
+	if( $hex =~ /\AFEFF/i ) {
+		# it is a UTF-16BE string: the 'UTF-16' decoder consumes
+		# the byte order marker and selects big-endian from it.
+		my $string = decode('UTF-16', pack( 'H*', $hex ));
+
+		# This is a text string, so we can enable the UTF8 flag.
+		utf8::upgrade($string);
+
+		return $string;
+	} else {
+		# Possibly PDFDocEncoded string type?
+		die "Not a UTF-16BE hex string";
+	}
+}
+
 =attr data
 
 A C<Str> containing the parsed data.
diff --git a/t/Renard/Incunabula/MuPDF/mutool/ObjectParser.t b/t/Renard/Incunabula/MuPDF/mutool/ObjectParser.t
index 4eaff3b..3ced787 100644
--- a/t/Renard/Incunabula/MuPDF/mutool/ObjectParser.t
+++ b/t/Renard/Incunabula/MuPDF/mutool/ObjectParser.t
@@ -1,6 +1,7 @@
 #!/usr/bin/env perl
 
-use Test::Most tests => 2;
+use utf8;
+use Test::Most tests => 3;
 
 use Renard::Incunabula::Common::Setup;
 use Renard::Incunabula::MuPDF::mutool::ObjectParser;
@@ -16,7 +17,7 @@ subtest "Unsecape" => sub {
 
 	for my $test (@tests) {
 		is(
-			Renard::Incunabula::MuPDF::mutool::ObjectParser->unescape( $test->{input} ),
+			Renard::Incunabula::MuPDF::mutool::ObjectParser->unescape_ascii_string( $test->{input} ),
 			$test->{output},
 			"unescape @{[ $test->{input} ]}"
 		);
@@ -43,4 +44,28 @@ subtest "Boolean" => sub {
 	}
 };
 
+subtest "Decode hex UTF-16BE" => sub {
+	my @tests = (
+		{
+			input => 'FEFF004D006900630072006F0073006F0066007400AE00200050006F0077006500720050006F0069006E007400AE00200032003000310030',
+			output => 'Microsoft® PowerPoint® 2010',
+		},
+		{
+			# with spaces
+			input => 'FE FF 00 4D 006900630072006F0073006F0066007400AE00200050006F0077006500720050006F0069006E007400AE00200032003000310030',
+			output => 'Microsoft® PowerPoint® 2010',
+		},
+	);
+	# Apply once, outside the loop: each binmode call stacks another encoding layer.
+	binmode $_, ':encoding(UTF-8)' for \*STDOUT, \*STDERR;
+
+	for my $test (@tests) {
+		is(
+			Renard::Incunabula::MuPDF::mutool::ObjectParser->decode_hex_utf16be( $test->{input} ),
+			$test->{output},
+			"UTF-16BE decode @{[ $test->{input} ]}"
+		);
+	}
+};
+
 done_testing;