Add support for UTF-16BE strings

Also renames `unescape` to `unescape_ascii_string` in order to make a distinction between working with ASCII text strings and the Unicode UTF-16BE hex strings. Fixes <#7>.
project-renard · Jan 8, 2018 · 9bdc0b8 · 9bdc0b8
1 parent c294f50
commit 9bdc0b8
Show file tree

Hide file tree

Showing 3 changed files with 97 additions and 14 deletions.
diff --git a/dist.ini b/dist.ini
@@ -14,6 +14,7 @@ AutoPrereqs.skip[0] = ^DateTime$
 Test::PodSpelling.stopwords[ 0] = PDF
 Test::PodSpelling.stopwords[ 1] = PNG
 Test::PodSpelling.stopwords[ 2] = initialises
+Test::PodSpelling.stopwords[ 3] = ASCIIHexDecode
 
 [Prereqs / Recommends]
 DateTime = 0
diff --git a/lib/Renard/Incunabula/MuPDF/mutool/ObjectParser.pm b/lib/Renard/Incunabula/MuPDF/mutool/ObjectParser.pm
@@ -5,10 +5,14 @@ package Renard::Incunabula::MuPDF::mutool::ObjectParser;
 use Moo;
 use Renard::Incunabula::Common::Types qw(Str Bool File InstanceOf);
 use Renard::Incunabula::MuPDF::mutool::DateObject;
+use Encode qw(decode encode_utf8);
+use utf8;
 
 =head1 Types
 
   TypeString
+  TypeStringASCII
+  TypeStringUTF16BE
   TypeNumber
   TypeBoolean
   TypeReference
@@ -21,13 +25,15 @@ attribute.
 
 =cut
 use constant {
-	TypeString     => 1,
-	TypeNumber     => 2,
-	TypeBoolean    => 3,
-	TypeReference  => 4,
-	TypeDictionary => 5,
-	TypeDate       => 6,
-	TypeArray      => 7,
+	TypeString            => 1,
+	TypeStringASCII       => 2,
+	TypeStringUTF16BE     => 3,
+	TypeNumber            => 4,
+	TypeBoolean           => 5,
+	TypeReference         => 6,
+	TypeDictionary        => 7,
+	TypeDate              => 8,
+	TypeArray             => 9,
 };
 
 =attr filename
@@ -132,9 +138,12 @@ method _parse() {
 				);
 				$self->type($self->TypeDate);
 			} else {
-				$self->data($self->unescape($string));
-				$self->type($self->TypeString);
+				$self->data($self->unescape_ascii_string($string));
+				$self->type($self->TypeStringASCII);
 			}
+		} elsif( $scalar =~ /^<(?<String>\s*FE\s*FF[^>]*)>/ ) {
+			$self->data( $self->decode_hex_utf16be( $+{String} ) );
+			$self->type($self->TypeStringUTF16BE);
 		} elsif( $scalar =~ /^\[/ ) {
 			$self->data('NOT PARSED');
 			$self->type($self->TypeArray);
@@ -144,16 +153,16 @@ method _parse() {
 	}
 }
 
-=classmethod unescape
+=classmethod unescape_ascii_string
 
-  classmethod unescape((Str) $pdf_string )
+  classmethod unescape_ascii_string((Str) $pdf_string )
 
 A class method that unescapes the escape sequences in a PDF string.
 
 Returns a C<Str>.
 
 =cut
-classmethod unescape((Str) $pdf_string ) {
+classmethod unescape_ascii_string((Str) $pdf_string ) {
 	my $new_string = $pdf_string;
 	# TABLE 3.2 Escape sequences in literal strings (pg. 54)
 	my %map = (
@@ -181,6 +190,54 @@ classmethod unescape((Str) $pdf_string ) {
 	$new_string;
 }
 
+=classmethod decode_hex_utf16be
+
+  classmethod decode_hex_utf16be( (Str) $pdf_string )
+
+A class method that decodes data stored in angle brackets.
+
+Currently only implements Unicode character encoding for what is called a
+I<UTF-16BE encoded string with a leading byte order marker> using
+B<ASCIIHexDecode>:
+
+=for :list
+* first two bytes must be the Unicode byte order marker (C<U+FEFF>),
+* one byte per each pair of hex characters (C<< /[0-9A-F]{2}/ >>))
+* whitespace is ignored
+
+
+See the following parts of PDF Reference 1.7:
+
+
+=for :list
+* Section 3.3.1 ASCIIHexDecode Filter (pg. 69) and
+* Section 3.8.1 Text String Type (pg. 158)
+
+
+Returns a C<Str>.
+
+=cut
+classmethod decode_hex_utf16be( (Str) $pdf_string ) {
+	if( $pdf_string =~ /^FE\s*FF/ ) {
+		# it is a UTF-16BE string
+		my $string = decode('UTF-16',
+			pack(
+				'H*',
+				# remove strings
+				$pdf_string =~ s/\s+//gr
+			)
+		);
+
+		# This is a text string, so we can enable the UTF8 flag.
+		utf8::upgrade($string);
+
+		return $string;
+	} else {
+		# Possibly PDFDocEncoded string type?
+		die "Not a UTF-16BE hex string";
+	}
+}
+
 =attr data
 
 A C<Str> containing the parsed data.

diff --git a/t/Renard/Incunabula/MuPDF/mutool/ObjectParser.t b/t/Renard/Incunabula/MuPDF/mutool/ObjectParser.t
@@ -1,6 +1,7 @@
 #!/usr/bin/env perl
 
-use Test::Most tests => 2;
+use utf8;
+use Test::Most tests => 3;
 
 use Renard::Incunabula::Common::Setup;
 use Renard::Incunabula::MuPDF::mutool::ObjectParser;
@@ -16,7 +17,7 @@ subtest "Unsecape" => sub {
 
 	for my $test (@tests) {
 		is(
-			Renard::Incunabula::MuPDF::mutool::ObjectParser->unescape( $test->{input} ),
+			Renard::Incunabula::MuPDF::mutool::ObjectParser->unescape_ascii_string( $test->{input} ),
 			$test->{output},
 			"unescape @{[ $test->{input} ]}"
 		);
@@ -43,4 +44,28 @@ subtest "Boolean" => sub {
 	}
 };
 
+subtest "Decode hex UTF-16BE" => sub {
+	my @tests = (
+		{
+			input => 'FEFF004D006900630072006F0073006F0066007400AE00200050006F0077006500720050006F0069006E007400AE00200032003000310030',
+			output => 'Microsoft® PowerPoint® 2010',
+		},
+		{
+			# with spaces
+			input => 'FE FF 00 4D 006900630072006F0073006F0066007400AE00200050006F0077006500720050006F0069006E007400AE00200032003000310030',
+			output => 'Microsoft® PowerPoint® 2010',
+		},
+	);
+
+	for my $test (@tests) {
+		binmode STDOUT, ':encoding(UTF-8)';
+		binmode STDERR, ':encoding(UTF-8)';
+		is(
+			Renard::Incunabula::MuPDF::mutool::ObjectParser->decode_hex_utf16be( $test->{input} ),
+			$test->{output},
+			"UTF-16BE decode @{[ $test->{input} ]}"
+		);
+	}
+};
+
 done_testing;