Skip to content

Commit

Permalink
Auto-detect utf8 PDF strings
Browse files Browse the repository at this point in the history
  • Loading branch information
ssimms committed May 21, 2019
1 parent e8e9c6f commit 8faf345
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 40 deletions.
5 changes: 5 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@

- Add support for cross-reference streams using 64-bit field widths.

- When the utf8 flag is set for a PDF string, automatically encode it as
UTF-16BE instead of requiring a separate flag to be set in the PDF object.
This resolves [RT #33497] and [RT #117031] in addition to making the code
easier to maintain.

- [RT #126274] Fix alignment when using UniFont with text_center or
text_right when all characters are in the same block.

Expand Down
10 changes: 1 addition & 9 deletions lib/PDF/API2.pm
Original file line number Diff line number Diff line change
Expand Up @@ -635,15 +635,7 @@ sub info {
if (scalar @_) {
foreach my $k (@{$self->{'infoMeta'}}) {
next unless defined $opt{$k};
if (is_utf8($opt{$k})) {
$self->{'pdf'}->{'Info'}->{$k} = PDFUtf($opt{$k} || 'NONE');
}
#elsif (is_utf8($opt{$k}) || utf8::valid($opt{$k})) {
# $self->{'pdf'}->{'Info'}->{$k} = PDFUtf($opt{$k} || 'NONE');
#}
else {
$self->{'pdf'}->{'Info'}->{$k} = PDFStr($opt{$k} || 'NONE');
}
$self->{'pdf'}->{'Info'}->{$k} = PDFStr($opt{$k} || 'NONE');
}
$self->{'pdf'}->out_obj($self->{'pdf'}->{'Info'});
}
Expand Down
9 changes: 1 addition & 8 deletions lib/PDF/API2/Annotation.pm
Original file line number Diff line number Diff line change
Expand Up @@ -232,14 +232,7 @@ sub content
{
my ($self,@t)=@_;
my $t=join("\n",@t);
if(is_utf8($t) || utf8::valid($t))
{
$self->{Contents}=PDFUtf($t);
}
else
{
$self->{Contents}=PDFStr($t);
}
$self->{Contents}=PDFStr($t);
return($self);
}

Expand Down
10 changes: 5 additions & 5 deletions lib/PDF/API2/Basic/PDF/String.pm
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,14 @@ sub as_pdf {
my ($self) = @_;
my $str = $self->{'val'};

if ($self->{' isutf'}) {
$str = join('', map { sprintf('%04X' , $_) } unpack('U*', $str) );
return "<FEFF$str>";
}
elsif ($self->{' ishex'}) { # imported as hex ?
if ($self->{' ishex'}) { # imported as hex ?
$str = unpack('H*', $str);
return "<$str>";
}
elsif ($self->{' isutf'} or (utf8::is_utf8($str) and $str =~ /[^[:ascii:]]/)) {
$str = join('', map { sprintf('%04X' , $_) } unpack('U*', $str) );
return "<FEFF$str>";
}
else {
if ($str =~ m/[^\n\r\t\b\f\040-\176\200-\377]/oi) {
$str =~ s/(.)/sprintf('%02X', ord($1))/oge;
Expand Down
15 changes: 3 additions & 12 deletions lib/PDF/API2/Basic/PDF/Utils.pm
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ sub PDFStr {
return PDF::API2::Basic::PDF::String->new(@_);
}

# Deprecated: retained only so that existing callers keep working.
# Forwards every argument unchanged to PDFStr and returns its result.
sub PDFUtf {
    return PDFStr(@_);
}

=head2 PDFStrHex
Creates a hex-string via PDF::API2::Basic::PDF::String->new
Expand All @@ -125,16 +128,4 @@ sub PDFStrHex {
return $string;
}

=head2 PDFUtf
Creates a utf8-string via PDF::API2::Basic::PDF::String->new
=cut

sub PDFUtf {
    # Build the string object from the caller's arguments, passed through
    # unchanged to the String constructor.
    my $utf_string = PDF::API2::Basic::PDF::String->new(@_);

    # Flag the object as UTF; the String class's as_pdf checks this key and,
    # when it is set, writes the value as a <FEFF...> hex string.
    $utf_string->{' isutf'} = 1;

    return $utf_string;
}

1;
7 changes: 2 additions & 5 deletions t/annotate.t
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,9 @@ my $page = $pdf->page();
my $annotation = $page->annotation();
$annotation->text('This is an annotation', -rect => [ 72, 144, 172, 244 ]);

# Note: Annotation currently uses UTF-8 whenever possible, which is
# why the Contents section doesn't just have the simple text. I think
# it would be better to only use UTF-8 when necessary.
my $string = $pdf->stringify();
like($string,
qr{/Annot /Subtype /Text /Border \[ 0 0 0 \] /Contents <FEFF005400680069007300200069007300200061006E00200061006E006E006F0074006100740069006F006E> /Rect \[ 72 144 172 244 \]},
qr{/Annot /Subtype /Text /Border \[ 0 0 0 \] /Contents \(This is an annotation\) /Rect \[ 72 144 172 244 \]},
q{Text Annotation in a rectangle});

# [RT #118352] Crash if $page->annotation is called on a page with an
Expand All @@ -42,5 +39,5 @@ $annotation->text('This is an annotation', -rect => [ 72, 144, 172, 244 ]);

$string = $pdf->stringify();
like($string,
qr{/Annot /Subtype /Text /Border \[ 0 0 0 \] /Contents <FEFF005400680069007300200069007300200061006E00200061006E006E006F0074006100740069006F006E> /Rect \[ 72 144 172 244 \]},
qr{/Annot /Subtype /Text /Border \[ 0 0 0 \] /Contents \(This is an annotation\) /Rect \[ 72 144 172 244 \]},
q{Add an annotation to an existing annotations array stored in an indirect object});
10 changes: 9 additions & 1 deletion t/string.t
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use Test::More tests => 40;
use Test::More tests => 41;

use strict;
use warnings;
use utf8;

use PDF::API2::Basic::PDF::String;

Expand Down Expand Up @@ -188,6 +189,13 @@ is($string->val(),
q{Escape Character: 1-digit octal});


use PDF::API2::Basic::PDF::Utils;

# No explicit flag is set on the string object here: the non-ASCII literal
# alone must be enough for as_pdf to pick the hex form. The expected value is
# the FEFF byte-order mark followed by U+03A0, U+0394 and U+03A6 as four hex
# digits each, i.e. UTF-16BE.
$string = PDFStr('ΠΔΦ');
is($string->as_pdf(),
'<FEFF03A0039403A6>',
q{A string with the utf8 flag set is automatically encoded as UTF-16BE});


# RT 63918
$string = PDF::API2::Basic::PDF::String->from_pdf('(3\000f' . "\x5c\x5c" . '3\000f)');
is($string->val(),
Expand Down

0 comments on commit 8faf345

Please sign in to comment.