From e12937d97f7827f597d6433f673c402603dac4ed Mon Sep 17 00:00:00 2001 From: samalloing Date: Wed, 29 Apr 2020 10:14:25 +0200 Subject: [PATCH] Proposed fix for #277: print the Destination value instead of the Java object, and report error PDF_HUL_149 instead of PDF_HUL_122 --- .../hul/ois/jhove/module/PdfModule.java | 8 ++++-- .../hul/ois/jhove/module/pdf/Literal.java | 28 ++++++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/PdfModule.java b/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/PdfModule.java index d05095e25..3d98b3091 100644 --- a/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/PdfModule.java +++ b/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/PdfModule.java @@ -53,6 +53,7 @@ import edu.harvard.hul.ois.jhove.IdentifierType; import edu.harvard.hul.ois.jhove.InfoMessage; import edu.harvard.hul.ois.jhove.InternalSignature; +import edu.harvard.hul.ois.jhove.Message; import edu.harvard.hul.ois.jhove.Module; import edu.harvard.hul.ois.jhove.ModuleBase; import edu.harvard.hul.ois.jhove.NisoImageMetadata; @@ -3350,7 +3351,7 @@ protected void addDestination(PdfObject itemObj, String propName, // Encryption messes up name trees if (!_encrypted) { int pageObjNum = resolveIndirectDest( - dest.getIndirectDest()); + dest.getIndirectDest(), info); if (pageObjNum == -1) { // The scope of the reference is outside this // file, so we just report it as such. 
@@ -4042,7 +4043,7 @@ protected Property buildOutlineItemProperty(PdfDictionary dict, Destination dest = new Destination(destObj, this, false); if (dest.isIndirect()) { itemList.add(new Property(PROP_NAME_DESTINATION, - PropertyType.STRING, dest.getIndirectDest())); + PropertyType.STRING, dest.getIndirectDest().getStringValue())); } else { int pageObjNum = dest.getPageDestObjNumber(); Integer destPg = _pageSeqMap.get(new Integer(pageObjNum)); @@ -4157,7 +4158,7 @@ protected boolean doOutlineStuff(RepInfo info) { * We return the page sequence number for the referenced page. * If we can't find a match for the reference, we return -1. */ - protected int resolveIndirectDest(PdfSimpleObject key) throws PdfException { + protected int resolveIndirectDest(PdfSimpleObject key, RepInfo info) throws PdfException { if (key == null) { throw new IllegalArgumentException("Argument key can not be null"); } @@ -4173,6 +4174,7 @@ protected int resolveIndirectDest(PdfSimpleObject key) throws PdfException { key.getStringValue()); JhoveMessage message = JhoveMessages.getMessageInstance( MessageConstants.PDF_HUL_149.getId(), mess); + info.setMessage(new ErrorMessage(message)); throw new PdfInvalidException(message); // PDF-HUL-149 // OR if this is not considered invalid // return -1; diff --git a/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/pdf/Literal.java b/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/pdf/Literal.java index fe8d38b63..4adceb003 100644 --- a/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/pdf/Literal.java +++ b/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/pdf/Literal.java @@ -152,7 +152,7 @@ public long processLiteral (Tokenizer tok) throws IOException _rawBytes = new Vector<> (32); _state = State.LITERAL; - long offset = 0; + long offset = 0; for (;;) { ch = tok.readChar (); // If we get -1, then we've hit an EOF without proper termination of @@ -162,7 +162,6 @@ public long 
processLiteral (Tokenizer tok) throws IOException } offset++; _rawBytes.add (ch); - if (_state == State.LITERAL) { // We are still in a state of flux, determining the encoding if (ch == FE) { @@ -280,17 +279,18 @@ else if (_state == (State.LITERAL_UTF16_2)) { readUTFLanguageCode (tok); } else { - /* It turns out that a backslash may be double-byte, - * rather than the assumed single.byte. The following - * allows for this. Suggested by Justin Litman, Library - * of Congress, 2006-03-17. - */ - if (utfch == BACKSLASH) { - utfch = readBackslashSequence (false, tok); - if (utfch == 0) { - continue; /* Invalid character, ignore. */ - } - } + /* It turns out that a backslash may be double-byte, + * rather than the assumed single.byte. The following + * allows for this. Suggested by Justin Litman, Library + * of Congress, 2006-03-17. + */ + if (utfch == BACKSLASH) { + utfch = readBackslashSequence (false, tok); + if (utfch == 0) { + _state = State.LITERAL_UTF16_2; // skip the wrong char and reset to previous state + continue; /* Invalid character, ignore. */ + } + } buffer.append ((char) utfch); } } @@ -631,6 +631,8 @@ private int readBackslashSequence (boolean utf16, Tokenizer tok) return LF; case 0X72: // r return CR; + case 0xd: // this is an error for CR + return 0; case 0X74: // t return HT; case 0X68: // h