Skip to content

Commit

Permalink
Fix #3796, #3786: Implement UTF-8 support in java.util.zip classes (#…
Browse files Browse the repository at this point in the history
…3814)

* Fix #3798, #3786: Implement UTF-8 support in java.util.zip classes
* Supply the missing reference .zip
* javalib `java.util.zip` classes now support writing and reading UTF-8 ("Unicode Transformation Format – 8-bit")
   entry names and archive and entry comments.
* `java.util.zip.ZipOutputStream` now follows the JVM practice of not throwing an Exception if zero entries
    are written. The former behavior was sensible, but not the JVM way.
* both now use standard `java.lang.String` methods to do Charset conversions.  In particular, this
   should now handle 4-byte UTF-8 codepoints.
  • Loading branch information
LeeTibbert committed Mar 7, 2024
1 parent 8be6975 commit 3c5c8d4
Show file tree
Hide file tree
Showing 8 changed files with 540 additions and 189 deletions.
98 changes: 98 additions & 0 deletions javalib/src/main/scala/java/util/zip/ZipByteConversions.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package java.util.zip

import java.nio.charset.{Charset, StandardCharsets}

private[zip] object ZipByteConversions {

  /* This is an attempt to consolidate and describe zip Charset conversion
   * complexity in one place.
   *
   * One can not simplify the underlying frothing sea of zip complexity,
   * especially as practiced in the wild, but _can_ try to reduce the
   * Scala Native complexity riding loosely on top. The former are
   * 'zip features'; the latter are bugs.
   *
   * See URL:
   *   https://en.wikipedia.org/wiki/ZIP_(file_format)#History
   *
   * The original Harmony code base comment:
   *   The actual character set is "IBM Code Page 437". As of
   *   Sep 2006, the Zip spec (APPNOTE.TXT) supports UTF-8. When
   *   bit 11 of the GP flags field is set, the file name and
   *   comment fields are UTF-8.
   *
   * "IBM Code Page 437" is also known as "Code Page 437" and/or
   * "MS-DOS CP437".
   *
   * CP437 is not one of the Java StandardCharsets, so
   * StandardCharsets.ISO_8859_1 (a.k.a Latin_1) is often used instead in
   * order to convert all 8 bits of single bytes to Java UTF-16 Strings.
   *
   * CP437 is described as the "specified" (i.e. it may not actually be
   * described in the spec) code page. Its limitations lead people to
   * use either its later relative CP1252 (Latin-1 for Windows) or
   * the local character set used by the operating system.
   * Wild West, East, North, South, and probably Outer Space.
   *
   *
   * The convention here is that the caller passes in the Zip general
   * purpose flag bits and a Charset to use if Bit 11 is clear/not_set.
   * If that bit is set, then StandardCharsets.UTF_8 is used.
   *
   * The Charset passed in is probably, but is not required to be, the
   * Charset constructor argument of the caller.
   *
   * Some remaining complexity (non-exhaustive):
   *
   *   *) The author has seen one report that macOS uses UTF-8 for the
   *      name, archive comment, and entry comment coding but DOES NOT set
   *      the UTF-8 bit.
   *
   *      If true, that is an Apple "feature" and a future evolution of
   *      these methods will need to change to accommodate that feature.
   *
   *   *) Where is my emoji?
   *
   *      Not all recent Unicode codepoints, such as the latest emoji,
   *      may be available.
   *
   *      Scala Native currently (2024-03) uses Unicode version 13.0.
   *      Unicode 15.1 was released in September, 2023.
   *
   *      In theory, attempting to convert codepoints defined after
   *      Unicode 13.0 should throw an Exception. How strict is the
   *      Scala Native conversion code?
   */

  // General purpose flag bit 11 (decimal 2048): name & comments are UTF-8.
  final val UTF8_ENABLED_MASK = 0x800

  /* Select the Charset for an entry: UTF-8 when bit 11 of flagBits is set,
   * otherwise the Charset supplied by the caller.
   */
  def getCharset(flagBits: Short, defaultCharset: Charset): Charset = {
    // The mask is a single bit, so a non-zero result means the bit is set.
    val utf8Requested = (flagBits & UTF8_ENABLED_MASK) != 0

    if (utf8Requested) StandardCharsets.UTF_8
    else defaultCharset
  }

  /* zipGPBitFlag arguments contain the zip general purpose bit flag bits
   * found at (decimal) offset:
   *   6 bytes in the Local file header (LOCSIG "PK\3\4")
   *   8 bytes in the Central directory header (CENSIG "PK\1\2")
   */

  /* Decode rawBytes using the Charset chosen by getCharset().
   * A null or zero-length array yields the empty String.
   */
  def bytesToString(
      rawBytes: Array[Byte],
      zipGpBitFlag: Short,
      defaultCharset: Charset
  ): String = {
    val hasContent = (rawBytes != null) && (rawBytes.length > 0)

    if (hasContent)
      new String(rawBytes, getCharset(zipGpBitFlag, defaultCharset))
    else ""
  }

  /* Encode str using the Charset chosen by getCharset().
   * A null String yields a newly allocated zero-length array.
   */
  def bytesFromString(
      str: String,
      zipGpBitFlag: Short,
      defaultCharset: Charset
  ): Array[Byte] = {
    if (str != null)
      str.getBytes(getCharset(zipGpBitFlag, defaultCharset))
    else new Array[Byte](0)
  }
}
75 changes: 37 additions & 38 deletions javalib/src/main/scala/java/util/zip/ZipEntry.scala
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@ import java.io.{
UnsupportedEncodingException
}

import java.nio.charset.Charset

import scala.scalanative.posix.time._
import scala.scalanative.posix.timeOps.tmOps

import scala.scalanative.unsafe._

class ZipEntry private (
private[zip] var name: String,
private[zip] val name: String, // immutable for safety
private[zip] var comment: String,
private[zip] var compressedSize: Long,
private[zip] var crc: Long,
Expand All @@ -24,13 +26,12 @@ class ZipEntry private (
private[zip] var time: Int,
private[zip] var modDate: Int,
private[zip] var extra: Array[Byte],
private[zip] var nameLen: Int,
private[zip] var mLocalHeaderRelOffset: Long
) extends ZipConstants
with Cloneable {

def this(name: String) =
this(name, null, -1L, -1L, -1L, -1, -1, -1, null, -1, -1L)
this(name, null, -1L, -1L, -1L, -1, -1, -1, null, -1L)

def this(e: ZipEntry) =
this(
Expand All @@ -43,7 +44,6 @@ class ZipEntry private (
e.time,
e.modDate,
e.extra,
e.nameLen,
e.mLocalHeaderRelOffset
)

Expand Down Expand Up @@ -103,6 +103,14 @@ class ZipEntry private (
name.charAt(name.length - 1) == '/'

def setComment(string: String): Unit = {
/* This length is a count of Java UTF-16 characters. It is
* accurate for Strings which contain characters < 128 but may
* not be for greater values.
*
* Depending on the charset given to ZipOutputStream, its conversion
* to bytes may generate more than lengthLimit bytes, resulting in
* truncation that is not obvious or tested here.
*/
val lengthLimit = 0xffff
comment =
if (string == null || string.length() <= lengthLimit) string
Expand Down Expand Up @@ -204,18 +212,20 @@ object ZipEntry extends ZipConstants {
final val DEFLATED = 8
final val STORED = 0

private def myReadFully(in: InputStream, b: Array[Byte]): Unit = {
private[zip] def myReadFully(in: InputStream, b: Array[Byte]): Array[Byte] = {
var len = b.length
var off = 0

while (len > 0) {
val count = in.read(b, off, len)
if (count <= 0) {
if (count <= 0)
throw new EOFException()
}

off += count
len -= count
}

b
}

private[zip] def readIntLE(raf: RandomAccessFile): Long = {
Expand All @@ -233,10 +243,10 @@ object ZipEntry extends ZipConstants {

private[zip] def fromInputStream(
ler: LittleEndianReader,
in: InputStream
in: InputStream,
defaultCharset: Charset
): ZipEntry = {
val hdrBuf = ler.hdrBuf
myReadFully(in, hdrBuf)
val hdrBuf = myReadFully(in, ler.hdrBuf)

val sig =
((hdrBuf(0) & 0xff) | ((hdrBuf(1) & 0xff) << 8) |
Expand All @@ -246,6 +256,7 @@ object ZipEntry extends ZipConstants {
throw new ZipException("Central Directory Entry not found")
}

val gpBitFlag = ((hdrBuf(8) & 0xff) | ((hdrBuf(9) & 0xff) << 8)).toShort
val compressionMethod = (hdrBuf(10) & 0xff) | ((hdrBuf(11) & 0xff) << 8)
val time = (hdrBuf(12) & 0xff) | ((hdrBuf(13) & 0xff) << 8)
val modDate = (hdrBuf(14) & 0xff) | ((hdrBuf(15) & 0xff) << 8)
Expand All @@ -261,48 +272,37 @@ object ZipEntry extends ZipConstants {
(hdrBuf(24) & 0xff) | ((hdrBuf(25) & 0xff) << 8) | ((hdrBuf(
26
) & 0xff) << 16) | ((hdrBuf(27) << 24) & 0xffffffffL)

val nameLen = (hdrBuf(28) & 0xff) | ((hdrBuf(29) & 0xff) << 8)
val extraLen = (hdrBuf(30) & 0xff) | ((hdrBuf(31) & 0xff) << 8)
val commentLen = (hdrBuf(32) & 0xff) | ((hdrBuf(33) & 0xff) << 8)

val mLocalHeaderRelOffset =
(hdrBuf(42) & 0xff) | ((hdrBuf(43) & 0xff) << 8) | ((hdrBuf(
44
) & 0xff) << 16) | ((hdrBuf(45) << 24) & 0xffffffffL)

val nameBytes = new Array[Byte](nameLen)
myReadFully(in, nameBytes)
val nameBytes = myReadFully(in, new Array[Byte](nameLen))

val extra =
if (extraLen > 0) {
val extra = new Array[Byte](extraLen)
myReadFully(in, extra)
extra
} else {
null
}
if (extraLen <= 0) null
else myReadFully(in, new Array[Byte](extraLen))

val commentBytes =
if (commentLen > 0) {
val commentBytes = new Array[Byte](commentLen)
myReadFully(in, commentBytes)
commentBytes
} else {
null
}
if (commentLen <= 0) null
else myReadFully(in, new Array[Byte](commentLen))

try {
/*
* The actual character set is "IBM Code Page 437". As of
* Sep 2006, the Zip spec (APPNOTE.TXT) supports UTF-8. When
* bit 11 of the GP flags field is set, the file name and
* comment fields are UTF-8.
*
* TODO: add correct UTF-8 support.
*/
val name = new String(nameBytes, "iso-8859-1")
val name =
ZipByteConversions.bytesToString(nameBytes, gpBitFlag, defaultCharset)

val comment =
if (commentBytes != null) new String(commentBytes, "iso-8859-1")
else null
ZipByteConversions.bytesToString(
commentBytes,
gpBitFlag,
defaultCharset
)

new ZipEntry(
name,
comment,
Expand All @@ -313,7 +313,6 @@ object ZipEntry extends ZipConstants {
time,
modDate,
extra,
nameLen,
mLocalHeaderRelOffset
)
} catch {
Expand Down
57 changes: 42 additions & 15 deletions javalib/src/main/scala/java/util/zip/ZipFile.scala
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
package java.util.zip

// Ported from Apache Harmony
// Ported from Apache Harmony. Extensively changed for Scala Native.

import java.nio.charset.{Charset, StandardCharsets}
import java.io.{
BufferedInputStream,
Closeable,
Expand All @@ -11,9 +10,10 @@ import java.io.{
RandomAccessFile
}

import java.util.Enumeration
import java.nio.charset.{Charset, StandardCharsets}

import java.{util => ju}
import java.util.Enumeration
import java.util.{stream => jus}

class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
Expand All @@ -24,6 +24,8 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
def this(name: String, charset: Charset) = this(new File(name), charset)
def this(name: String) = this(name, StandardCharsets.UTF_8)

var archiveComment: String = null

private final val fileName: String = file.getPath()

if (mode != ZipFile.OPEN_READ &&
Expand Down Expand Up @@ -116,6 +118,11 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
}
}

/* Returns the archive-level comment held in `archiveComment`.
 *
 * `archiveComment` is filled in elsewhere in this class from the archive
 * comment bytes, decoded with the `charset` constructor argument.
 *
 * `checkNotClosed()` runs first. NOTE(review): presumably it throws when
 * this ZipFile has already been closed; its definition is not visible
 * here, so confirm against that method.
 */
def getComment(): String = {
checkNotClosed()
archiveComment
}

def getEntry(entryName: String): ZipEntry = {
checkNotClosed()
if (entryName == null)
Expand Down Expand Up @@ -149,8 +156,10 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
val rafstrm =
new ZipFile.RAFStream(raf, entry.mLocalHeaderRelOffset + 28)
val localExtraLenOrWhatever = ler.readShortLE(rafstrm)

// Skip the name and this "extra" data or whatever it is:
rafstrm.skip(entry.nameLen + localExtraLenOrWhatever)
rafstrm.skip(entry.name.length() + localExtraLenOrWhatever)

rafstrm.mLength = rafstrm.mOffset + entry.compressedSize
if (entry.compressionMethod == ZipEntry.DEFLATED) {
val bufSize = Math.max(1024, Math.min(entry.getSize(), 65535L).toInt)
Expand Down Expand Up @@ -223,19 +232,24 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
/*centralDirSize =*/
ler.readIntLE(bin)
val centralDirOffset = ler.readIntLE(bin)
/*commentLen =*/
ler.readShortLE(bin)

if (numEntries != totalNumEntries || diskNumber != 0 || diskWithCentralDir != 0) {
val archiveCommentLen = ler.readShortLE(bin)
val archiveCommentBytes = new Array[Byte](archiveCommentLen)

ZipEntry.myReadFully(bin, archiveCommentBytes)
archiveComment = new String(archiveCommentBytes, charset)

if (numEntries != totalNumEntries || diskNumber != 0
|| diskWithCentralDir != 0) {
throw new ZipException("spanned archves not supported")
}

/*
* Seek to the first CDE and read all entries.
* However, when Z_SYNC_FLUSH is used the offset may not point directly
* to the CDE so skip over until we find it.
* At most it will be 6 bytes away (one or two bytes for empty block, 4 bytes for
* empty block signature).
* At most it will be 6 bytes away (one or two bytes for empty block,
* 4 bytes for empty block signature).
*/
scanOffset = centralDirOffset
stopOffset = scanOffset + 6
Expand All @@ -253,18 +267,31 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
}
}

rafs.close()
bin.close()

// Also, should probably explicitly close both of the new ones here
// after they are done. Done reasonably right, that means some
// try/finally blocks. Does the rafs and/or bin need to hang around
// for later I/O and _not_ be closed? Think a future "getEntry()" call.
// Study this well, do not "just hack it".

// If CDE is found then go and read all the entries
rafs = new ZipFile.RAFStream(mRaf, scanOffset)
bin = new BufferedInputStream(rafs, 4096)

var i = 0
while (i < numEntries) {
val newEntry = ZipEntry.fromInputStream(ler, bin)
mEntries.put(newEntry.getName(), newEntry)
i += 1
try {
var i = 0
while (i < numEntries) {
val newEntry = ZipEntry.fromInputStream(ler, bin, charset)
mEntries.put(newEntry.getName(), newEntry)
i += 1
}
} finally {
bin.close()
rafs.close()
}
}

}

object ZipFile extends ZipConstants {
Expand Down

0 comments on commit 3c5c8d4

Please sign in to comment.