Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #3796, #3786: Implement UTF-8 support in java.util.zip classes #3814

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
98 changes: 98 additions & 0 deletions javalib/src/main/scala/java/util/zip/ZipByteConversions.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package java.util.zip

import java.nio.charset.{Charset, StandardCharsets}

private[zip] object ZipByteConversions {

/* This is an attempt to consolidate and describe zip Charset conversion
* complexity in one place.
*
* One can not simplify the underlying frothing sea of zip complexity,
* especially as practiced in the wild, but _can_ try to reduce the
* Scala Native complexity riding loosely on top. The former are
* 'zip features'; the latter are bugs.
*
* See URL:
* https://en.wikipedia.org/wiki/ZIP_(file_format)#History
*
* The original Harmony code base comment:
* The actual character set is "IBM Code Page 437". As of
* Sep 2006, the Zip spec (APPNOTE.TXT) supports UTF-8. When
* bit 11 of the GP flags field is set, the file name and
* comment fields are UTF-8.
*
* "IBM Code Page 437" is also known as "Code Page 437" and/or
* "MS-DOS CP437".
*
* CP437 is not one of the Java StandardCharsets, so
* StandardCharsets.ISO_8859_1 (a.k.a Latin_1) is often used instead in
* order to convert all 8 bits of single bytes to Java UTF-16 Strings.
*
* CP437 is described as the "specified" (i.e. it may not actually be
* described in the spec) code page. Its limitations lead people to
* use either its later relative CP1252 (Latin-1 for Windows) or
* the local character set used by the operating system.
* Wild West, East, North, South, and probably Outer Space.
*
*
* The convention here is that the caller passes in Zip general purpose
* flag bits and a Charset to use if Bit 11 is clear/not_set. If that
* bit is set, then StandardCharsets.UTF_8 is used.
*
* The Charset passed in is probably, but is not required to be, the
* Charset constructor argument of the caller.
*
* Some remaining complexity (non-exhaustive):
*
* *) The author has seen one report that macOS uses UTF-8 for the name,
* archive comment, and entry comment coding but DOES NOT set
* the UTF-8 bit.
*
* If true, that is an Apple "feature" and a future evolution of these
* methods will need to change to accommodate that feature.
*
* *) Where is my emoji?
*
* Not all recent Unicode codepoints, such as the latest emoji,
* may be available.
*
* Scala Native currently (2024-03) uses Unicode version 13.0.
* Unicode 15.1 was released in September, 2023.
*
* In theory, attempting to convert codepoints defined after
* Unicode 13.0 should throw an Exception. How strict is the
* Scala Native conversion code?
*/

/* Zip general purpose flag bit 11 (hex 0x800, decimal 2048). When it is
 * set, the name and comment fields are encoded as UTF-8.
 */
final val UTF8_ENABLED_MASK = 0x800

/* Select the Charset for converting zip name/comment bytes.
 *
 * flagBits       - the zip general purpose bit flag of the entry.
 * defaultCharset - the Charset to use when bit 11 is clear.
 *
 * Returns StandardCharsets.UTF_8 when bit 11 is set, otherwise the
 * defaultCharset argument exactly as it was passed in.
 */
def getCharset(flagBits: Short, defaultCharset: Charset): Charset = {
  // The mask is a single bit, so a non-zero result means that bit is set.
  val utf8BitIsSet = (flagBits & UTF8_ENABLED_MASK) != 0

  if (utf8BitIsSet) StandardCharsets.UTF_8
  else defaultCharset
}

/* zipGpBitFlag arguments contain the zip general purpose bit flag bits,
* which are found at (decimal) offset:
* 6 bytes in the Local file header (LOCSIG "PK\3\4")
* 8 bytes in the Central directory header (CENSIG "PK\1\2")
*/

/* Decode zip name or comment bytes into a String.
 *
 * rawBytes       - the encoded bytes; may be null or empty.
 * zipGpBitFlag   - general purpose bit flag used to select the Charset.
 * defaultCharset - the Charset handed to getCharset() as the fallback.
 *
 * Returns the empty String for null or zero-length input, never null.
 */
def bytesToString(
    rawBytes: Array[Byte],
    zipGpBitFlag: Short,
    defaultCharset: Charset
): String = {
  val hasContent = (rawBytes != null) && (rawBytes.length > 0)

  if (!hasContent) ""
  else {
    val cs = getCharset(zipGpBitFlag, defaultCharset)
    new String(rawBytes, cs)
  }
}

/* Encode a String into zip name or comment bytes.
 *
 * str            - the text to encode; may be null.
 * zipGpBitFlag   - general purpose bit flag used to select the Charset.
 * defaultCharset - the Charset handed to getCharset() as the fallback.
 *
 * Returns a freshly allocated zero-length array for a null String,
 * never null.
 */
def bytesFromString(
    str: String,
    zipGpBitFlag: Short,
    defaultCharset: Charset
): Array[Byte] = {
  if (str != null) {
    val cs = getCharset(zipGpBitFlag, defaultCharset)
    str.getBytes(cs)
  } else {
    new Array[Byte](0)
  }
}
}
75 changes: 37 additions & 38 deletions javalib/src/main/scala/java/util/zip/ZipEntry.scala
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@ import java.io.{
UnsupportedEncodingException
}

import java.nio.charset.Charset

import scala.scalanative.posix.time._
import scala.scalanative.posix.timeOps.tmOps

import scala.scalanative.unsafe._

class ZipEntry private (
private[zip] var name: String,
private[zip] val name: String, // immutable for safety
private[zip] var comment: String,
private[zip] var compressedSize: Long,
private[zip] var crc: Long,
Expand All @@ -24,13 +26,12 @@ class ZipEntry private (
private[zip] var time: Int,
private[zip] var modDate: Int,
private[zip] var extra: Array[Byte],
private[zip] var nameLen: Int,
private[zip] var mLocalHeaderRelOffset: Long
) extends ZipConstants
with Cloneable {

def this(name: String) =
this(name, null, -1L, -1L, -1L, -1, -1, -1, null, -1, -1L)
this(name, null, -1L, -1L, -1L, -1, -1, -1, null, -1L)

def this(e: ZipEntry) =
this(
Expand All @@ -43,7 +44,6 @@ class ZipEntry private (
e.time,
e.modDate,
e.extra,
e.nameLen,
e.mLocalHeaderRelOffset
)

Expand Down Expand Up @@ -103,6 +103,14 @@ class ZipEntry private (
name.charAt(name.length - 1) == '/'

def setComment(string: String): Unit = {
/* This length is a count of Java UTF-16 characters. It is
* accurate for Strings which contain characters < 128 but may
* not be for greater values.
*
* Depending on the charset given to ZipOutputStream, its conversion
* to bytes may generate more than lengthLimit bytes, resulting in
* truncation that is not obvious or tested here.
*/
val lengthLimit = 0xffff
comment =
if (string == null || string.length() <= lengthLimit) string
Expand Down Expand Up @@ -204,18 +212,20 @@ object ZipEntry extends ZipConstants {
final val DEFLATED = 8
final val STORED = 0

private def myReadFully(in: InputStream, b: Array[Byte]): Unit = {
private[zip] def myReadFully(in: InputStream, b: Array[Byte]): Array[Byte] = {
var len = b.length
var off = 0

while (len > 0) {
val count = in.read(b, off, len)
if (count <= 0) {
if (count <= 0)
throw new EOFException()
}

off += count
len -= count
}

b
}

private[zip] def readIntLE(raf: RandomAccessFile): Long = {
Expand All @@ -233,10 +243,10 @@ object ZipEntry extends ZipConstants {

private[zip] def fromInputStream(
ler: LittleEndianReader,
in: InputStream
in: InputStream,
defaultCharset: Charset
): ZipEntry = {
val hdrBuf = ler.hdrBuf
myReadFully(in, hdrBuf)
val hdrBuf = myReadFully(in, ler.hdrBuf)

val sig =
((hdrBuf(0) & 0xff) | ((hdrBuf(1) & 0xff) << 8) |
Expand All @@ -246,6 +256,7 @@ object ZipEntry extends ZipConstants {
throw new ZipException("Central Directory Entry not found")
}

val gpBitFlag = ((hdrBuf(8) & 0xff) | ((hdrBuf(9) & 0xff) << 8)).toShort
val compressionMethod = (hdrBuf(10) & 0xff) | ((hdrBuf(11) & 0xff) << 8)
val time = (hdrBuf(12) & 0xff) | ((hdrBuf(13) & 0xff) << 8)
val modDate = (hdrBuf(14) & 0xff) | ((hdrBuf(15) & 0xff) << 8)
Expand All @@ -261,48 +272,37 @@ object ZipEntry extends ZipConstants {
(hdrBuf(24) & 0xff) | ((hdrBuf(25) & 0xff) << 8) | ((hdrBuf(
26
) & 0xff) << 16) | ((hdrBuf(27) << 24) & 0xffffffffL)

val nameLen = (hdrBuf(28) & 0xff) | ((hdrBuf(29) & 0xff) << 8)
val extraLen = (hdrBuf(30) & 0xff) | ((hdrBuf(31) & 0xff) << 8)
val commentLen = (hdrBuf(32) & 0xff) | ((hdrBuf(33) & 0xff) << 8)

val mLocalHeaderRelOffset =
(hdrBuf(42) & 0xff) | ((hdrBuf(43) & 0xff) << 8) | ((hdrBuf(
44
) & 0xff) << 16) | ((hdrBuf(45) << 24) & 0xffffffffL)

val nameBytes = new Array[Byte](nameLen)
myReadFully(in, nameBytes)
val nameBytes = myReadFully(in, new Array[Byte](nameLen))

val extra =
if (extraLen > 0) {
val extra = new Array[Byte](extraLen)
myReadFully(in, extra)
extra
} else {
null
}
if (extraLen <= 0) null
else myReadFully(in, new Array[Byte](extraLen))

val commentBytes =
if (commentLen > 0) {
val commentBytes = new Array[Byte](commentLen)
myReadFully(in, commentBytes)
commentBytes
} else {
null
}
if (commentLen <= 0) null
else myReadFully(in, new Array[Byte](commentLen))

try {
/*
* The actual character set is "IBM Code Page 437". As of
* Sep 2006, the Zip spec (APPNOTE.TXT) supports UTF-8. When
* bit 11 of the GP flags field is set, the file name and
* comment fields are UTF-8.
*
* TODO: add correct UTF-8 support.
*/
val name = new String(nameBytes, "iso-8859-1")
val name =
ZipByteConversions.bytesToString(nameBytes, gpBitFlag, defaultCharset)

val comment =
if (commentBytes != null) new String(commentBytes, "iso-8859-1")
else null
ZipByteConversions.bytesToString(
commentBytes,
gpBitFlag,
defaultCharset
)

new ZipEntry(
name,
comment,
Expand All @@ -313,7 +313,6 @@ object ZipEntry extends ZipConstants {
time,
modDate,
extra,
nameLen,
mLocalHeaderRelOffset
)
} catch {
Expand Down
57 changes: 42 additions & 15 deletions javalib/src/main/scala/java/util/zip/ZipFile.scala
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
package java.util.zip

// Ported from Apache Harmony
// Ported from Apache Harmony. Extensively changed for Scala Native.

import java.nio.charset.{Charset, StandardCharsets}
import java.io.{
BufferedInputStream,
Closeable,
Expand All @@ -11,9 +10,10 @@ import java.io.{
RandomAccessFile
}

import java.util.Enumeration
import java.nio.charset.{Charset, StandardCharsets}

import java.{util => ju}
import java.util.Enumeration
import java.util.{stream => jus}

class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
Expand All @@ -24,6 +24,8 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
def this(name: String, charset: Charset) = this(new File(name), charset)
def this(name: String) = this(name, StandardCharsets.UTF_8)

var archiveComment: String = null

private final val fileName: String = file.getPath()

if (mode != ZipFile.OPEN_READ &&
Expand Down Expand Up @@ -116,6 +118,11 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
}
}

def getComment(): String = {
checkNotClosed()
archiveComment
}

def getEntry(entryName: String): ZipEntry = {
checkNotClosed()
if (entryName == null)
Expand Down Expand Up @@ -149,8 +156,10 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
val rafstrm =
new ZipFile.RAFStream(raf, entry.mLocalHeaderRelOffset + 28)
val localExtraLenOrWhatever = ler.readShortLE(rafstrm)

// Skip the name and this "extra" data or whatever it is:
rafstrm.skip(entry.nameLen + localExtraLenOrWhatever)
rafstrm.skip(entry.name.length() + localExtraLenOrWhatever)

rafstrm.mLength = rafstrm.mOffset + entry.compressedSize
if (entry.compressionMethod == ZipEntry.DEFLATED) {
val bufSize = Math.max(1024, Math.min(entry.getSize(), 65535L).toInt)
Expand Down Expand Up @@ -223,19 +232,24 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
/*centralDirSize =*/
ler.readIntLE(bin)
val centralDirOffset = ler.readIntLE(bin)
/*commentLen =*/
ler.readShortLE(bin)

if (numEntries != totalNumEntries || diskNumber != 0 || diskWithCentralDir != 0) {
val archiveCommentLen = ler.readShortLE(bin)
val archiveCommentBytes = new Array[Byte](archiveCommentLen)

ZipEntry.myReadFully(bin, archiveCommentBytes)
archiveComment = new String(archiveCommentBytes, charset)

if (numEntries != totalNumEntries || diskNumber != 0
|| diskWithCentralDir != 0) {
throw new ZipException("spanned archves not supported")
}

/*
* Seek to the first CDE and read all entries.
* However, when Z_SYNC_FLUSH is used the offset may not point directly
* to the CDE so skip over until we find it.
* At most it will be 6 bytes away (one or two bytes for empty block, 4 bytes for
* empty block signature).
* At most it will be 6 bytes away (one or two bytes for empty block,
* 4 bytes for empty block signature).
*/
scanOffset = centralDirOffset
stopOffset = scanOffset + 6
Expand All @@ -253,18 +267,31 @@ class ZipFile(file: File, mode: Int, charset: Charset) extends Closeable {
}
}

rafs.close()
bin.close()

// Also, should probably explicitly close both of the new ones here
// after they are done. Done reasonably right, that means some
// try/finally blocks. Does the rafs and/or bin need to hang around
// for later I/O and _not_ be closed? Think a future "getEntry()" call.
// Study this well, do not "just hack it".

// If CDE is found then go and read all the entries
rafs = new ZipFile.RAFStream(mRaf, scanOffset)
bin = new BufferedInputStream(rafs, 4096)

var i = 0
while (i < numEntries) {
val newEntry = ZipEntry.fromInputStream(ler, bin)
mEntries.put(newEntry.getName(), newEntry)
i += 1
try {
var i = 0
while (i < numEntries) {
val newEntry = ZipEntry.fromInputStream(ler, bin, charset)
mEntries.put(newEntry.getName(), newEntry)
i += 1
}
} finally {
bin.close()
rafs.close()
}
}

}

object ZipFile extends ZipConstants {
Expand Down