Skip to content

Commit

Permalink
Documentation and cleanups around the pickle encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
lrytz committed Jan 15, 2018
1 parent 9334c20 commit 77737b9
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 21 deletions.
44 changes: 29 additions & 15 deletions src/compiler/scala/tools/nsc/backend/jvm/BCodeHelpers.scala
Expand Up @@ -1034,21 +1034,34 @@ object BCodeHelpers {
}

/**
* Contains helpers around converting a Scala signature (array of bytes) into an array of `Long`.
* Details about the storage format of pickles at the bytecode level (classfile annotations) can be found in SIP-10.
* Helpers for encoding a Scala signature (array of bytes) into a String or, if too large, an
* array of Strings.
*
* The encoding is as described in [[scala.reflect.internal.pickling.ByteCodecs]]. However, the
* special encoding of 0x00 as 0xC0 0x80 is not done here, as the resulting String(s) are passed
* as annotation argument to ASM, which will perform this step.
*/
class ScalaSigBytes(bytes: Array[Byte]) {
final class ScalaSigBytes(bytes: Array[Byte]) {
import scala.reflect.internal.pickling.ByteCodecs

override def toString = (bytes map { byte => (byte & 0xff).toHexString }).mkString("[ ", " ", " ]")
lazy val sevenBitsMayBeZero: Array[Byte] = {
mapToNextModSevenBits(scala.reflect.internal.pickling.ByteCodecs.encode8to7(bytes))
}

/**
* The data in `bytes` mapped to 7-bit bytes and then each element incremented by 1 (modulo 0x80).
* This implements parts of the encoding documented in [[ByteCodecs]]. 0x00 values are NOT
* mapped to the overlong encoding (0xC0 0x80) but left as-is.
* When creating a String from this array and writing it to a classfile as annotation argument
* using ASM, the ASM library will replace 0x00 values by the overlong encoding. So the data in
* the classfile will have the format documented in [[ByteCodecs]].
*/
lazy val sevenBitsMayBeZero: Array[Byte] = mapToNextModSevenBits(ByteCodecs.encode8to7(bytes))

private def mapToNextModSevenBits(src: Array[Byte]): Array[Byte] = {
var i = 0
val srclen = src.length
while (i < srclen) {
val in = src(i)
src(i) = (if (in == 0x7f) 0.toByte else (in + 1).toByte)
src(i) = if (in == 0x7f) 0.toByte else (in + 1).toByte
i += 1
}
src
Expand All @@ -1068,15 +1081,12 @@ object BCodeHelpers {
if (sevenBitsMayBeZero(i) == 0) numZeros += 1
i += 1
}

(sevenBitsMayBeZero.length + numZeros) <= 65535
}
def strEncode: String = {
val ca = ubytesToCharArray(sevenBitsMayBeZero)
new java.lang.String(ca)
}

final def arrEncode: Array[String] = {
def strEncode: String = new java.lang.String(ubytesToCharArray(sevenBitsMayBeZero))

def arrEncode: Array[String] = {
var strs: List[String] = Nil
val bSeven: Array[Byte] = sevenBitsMayBeZero
// chop into slices of at most 65535 bytes, counting 0x00 as taking two bytes (as per JVMS 4.4.7 The CONSTANT_Utf8_info Structure)
Expand All @@ -1085,7 +1095,7 @@ object BCodeHelpers {
var encLength = 0
while (offset < bSeven.length) {
val deltaEncLength = if (bSeven(offset) == 0) 2 else 1
val newEncLength = encLength.toLong + deltaEncLength
val newEncLength = encLength + deltaEncLength
if (newEncLength >= 65535) {
val ba = bSeven.slice(prevOffset, offset)
strs ::= new java.lang.String(ubytesToCharArray(ba))
Expand All @@ -1105,13 +1115,17 @@ object BCodeHelpers {
strs.reverse.toArray
}

/**
* Maps an array of bytes 1:1 to an array of characters, asserting that each byte is a 7-bit value.
* Since every value is in the 7-bit (ASCII) range, no charset is required.
*/
private def ubytesToCharArray(bytes: Array[Byte]): Array[Char] = {
val ca = new Array[Char](bytes.length)
var idx = 0
while(idx < bytes.length) {
val b: Byte = bytes(idx)
assert((b & ~0x7f) == 0)
ca(idx) = b.asInstanceOf[Char]
ca(idx) = b.toChar
idx += 1
}
ca
Expand Down
Expand Up @@ -323,19 +323,28 @@ abstract class ClassfileParser {
arr
}

def getBytes(index: Int): Array[Byte] = (
/**
* Get an array of bytes stored in the classfile as a string. The data is encoded in the format
* described in object [[ByteCodecs]]. Used for the ScalaSignature annotation argument.
*/
def getBytes(index: Int): Array[Byte] = {
if (index <= 0 || len <= index) errorBadIndex(index)
else values(index) match {
case xs: Array[Byte] => xs
case _ =>
case _ =>
val start = firstExpecting(index, CONSTANT_UTF8)
val len = (in getChar start).toInt
val len = (in getChar start).toInt
val bytes = new Array[Byte](len)
System.arraycopy(in.buf, start + 2, bytes, 0, len)
recordAtIndex(getSubArray(bytes), index)
}
)
}

/**
* Get an array of bytes stored in the classfile as an array of strings. The data is encoded in
* the format described in object [[ByteCodecs]]. Used for the ScalaLongSignature annotation
* argument.
*/
def getBytes(indices: List[Int]): Array[Byte] = {
val head = indices.head
values(head) match {
Expand All @@ -345,7 +354,8 @@ abstract class ClassfileParser {
if (index <= 0 || ConstantPool.this.len <= index) errorBadIndex(index)
val start = firstExpecting(index, CONSTANT_UTF8)
val len = (in getChar start).toInt
in.buf drop start + 2 take len
val s = start + 2
in.buf.slice(s, s + len)
}
recordAtIndex(getSubArray(arr), head)
}
Expand Down
24 changes: 23 additions & 1 deletion src/reflect/scala/reflect/internal/pickling/ByteCodecs.scala
Expand Up @@ -8,8 +8,25 @@
package scala
package reflect.internal.pickling

/**
* Helper methods to serialize a byte array as String that can be written as "modified" UTF-8
* to classfiles.
*
* Modified UTF-8 is the same as UTF-8, except for 0x00, which is represented as the "overlong"
* 0xC0 0x80. Constant strings in classfiles use this encoding.
*
* Encoding (according to SID-10):
* - The 8-bit bytes are split into 7-bit bytes, e.g., 0xff 0x0f becomes 0x7f 0x1f 0x00
* - Every byte is incremented by 1 (modulo 0x80); in the example we get 0x00 0x20 0x01
* - 0x00 is mapped to the overlong encoding, so we get 0xC0 0x80 0x20 0x01
*
* The +1 increment should reduce the number of (overlong) zeros in the resulting string, as
* 0x7f is (hoped to be) more common than 0x00.
*/
object ByteCodecs {

/**
* Increment each element by 1 (modulo 0x80), then map 0x00 to 0xC0 0x80. Returns a fresh array.
*/
def avoidZero(src: Array[Byte]): Array[Byte] = {
var i = 0
val srclen = src.length
Expand All @@ -36,6 +53,9 @@ object ByteCodecs {
dst
}

/**
* Map 0xC0 0x80 to 0x00, then subtract 1 from each element (modulo 0x80). In-place.
*/
def regenerateZero(src: Array[Byte]): Int = {
var i = 0
val srclen = src.length
Expand All @@ -57,6 +77,7 @@ object ByteCodecs {
j
}

/** Returns a new array */
def encode8to7(src: Array[Byte]): Array[Byte] = {
val srclen = src.length
val dstlen = (srclen * 8 + 6) / 7
Expand Down Expand Up @@ -123,6 +144,7 @@ object ByteCodecs {
dst
}

/** In-place */
def decode7to8(src: Array[Byte], srclen: Int): Int = {
var i = 0
var j = 0
Expand Down

0 comments on commit 77737b9

Please sign in to comment.