diff --git a/runtime/io/common/src/aws/smithy/kotlin/runtime/io/SdkByteReadChannel.kt b/runtime/io/common/src/aws/smithy/kotlin/runtime/io/SdkByteReadChannel.kt index a67a2fab3f..4fd2f2a93c 100644 --- a/runtime/io/common/src/aws/smithy/kotlin/runtime/io/SdkByteReadChannel.kt +++ b/runtime/io/common/src/aws/smithy/kotlin/runtime/io/SdkByteReadChannel.kt @@ -4,6 +4,7 @@ */ package aws.smithy.kotlin.runtime.io +import aws.smithy.kotlin.runtime.util.text.byteCountUtf8 import io.ktor.utils.io.* import io.ktor.utils.io.core.* @@ -160,3 +161,33 @@ internal suspend fun SdkByteReadChannel.readAvailableFallback(dest: SdkByteBuffe dest.writeFully(tmp) return tmp.size.toLong() } + +/** + * Reads a UTF-8 code point from the channel. Returns `null` if closed + */ +suspend fun SdkByteReadChannel.readUtf8CodePoint(): Int? { + awaitContent() + if (availableForRead == 0 && isClosedForRead) return null + + val firstByte = readByte() + val cnt = byteCountUtf8(firstByte) + var code = when (cnt) { + 1 -> firstByte.toInt() + 2 -> firstByte.toInt() and 0x1f + 3 -> firstByte.toInt() and 0x0f + 4 -> firstByte.toInt() and 0x07 + else -> throw IllegalStateException("Invalid UTF-8 start sequence: $firstByte") + } + + for (i in 1 until cnt) { + awaitContent() + if (availableForRead == 0 && isClosedForRead) throw IllegalStateException("unexpected EOF: expected ${cnt - i} bytes") + val byte = readByte() + val bint = byte.toInt() + if (bint and 0xc0 != 0x80) throw IllegalStateException("invalid UTF-8 successor byte: $byte") + + code = (code shl 6) or (bint and 0x3f) + } + + return code +} diff --git a/runtime/io/common/test/aws/smithy/kotlin/runtime/io/SdkByteChannelOpsTest.kt b/runtime/io/common/test/aws/smithy/kotlin/runtime/io/SdkByteChannelOpsTest.kt index 7878c15647..02bdb30a26 100644 --- a/runtime/io/common/test/aws/smithy/kotlin/runtime/io/SdkByteChannelOpsTest.kt +++ b/runtime/io/common/test/aws/smithy/kotlin/runtime/io/SdkByteChannelOpsTest.kt @@ -131,4 +131,36 @@ class 
SdkByteChannelOpsTest { yield() assertFalse(awaitingContent) } + + @Test + fun testReadUtf8Chars() = runSuspendTest { + val chan = SdkByteReadChannel("hello".encodeToByteArray()) + assertEquals('h', chan.readUtf8CodePoint()?.toChar()) + assertEquals('e', chan.readUtf8CodePoint()?.toChar()) + assertEquals('l', chan.readUtf8CodePoint()?.toChar()) + assertEquals('l', chan.readUtf8CodePoint()?.toChar()) + assertEquals('o', chan.readUtf8CodePoint()?.toChar()) + assertNull(chan.readUtf8CodePoint()) + } + + @Test + fun testReadMultibyteUtf8Chars(): Unit = runSuspendTest { + // https://www.fileformat.info/info/unicode/char/1d122/index.htm + // $ - 1 byte, cent sign - 2bytes, euro sign - 3 bytes, musical clef - 4 points (surrogate pair) + val content = "$¢€\uD834\uDD22" + val chan = SdkByteReadChannel(content.encodeToByteArray()) + + val expected = listOf( + 36, // $ + 162, // ¢ + 8364, // € + 119074 // musical F clef + ) + + expected.forEachIndexed { i, exp -> + val code = chan.readUtf8CodePoint() + assertEquals(exp, code, "[i=$i] expected $exp, found $code ") + } + assertNull(chan.readUtf8CodePoint()) + } } diff --git a/runtime/serde/build.gradle.kts b/runtime/serde/build.gradle.kts index 08a60f6144..c80c61e688 100644 --- a/runtime/serde/build.gradle.kts +++ b/runtime/serde/build.gradle.kts @@ -19,7 +19,7 @@ kotlin { } } -subprojects { +allprojects { kotlin { sourceSets { commonTest { diff --git a/runtime/serde/serde-json/TESTING.md b/runtime/serde/serde-json/TESTING.md new file mode 100644 index 0000000000..7b74398ac5 --- /dev/null +++ b/runtime/serde/serde-json/TESTING.md @@ -0,0 +1,178 @@ +How to run JSONTestSuite against serde-json deserialize +======================================================== + +When making changes to the lexer it is a good idea to run the +changes against the [JSONTestSuite](https://github.com/nst/JSONTestSuite) and manually examine the test results. + +### How to setup the JSONTestSuite + +1. 
Clone the [JSONTestSuite](https://github.com/nst/JSONTestSuite) repository. +2. In `JSONTestSuite/parsers`, create a new Gradle JVM application project named `test_smithy_kotlin`. +3. Add the following `build.gradle.kts` file: + +```kotlin +plugins { + kotlin("jvm") version "1.5.30" + application + id("com.github.johnrengelman.shadow") version "7.0.0" +} + +application { + mainClass.set("aws.smithy.kotlin.jsontest.MainKt") +} + +allprojects { + repositories { + mavenLocal() + mavenCentral() + } +} + + +// NOTE: set to whatever locally published version you are working on +val smithyKotlinVersion: String = "0.4.1-kmp-json" +dependencies { + implementation(kotlin("stdlib")) + implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.5.0") + implementation("aws.smithy.kotlin:serde-json:$smithyKotlinVersion") + implementation("aws.smithy.kotlin:utils:$smithyKotlinVersion") +} + +tasks.jar { + manifest { + attributes["Main-Class"] = "aws.smithy.kotlin.jsontest.MainKt" + } +} +``` + +4. Add the following code to `src/main/kotlin/Main.kt`: + +```kotlin +package aws.smithy.kotlin.jsontest + +import kotlinx.coroutines.runBlocking +import kotlin.system.exitProcess +import java.io.IOException +import java.nio.file.Files +import java.nio.file.Paths +import aws.smithy.kotlin.runtime.serde.json.JsonToken +import aws.smithy.kotlin.runtime.serde.json.jsonStreamReader +import aws.smithy.kotlin.runtime.util.InternalApi + + +@OptIn(InternalApi::class) +suspend fun isValidJson(bytes: ByteArray):Boolean { + val lexer = jsonStreamReader(bytes) + println(lexer::class.qualifiedName) + return try { + val tokens = mutableListOf<JsonToken>() + do { + val token = lexer.nextToken() + tokens.add(token) + }while(token != JsonToken.EndDocument) + + // The test suite includes incomplete objects and arrays (e.g. "[null,") + // These are completely valid for this parser since it's just a tokenizer + // and doesn't attempt to make semantic meaning from the input.
+ // We'll just pretend to fail to satisfy the test suite + val pruned = if (tokens.last() == JsonToken.EndDocument) tokens.dropLast(1) else tokens + if (pruned.first() == JsonToken.BeginArray && pruned.last() != JsonToken.EndArray) { + return false + } + if (pruned.first() == JsonToken.BeginObject && pruned.last() != JsonToken.EndObject) { + return false + } + + tokens.isNotEmpty() + }catch(ex: Exception) { + println(ex) + false + } +} + +fun main(args: Array<String>): Unit = runBlocking { + if(args.isEmpty()) { + println("Usage: java TestJSONParsing file.json") + exitProcess(2) + } + + try { + val data = Files.readAllBytes(Paths.get(args[0])) + if(isValidJson(data)) { + println("valid"); + exitProcess(0); + } + println("invalid"); + exitProcess(1); + } catch (ex: IOException) { + println(ex) + println("not found"); + exitProcess(2); + } +} +``` + +5. Compile this program with `./gradlew build`. + NOTE: Be sure to publish all of `smithy-kotlin` "runtime" to maven local. It is helpful to just choose a unique version + to be sure that everything is wired up correctly. +6. Modify `JSONTestSuite/run_tests.py` so that the `programs` dictionary only contains this one entry: + +``` +programs = { + "SmithyKotlin": + { + "url":"", + "commands":["java" , "-jar", os.path.join(PARSERS_DIR, "test_smithy_kotlin/build/libs/test_smithy_kotlin-all.jar")] + } +} +``` + +7. Run `run_tests.py` and examine the output with a web browser by opening `JSONTestSuite/results/parsing.html`. + +### Examining the results + +When looking at `JSONTestSuite/results/parsing.html`, there is a matrix of test cases against their +results with a legend at the top. + +Any test result marked with blue or light blue is for a test case where correct behavior isn't specified, +so use your best judgement to decide if it should have succeeded or failed. + +The other colors are bad and should be carefully examined.
At the time of writing, the following test cases +succeed when they should fail, and we intentionally left it that way since we're not currently concerned +about being more lenient in the number parsing: + +``` + +n_number_-01.json [-01] +n_number_-2..json [-2.] +n_number_.2e-3.json [.2e-3] +n_number_0.3e+.json [0.3e+] +n_number_0.3e.json [0.3e] +n_number_0.e1.json [0.e1] +n_number_0_capital_E+.json [0E+] +n_number_0_capital_E.json [0E] +n_number_0e+.json [0e+] +n_number_0e.json [0e] +n_number_1.0e+.json [1.0e+] +n_number_1.0e-.json [1.0e-] +n_number_1.0e.json [1.0e] +n_number_2.e+3.json [2.e+3] +n_number_2.e-3.json [2.e-3] +n_number_2.e3.json [2.e3] +n_number_9.e+.json [9.e+] +n_number_neg_int_starting_with_zero.json [-012] +n_number_neg_real_without_int_part.json [-.123] +n_number_real_without_fractional_part.json [1.] +n_number_starting_with_dot.json [.123] +n_number_with_leading_zero.json [012] +``` + + + +These test cases succeed with our parser, and that's OK since we're +a token streaming parser (multiple values are allowed): +``` +n_array_just_minus.json [-] +n_structure_double_array.json [][] +n_structure_whitespace_formfeed.json [0C] <=> [ ] +``` \ No newline at end of file diff --git a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonDeserializer.kt b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonDeserializer.kt index ff04691fab..67d95d4428 100644 --- a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonDeserializer.kt +++ b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonDeserializer.kt @@ -7,7 +7,7 @@ package aws.smithy.kotlin.runtime.serde.json import aws.smithy.kotlin.runtime.serde.* /** - * Provides a deserialiser for JSON documents + * Provides a deserializer for JSON documents * * @param payload underlying document from which tokens are read */ @@ -65,11 +65,11 @@ class JsonDeserializer(payload: ByteArray) : Deserializer,
Deserializer.ElementI override suspend fun deserializeStruct(descriptor: SdkObjectDescriptor): Deserializer.FieldIterator = when (reader.peek()) { - RawJsonToken.BeginObject -> { + JsonToken.BeginObject -> { reader.nextTokenOf() JsonFieldIterator(reader, descriptor, this) } - RawJsonToken.Null -> JsonNullFieldIterator(this) + JsonToken.Null -> JsonNullFieldIterator(this) else -> throw DeserializationException("Unexpected token type ${reader.peek()}") } @@ -88,28 +88,28 @@ class JsonDeserializer(payload: ByteArray) : Deserializer, Deserializer.ElementI return token.value } - override suspend fun nextHasValue(): Boolean = reader.peek() != RawJsonToken.Null + override suspend fun nextHasValue(): Boolean = reader.peek() != JsonToken.Null override suspend fun hasNextEntry(): Boolean = when (reader.peek()) { - RawJsonToken.EndObject -> { + JsonToken.EndObject -> { // consume the token reader.nextTokenOf() false } - RawJsonToken.Null, - RawJsonToken.EndDocument -> false + JsonToken.Null, + JsonToken.EndDocument -> false else -> true } override suspend fun hasNextElement(): Boolean = when (reader.peek()) { - RawJsonToken.EndArray -> { + JsonToken.EndArray -> { // consume the token reader.nextTokenOf() false } - RawJsonToken.EndDocument -> false + JsonToken.EndDocument -> false else -> true } } @@ -131,13 +131,13 @@ private class JsonFieldIterator( override suspend fun findNextFieldIndex(): Int? 
{ val candidate = when (reader.peek()) { - RawJsonToken.EndObject -> { + JsonToken.EndObject -> { // consume the token reader.nextTokenOf() null } - RawJsonToken.EndDocument -> null - RawJsonToken.Null -> { + JsonToken.EndDocument -> null + JsonToken.Null -> { reader.nextTokenOf() null } @@ -151,7 +151,7 @@ private class JsonFieldIterator( if (candidate != null) { // found a field - if (reader.peek() == RawJsonToken.Null) { + if (reader.peek() == JsonToken.Null) { // skip explicit nulls reader.nextTokenOf() return findNextFieldIndex() @@ -166,17 +166,3 @@ private class JsonFieldIterator( reader.skipNext() } } - -// return the next token and require that it be of type [TExpected] or else throw an exception -private suspend inline fun JsonStreamReader.nextTokenOf(): TExpected { - val token = this.nextToken() - requireToken(token) - return token as TExpected -} - -// require that the given token be of type [TExpected] or else throw an exception -private inline fun requireToken(token: JsonToken) { - if (token::class != TExpected::class) { - throw DeserializationException("expected ${TExpected::class}; found ${token::class}") - } -} diff --git a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonEncoder.kt b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonEncoder.kt new file mode 100644 index 0000000000..ab6a663fc4 --- /dev/null +++ b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonEncoder.kt @@ -0,0 +1,152 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. 
+ */ + +package aws.smithy.kotlin.runtime.serde.json + +import aws.smithy.kotlin.runtime.util.* + +// character code points +private const val CP_QUOTATION = 0x22 +private const val CP_BACKSLASH = 0x5C +private const val CP_NEWLINE = 0x0A +private const val CP_CARRIAGE_RETURN = 0x0D +private const val CP_TAB = 0x09 +private const val CP_BACKSPACE = 0x08 +private const val CP_FORMFEED = 0x0C + +internal class JsonEncoder(private val pretty: Boolean = false) : JsonStreamWriter { + private val buffer = StringBuilder() + + override val bytes: ByteArray + get() = buffer.toString().encodeToByteArray() + + private val state: ListStack = mutableListOf(LexerState.Initial) + + private var depth: Int = 0 + + override fun beginObject() = openStructure("{", LexerState.ObjectFirstKeyOrEnd) + override fun endObject() = closeStructure("}", LexerState.ObjectFirstKeyOrEnd, LexerState.ObjectNextKeyOrEnd) + override fun beginArray() = openStructure("[", LexerState.ArrayFirstValueOrEnd) + override fun endArray() = closeStructure("]", LexerState.ArrayFirstValueOrEnd, LexerState.ArrayNextValueOrEnd) + + private fun openStructure(token: String, nextState: LexerState) { + encodeValue(token) + writeNewline() + depth++ + state.push(nextState) + } + + private fun closeStructure(token: String, vararg allowedStates: LexerState) { + writeNewline() + depth-- + writeIndent() + buffer.append(token) + val last = state.pop() + check(last in allowedStates) { "Invalid JSON encoder state $last; expected one of ${allowedStates.joinToString()}" } + } + + private fun writeIndent() { + if (pretty && depth > 0) { + val indent = " ".repeat(depth * 4) + buffer.append(indent) + } + } + + private fun writeNewline() { if (pretty) buffer.append('\n') } + private fun writeComma() { + buffer.append(",") + writeNewline() + } + + private fun writeColon() { + buffer.append(":") + if (pretty) buffer.append(" ") + } + + private fun encodeValue(value: String) { + when (state.top()) { + LexerState.ArrayFirstValueOrEnd -> { 
+ state.replaceTop(LexerState.ArrayNextValueOrEnd) + writeIndent() + } + LexerState.ArrayNextValueOrEnd -> { + writeComma() + writeIndent() + } + LexerState.ObjectFieldValue -> { + writeColon() + state.replaceTop(LexerState.ObjectNextKeyOrEnd) + } + else -> {} + } + + buffer.append(value) + } + + override fun writeNull() = encodeValue("null") + + private fun StringBuilder.appendQuoted(value: String) { + append("\"") + append(value) + append("\"") + } + + override fun writeName(name: String) { + if (state.top() == LexerState.ObjectNextKeyOrEnd) { + writeComma() + } + writeIndent() + buffer.appendQuoted(name.escape()) + state.replaceTop(LexerState.ObjectFieldValue) + } + + override fun writeValue(value: String) = encodeValue("\"${value.escape()}\"") + + override fun writeValue(bool: Boolean) = encodeValue(bool.toString()) + + private fun writeNumber(value: Number) = encodeValue(value.toString()) + + override fun writeValue(value: Byte) = writeNumber(value) + override fun writeValue(value: Long) = writeNumber(value) + override fun writeValue(value: Short) = writeNumber(value) + override fun writeValue(value: Int) = writeNumber(value) + override fun writeValue(value: Float) = writeNumber(value) + override fun writeValue(value: Double) = writeNumber(value) + + override fun writeRawValue(value: String) = encodeValue(value) +} + +internal fun String.escape(): String { + if (!any(Char::needsEscaped)) return this + + val str = this + + return buildString(length + 1) { + str.forEach { chr -> + when (chr.code) { + CP_QUOTATION -> append("\\\"") + CP_BACKSLASH -> append("\\\\") + CP_NEWLINE -> append("\\n") + CP_CARRIAGE_RETURN -> append("\\r") + CP_TAB -> append("\\t") + CP_BACKSPACE -> append("\\b") + CP_FORMFEED -> append("\\f") + in 0..0x1F -> { + val formatted = chr.code.toString(16) + append("\\u") + append(formatted.padStart(4, padChar = '0')) + } + else -> append(chr) + } + } + } +} + +private fun Char.needsEscaped(): Boolean = when (code) { + CP_QUOTATION, + 
CP_BACKSLASH, + in 0..0x1F -> true + else -> false +} diff --git a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonLexer.kt b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonLexer.kt new file mode 100644 index 0000000000..37d3e2519b --- /dev/null +++ b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonLexer.kt @@ -0,0 +1,443 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +package aws.smithy.kotlin.runtime.serde.json + +import aws.smithy.kotlin.runtime.serde.* +import aws.smithy.kotlin.runtime.util.* + +private val DIGITS = ('0'..'9').toSet() +private val EXP = setOf('e', 'E') +private val PLUS_MINUS = setOf('-', '+') + +private typealias StateStack = ListStack<LexerState> +private typealias StateMutation = (StateStack) -> Unit + +/** + * Manages internal lexer state + * + * The entire lexer works off peeking tokens. Only when nextToken() is called should state be mutated. + * State manager helps enforce this invariant. + */ +private data class StateManager( + private val state: StateStack = mutableListOf(LexerState.Initial), + private val pendingMutations: MutableList<StateMutation> = mutableListOf() +) { + + /** + * The size of the state stack + */ + val size: Int + get() = state.size + + /** + * Get the top of the state stack + */ + val current: LexerState + get() = state.top() + + /** + * Remove all pending mutations and run them to bring state up to date + */ + fun update() { + pendingMutations.forEach { it.invoke(state) } + pendingMutations.clear() + } + + /** + * Push a pending mutation + */ + fun mutate(mutation: StateMutation) { pendingMutations.add(mutation) } +} + +/** + * Tokenizes JSON documents + */ +internal class JsonLexer( + private val data: ByteArray +) : JsonStreamReader { + private var peeked: JsonToken?
= null + private val state = StateManager() + private var idx = 0 + + override suspend fun nextToken(): JsonToken { + val next = peek() + peeked = null + state.update() + return next + } + + override suspend fun peek(): JsonToken = peeked ?: doPeek().also { peeked = it } + + override suspend fun skipNext() { + val startDepth = state.size + nextToken() + while (state.size > startDepth) { + nextToken() + } + } + + private fun doPeek(): JsonToken = + try { + when (state.current) { + LexerState.Initial -> readToken() + LexerState.ArrayFirstValueOrEnd -> stateArrayFirstValueOrEnd() + LexerState.ArrayNextValueOrEnd -> stateArrayNextValueOrEnd() + LexerState.ObjectFirstKeyOrEnd -> stateObjectFirstKeyOrEnd() + LexerState.ObjectNextKeyOrEnd -> stateObjectNextKeyOrEnd() + LexerState.ObjectFieldValue -> stateObjectFieldValue() + } + } catch (ex: DeserializationException) { + throw ex + } catch (ex: Exception) { + throw DeserializationException(cause = ex) + } + + // handles the [State.ObjectFirstKeyOrEnd] state + private fun stateObjectFirstKeyOrEnd(): JsonToken = + when (val chr = nextNonWhitespace(peek = true)) { + '}' -> endObject() + '"' -> readName() + else -> unexpectedToken(chr, "\"", "}") + } + + // handles the [State.ObjectNextKeyOrEnd] state + private fun stateObjectNextKeyOrEnd(): JsonToken = + when (val chr = nextNonWhitespace(peek = true)) { + '}' -> endObject() + ',' -> { + consume(',') + nextNonWhitespace(peek = true) + readName() + } + else -> unexpectedToken(chr, ",", "}") + } + + // handles the [State.ObjectFieldValue] state + private fun stateObjectFieldValue(): JsonToken = + when (val chr = nextNonWhitespace(peek = true)) { + ':' -> { + consume(':') + state.mutate { it.replaceTop(LexerState.ObjectNextKeyOrEnd) } + readToken() + } + else -> unexpectedToken(chr, ":") + } + + // handles the [State.ArrayFirstValueOrEnd] state + private fun stateArrayFirstValueOrEnd(): JsonToken = + when (nextNonWhitespace(peek = true)) { + ']' -> endArray() + else -> { + 
state.mutate { it.replaceTop(LexerState.ArrayNextValueOrEnd) } + readToken() + } + } + + // handles the [LexerState.ArrayNextValueOrEnd] state + private fun stateArrayNextValueOrEnd(): JsonToken = + when (val chr = nextNonWhitespace(peek = true)) { + ']' -> endArray() + ',' -> { + consume(',') + readToken() + } + else -> unexpectedToken(chr, ",", "]") + } + + // discards the '{' character and pushes 'ObjectFirstKeyOrEnd' state + private fun startObject(): JsonToken { + consume('{') + state.mutate { it.push(LexerState.ObjectFirstKeyOrEnd) } + return JsonToken.BeginObject + } + + // discards the '}' character and pops the current state + private fun endObject(): JsonToken { + consume('}') + val top = state.current + lexerCheck(top == LexerState.ObjectFirstKeyOrEnd || top == LexerState.ObjectNextKeyOrEnd, idx - 1) { "Unexpected close `}` encountered" } + state.mutate { it.pop() } + return JsonToken.EndObject + } + + // discards the '[' and pushes 'ArrayFirstValueOrEnd' state + private fun startArray(): JsonToken { + consume('[') + state.mutate { it.push(LexerState.ArrayFirstValueOrEnd) } + return JsonToken.BeginArray + } + + // discards the ']' character and pops the current state + private fun endArray(): JsonToken { + consume(']') + val top = state.current + lexerCheck(top == LexerState.ArrayFirstValueOrEnd || top == LexerState.ArrayNextValueOrEnd, idx - 1) { "Unexpected close `]` encountered" } + state.mutate { it.pop() } + return JsonToken.EndArray + } + + // read an object key + private fun readName(): JsonToken { + val name = when (val chr = peekOrThrow()) { + '"' -> readQuoted() + else -> unexpectedToken(chr, "\"") + } + state.mutate { it.replaceTop(LexerState.ObjectFieldValue) } + return JsonToken.Name(name) + } + + // read the next token from the stream.
This is only invoked from state functions which guarantees + // the current state should be such that the next character is the start of a token + private fun readToken(): JsonToken = + when (val chr = nextNonWhitespace(peek = true)) { + '{' -> startObject() + '[' -> startArray() + '"' -> JsonToken.String(readQuoted()) + 't', 'f', 'n' -> readKeyword() + '-', in '0'..'9' -> readNumber() + null -> JsonToken.EndDocument + else -> unexpectedToken(chr, "{", "[", "\"", "null", "true", "false", "") + } + + /** + * Read based on the number spec : https://www.json.org/json-en.html + * [-]0-9[.[0-9]][[E|e][+|-]0-9] + */ + private fun readNumber(): JsonToken { + val value = buildString { + if (peekChar() == '-') { + append(nextOrThrow()) + } + readDigits(this) + if (peekChar() == '.') { + append(nextOrThrow()) + readDigits(this) + } + if (peekChar() in EXP) { + append(nextOrThrow()) + if (peekChar() in PLUS_MINUS) { + append(nextOrThrow()) + } + readDigits(this) + } + } + lexerCheck(value.isNotEmpty()) { "Invalid number, expected `-` || 0..9, found `${peekChar()}`" } + return JsonToken.Number(value) + } + + private fun readDigits(appendable: Appendable) { + while (peekChar() in DIGITS) { + appendable.append(nextOrThrow()) + } + } + + /** + * Read a quoted JSON string out of the stream + */ + private fun readQuoted(): String { + consume('"') + // read bytes until a non-escaped end-quote + val start = idx + var chr = peekOrThrow() + var needsUnescaped = false + while (chr != '"') { + // handle escapes + when (chr) { + '\\' -> { + needsUnescaped = true + // consume escape backslash + nextOrThrow() + when (val byte = nextOrThrow()) { + 'u' -> { + if (idx + 4 >= data.size) fail("Unexpected EOF reading escaped unicode string", idx) + idx += 4 + } + '\\', '/', '"', 'b', 'f', 'r', 'n', 't' -> { } // already consumed + else -> fail("Invalid escape character: `$byte`", idx - 1) + } + } + else -> { + if (chr.isControl()) fail("Unexpected control character: `$chr`") + idx++ + } + } + + 
chr = peekOrThrow() + } + + val value = data.decodeToString(start, idx) + consume('"') + return if (needsUnescaped) { + try { + value.unescape() + } catch (ex: Exception) { + // use offset of the start of the entire string (including starting quotation mark) + fail(ex.message ?: "Invalid escaped string", start - 1) + } + } else { + value + } + } + + private fun readKeyword(): JsonToken = when (val ch = peekOrThrow()) { + 't' -> readLiteral("true", JsonToken.Bool(true)) + 'f' -> readLiteral("false", JsonToken.Bool(false)) + 'n' -> readLiteral("null", JsonToken.Null) + else -> fail("Unable to handle keyword starting with '$ch'") + } + + private fun readLiteral(expectedString: String, token: JsonToken): JsonToken { + consume(expectedString) + return token + } + + /** + * Advance the cursor until next non-whitespace character is encountered + * @param peek Flag indicating if the next non-whitespace character should be consumed or peeked + */ + private fun nextNonWhitespace(peek: Boolean = false): Char? { + while (peekChar()?.isWhitespace() == true) { + idx++ + } + return if (peek) peekChar() else nextOrThrow() + } + + /** + * Invoke [consume] for each character in [expected] + */ + private fun consume(expected: String) = expected.forEach { consume(it) } + + /** + * Assert that the next character is [expected] and advance + */ + private fun consume(expected: Char) { + val chr = data[idx].toInt().toChar() + lexerCheck(chr == expected) { "Unexpected char `$chr` expected `$expected`" } + idx++ + } + + /** + * Return next byte to consume or null if EOF has been reached + */ + private fun peekByte(): Byte? = data.getOrNull(idx) + + /** + * Peek the next character or return null if EOF has been reached + * + * SAFETY: This assumes ASCII. This is safe because we _only_ use it for tokenization + * (e.g. {, }, [, ], , , etc). When reading object keys or string values [readQuoted] is + * used which handles UTF-8. 
Do not use these single char related functions to directly construct a string! + * + * NOTE: [readQuoted] uses [decodeToString] which is _MUCH_ faster (~3x) than decoding bytes as + * UTF-8 chars one by one on the fly. + */ + private fun peekChar(): Char? = peekByte()?.toInt()?.toChar() + + /** + * Peek the next character or throw if EOF has been reached + */ + private fun peekOrThrow(): Char = peekChar() ?: throw IllegalStateException("Unexpected EOF") + + /** + * Consume the next character and advance the index or throw if EOF has been reached + */ + private fun nextOrThrow(): Char = peekOrThrow().also { idx++ } + + private fun unexpectedToken(found: Char?, vararg expected: String): Nothing { + val pluralModifier = if (expected.size > 1) " one of" else "" + val formatted = expected.joinToString(separator = ", ") { "`$it`" } + fail("found `$found`, expected$pluralModifier $formatted") + } + + private fun fail(message: String, offset: Int = idx, cause: Throwable? = null): Nothing { + throw DeserializationException("Unexpected JSON token at offset $offset; $message", cause) + } + + private inline fun lexerCheck(value: Boolean, offset: Int = idx, lazyMessage: () -> Any) { + if (!value) { + val message = lazyMessage() + fail(message.toString(), offset) + } + } +} + +/** + * Unescape a JSON string (either object key or string value) + */ +private fun String.unescape(): String { + val str = this + return buildString(str.length + 1) { + var i = 0 + while (i < str.length) { + val chr = str[i] + when (chr) { + '\\' -> { + i++ // consume backslash + when (val byte = str[i++]) { + 'u' -> { + i += readEscapedUnicode(str, i, this) + } + '\\' -> append('\\') + '/' -> append('/') + '"' -> append('"') + 'b' -> append('\b') + 'f' -> append('\u000C') + 'r' -> append('\r') + 'n' -> append('\n') + 't' -> append('\t') + else -> throw DeserializationException("Invalid escape character: `$byte`") + } + } + else -> { + append(chr) + i++ + } + } + } + } +} + +/** + * Reads an escaped 
unicode code point from [s] starting at [start] offset. This assumes that '\u' has already + * been consumed and [start] is pointing to the first hex digit. If the code point represents a surrogate pair + * an additional escaped code point will be consumed from the string. + * @param s The string to decode from + * @param start The starting index to start reading from + * @param sb The string builder to append unescaped unicode characters to + * @return The number of characters consumed + * @throws IllegalStateException if the escape is truncated or a high surrogate is not followed by an escaped low surrogate + */ +private fun readEscapedUnicode(s: String, start: Int, sb: StringBuilder): Int { + // already consumed \u escape, take next 4 bytes as high + check(start + 4 <= s.length) { "Unexpected EOF reading escaped high surrogate" } + val high = s.substring(start, start + 4).decodeEscapedCodePoint() + var consumed = 4 + if (high.isHighSurrogate()) { + val lowStart = start + consumed + // fail with a clear message rather than relying on substring() to reject a truncated low surrogate + check(lowStart + 6 <= s.length) { "Unexpected EOF reading escaped low surrogate" } + val escapedLow = s.substring(lowStart, lowStart + 6) + check(escapedLow.startsWith("\\u")) { "Expected surrogate pair, found `$escapedLow`" } + val low = escapedLow.substring(2).decodeEscapedCodePoint() + check(low.isLowSurrogate()) { "Invalid surrogate pair: (${high.code}, ${low.code})" } + sb.append(high, low) + consumed += 6 + } else { + sb.append(high) + } + return consumed +} + +/** + * decode an escaped unicode character to an integer code point (e.g.
D801) + * the escape characters `\u` should be stripped from the input before calling + */ +private fun String.decodeEscapedCodePoint(): Char { + check(all { it in '0'..'9' || it in 'a'..'f' || it in 'A'..'F' }) { "Invalid unicode escape: `\\u$this`" } + return toInt(16).toChar() +} + +/** + * Test whether a character is a control character (ignoring SP and DEL) + */ +private fun Char.isControl(): Boolean = code in 0x00..0x1F diff --git a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamReader.kt b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamReader.kt index 5c6519a021..38c075fcc3 100644 --- a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamReader.kt +++ b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamReader.kt @@ -4,6 +4,7 @@ */ package aws.smithy.kotlin.runtime.serde.json +import aws.smithy.kotlin.runtime.serde.DeserializationException import aws.smithy.kotlin.runtime.util.InternalApi /** @@ -23,11 +24,25 @@ interface JsonStreamReader { /** * Peek at the next token type */ - suspend fun peek(): RawJsonToken + suspend fun peek(): JsonToken } /* * Creates a [JsonStreamReader] instance */ @InternalApi -expect fun jsonStreamReader(payload: ByteArray): JsonStreamReader +fun jsonStreamReader(payload: ByteArray): JsonStreamReader = JsonLexer(payload) + +// return the next token and require that it be of type [TExpected] or else throw an exception +internal suspend inline fun JsonStreamReader.nextTokenOf(): TExpected { + val token = this.nextToken() + requireToken(token) + return token as TExpected +} + +// require that the given token be of type [TExpected] or else throw an exception +internal inline fun requireToken(token: JsonToken) { + if (token::class != TExpected::class) { + throw DeserializationException("expected ${TExpected::class}; found ${token::class}") + } +} diff --git 
a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriter.kt b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriter.kt index 63777f86ce..aded9cede8 100644 --- a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriter.kt +++ b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriter.kt @@ -4,6 +4,8 @@ */ package aws.smithy.kotlin.runtime.serde.json +import aws.smithy.kotlin.runtime.util.InternalApi + /** * Interface for serialization. Specific formats should implement this interface according to their * own requirements. Currently only aws.smithy.kotlin.runtime.serde.json.JsonSerializer implements this interface. @@ -100,7 +102,8 @@ interface JsonStreamWriter { val bytes: ByteArray? } -/* -* Creates a [JsonStreamWriter] instance to write JSON -*/ -internal expect fun jsonStreamWriter(pretty: Boolean = false): JsonStreamWriter +/** + * Creates a [JsonStreamWriter] instance to write JSON + */ +@InternalApi +fun jsonStreamWriter(pretty: Boolean = false): JsonStreamWriter = JsonEncoder(pretty) diff --git a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonToken.kt b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonToken.kt index 61831b879d..ca274251e1 100644 --- a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonToken.kt +++ b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/JsonToken.kt @@ -73,59 +73,3 @@ sealed class JsonToken { EndDocument -> "EndDocument" } } - -/** - * Raw JSON token information without the associated token value - */ -enum class RawJsonToken { - /** - * The opening of a JSON array '[' - */ - BeginArray, - - /** - * The closing of a JSON array ']' - */ - EndArray, - - /** - * The opening of a JSON object '{' - */ - BeginObject, - - /** - * The closing of a JSON object '}' - */ - EndObject, - - /** - * A JSON 
property name - */ - Name, - - /** - * A JSON string - */ - String, - - /** - * A JSON number - */ - Number, - - /** - * A JSON boolean - */ - Bool, - - /** - * A JSON 'null' - */ - Null, - - /** - * The end of the JSON stream to signal that the JSON-encoded value has no more - * tokens - */ - EndDocument; -} diff --git a/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/LexerState.kt b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/LexerState.kt new file mode 100644 index 0000000000..8b0b1f5a65 --- /dev/null +++ b/runtime/serde/serde-json/common/src/aws/smithy/kotlin/runtime/serde/json/LexerState.kt @@ -0,0 +1,38 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +package aws.smithy.kotlin.runtime.serde.json + +internal enum class LexerState { + /** + * Entry point. Expecting any JSON value + */ + Initial, + + /** + * Expecting the next token to be the *first* value in an array, or the end of the array. + */ + ArrayFirstValueOrEnd, + + /** + * Expecting the next token to the next value in an array, or the end of the array. + */ + ArrayNextValueOrEnd, + + /** + * Expecting the next token to be the *first* key in the object, or the end of the object. + */ + ObjectFirstKeyOrEnd, + + /** + * Expecting the next token to the next object key, or the end of the object. + */ + ObjectNextKeyOrEnd, + + /** + * Expecting the next token to be the value of a field in an object. 
+ */ + ObjectFieldValue, +} diff --git a/runtime/serde/serde-json/common/test/aws/smithy/kotlin/runtime/serde/json/JsonStreamReaderTest.kt b/runtime/serde/serde-json/common/test/aws/smithy/kotlin/runtime/serde/json/JsonStreamReaderTest.kt index 5094602423..47e14a0799 100644 --- a/runtime/serde/serde-json/common/test/aws/smithy/kotlin/runtime/serde/json/JsonStreamReaderTest.kt +++ b/runtime/serde/serde-json/common/test/aws/smithy/kotlin/runtime/serde/json/JsonStreamReaderTest.kt @@ -4,57 +4,17 @@ */ package aws.smithy.kotlin.runtime.serde.json +import aws.smithy.kotlin.runtime.serde.DeserializationException import aws.smithy.kotlin.runtime.testing.runSuspendTest -import kotlin.test.Test -import kotlin.test.assertEquals +import io.kotest.matchers.collections.shouldContainExactly +import io.kotest.matchers.string.shouldContain +import kotlin.test.* -suspend fun JsonStreamReader.allTokens(): List { - val tokens = mutableListOf() - while (true) { - val token = nextToken() - tokens.add(token) - if (token is JsonToken.EndDocument) { - break - } - } - return tokens -} - -fun assertTokensAreEqual(expected: List, actual: List) { - assertEquals(expected.size, actual.size, "unbalanced tokens") - val pairs = expected.zip(actual) - pairs.forEach { (exp, act) -> - assertEquals(exp, act) - } -} - -@OptIn(ExperimentalStdlibApi::class) class JsonStreamReaderTest { - @Test - fun itDeserializesObjects() = runSuspendTest { - val payload = """ - { - "x": 1, - "y": "2" - } - """.trimIndent().encodeToByteArray() - val actual = jsonStreamReader(payload).allTokens() - val expected = listOf( - JsonToken.BeginObject, - JsonToken.Name("x"), - JsonToken.Number("1"), - JsonToken.Name("y"), - JsonToken.String("2"), - JsonToken.EndObject, - JsonToken.EndDocument - ) - assertTokensAreEqual(expected, actual) - } - - @Test - fun kitchenSink() = runSuspendTest { - val payload = """ + object KitchenSink { + // serialized JSON document + val payload: String = """ { "num": 1, "str": "string", @@ -70,9 
+30,10 @@ class JsonStreamReaderTest { }, "null": null } - """.trimIndent().encodeToByteArray() - val actual = jsonStreamReader(payload).allTokens() - val expected = listOf( + """.trimIndent() + + // expected tokens for "content" + val tokens: List = listOf( JsonToken.BeginObject, JsonToken.Name("num"), JsonToken.Number("1"), @@ -103,8 +64,147 @@ class JsonStreamReaderTest { JsonToken.EndObject, JsonToken.EndDocument ) + } + + @Test + fun itDeserializesObjects() = runSuspendTest { + // language=JSON + val actual = """ + { + "x": 1, + "y": "2" + } + """.allTokens() - assertTokensAreEqual(expected, actual) + actual.shouldContainExactly( + JsonToken.BeginObject, + JsonToken.Name("x"), + JsonToken.Number("1"), + JsonToken.Name("y"), + JsonToken.String("2"), + JsonToken.EndObject, + JsonToken.EndDocument + ) + } + + @Test + fun isDeserializesArrays() = runSuspendTest { + // language=JSON + val actual = """[ "hello", "world" ]""".allTokens() + + actual.shouldContainExactly( + JsonToken.BeginArray, + JsonToken.String("hello"), + JsonToken.String("world"), + JsonToken.EndArray, + JsonToken.EndDocument + ) + } + + @Test + fun itFailsOnUnclosedArrays(): Unit = runSuspendTest { + assertFailsWith { + """[ "hello", "world" """.allTokens() + }.message.shouldContain("expected one of `,`, `]`") + } + + @Test + fun itFailsOnNaN(): Unit = runSuspendTest { + assertFailsWith("Invalid number") { + // language=JSON + """[NaN]""".allTokens() + } + } + + @Test + fun itFailsOnMissingComma(): Unit = runSuspendTest { + assertFailsWith { + """[3[4]]""".allTokens() + }.message.shouldContain("Unexpected JSON token at offset 2; found `[`, expected one of `,`, `]`") + } + + @Test + fun itFailsOnTrailingComma(): Unit = runSuspendTest { + assertFailsWith { + """["",]""".allTokens() + }.message.shouldContain("Unexpected JSON token at offset 4; found `]`, expected one of `{`, `[`") + + assertFailsWith { + """{"foo":"bar",}""".allTokens() + }.message.shouldContain("Unexpected JSON token at offset 13; 
found `}`, expected `\"`") + } + + @Test + fun itDeserializesSingleScalarStrings() = runSuspendTest { + // language=JSON + val actual = "\"hello\"".allTokens() + actual.shouldContainExactly( + JsonToken.String("hello"), + JsonToken.EndDocument + ) + } + + @Test + fun itDeserializesSingleScalarNumbers() = runSuspendTest { + // language=JSON + val actual = "1.2".allTokens() + actual.shouldContainExactly( + JsonToken.Number("1.2"), + JsonToken.EndDocument + ) + } + + @Test + fun itCanHandleAllDataTypes() = runSuspendTest { + // language=JSON + val actual = """[ "hello", true, false, 1.0, 1, -34.234e3, null ]""".allTokens() + + actual.shouldContainExactly( + JsonToken.BeginArray, + JsonToken.String("hello"), + JsonToken.Bool(true), + JsonToken.Bool(false), + JsonToken.Number("1.0"), + JsonToken.Number("1"), + JsonToken.Number("-34.234e3"), + JsonToken.Null, + JsonToken.EndArray, + JsonToken.EndDocument + ) + } + + @Test + fun canHandleNesting() = runSuspendTest { + // language=JSON + val actual = """ + [ + "hello", + { + "foo": [ + 20, + true, + null + ], + "bar": "value" + } + ]""".allTokens() + + actual.shouldContainExactly( + JsonToken.BeginArray, + JsonToken.String("hello"), + JsonToken.BeginObject, + JsonToken.Name("foo"), + JsonToken.BeginArray, + JsonToken.Number("20"), + JsonToken.Bool(true), + JsonToken.Null, + JsonToken.EndArray, + JsonToken.Name("bar"), + JsonToken.String("value"), + JsonToken.EndObject, + JsonToken.EndArray, + JsonToken.EndDocument + ) } @Test @@ -123,8 +223,8 @@ class JsonStreamReaderTest { }, "y": 2 } - """.trimIndent().encodeToByteArray() - val reader = jsonStreamReader(payload) + """.trimIndent() + val reader = newReader(payload) // skip x reader.apply { nextToken() // begin obj @@ -140,6 +240,69 @@ class JsonStreamReaderTest { assertEquals("y", y.value) } + @Test + fun itSkipsValuesRecursivelyAfterPeek() = runSuspendTest { + val payload = """ + { + "x": 1, + "nested": { + "a": "a", + "unknown": { + "b": "b", + "c": ["d", "e", "f"], + 
"g": { + "h": "h", + "i": "i" + } + }, + "k": "k" + }, + "y": 2 + } + """.trimIndent() + val reader = newReader(payload) + // skip x + // BeginObj, x, value + repeat(3) { reader.nextToken() } + + val nested = reader.nextToken() as JsonToken.Name + assertEquals("nested", nested.value) + // BeginObj, a, value + repeat(3) { reader.nextToken() } + + val unknown = reader.nextToken() as JsonToken.Name + assertEquals("unknown", unknown.value) + // skip the entire unknown subtree + reader.skipNext() + reader.peek() + + val remaining = mutableListOf() + for (i in 0..6) { + remaining.add(reader.nextToken()) + } + + remaining.shouldContainExactly( + JsonToken.Name("k"), + JsonToken.String("k"), + JsonToken.EndObject, + JsonToken.Name("y"), + JsonToken.Number("2"), + JsonToken.EndObject, + JsonToken.EndDocument + ) + } + + @Test + fun testPeek() = runSuspendTest { + val reader = newReader(KitchenSink.payload) + KitchenSink.tokens.forEachIndexed { idx, expectedToken -> + repeat(2) { + assertEquals(expectedToken, reader.peek(), "[idx=$idx] unexpected peeked token") + } + assertEquals(expectedToken, reader.nextToken(), "[idx=$idx] unexpected next token") + } + } + @Test fun itSkipsSimpleValues() = runSuspendTest { val payload = """ @@ -148,8 +311,8 @@ class JsonStreamReaderTest { "z": "unknown", "y": 2 } - """.trimIndent().encodeToByteArray() - val reader = jsonStreamReader(payload) + """.trimIndent() + val reader = newReader(payload) // skip x reader.apply { nextToken() // begin obj @@ -164,4 +327,163 @@ class JsonStreamReaderTest { val y = reader.nextToken() as JsonToken.Name assertEquals("y", y.value) } + + @Test + fun kitchenSink() = runSuspendTest { + val actual = KitchenSink.payload.allTokens() + actual.shouldContainExactly(KitchenSink.tokens) + } + + @Test + fun itHandlesEscapes() = runSuspendTest { + val tests = listOf( + """\"quote""" to "\"quote", + """\/forward-slash""" to "/forward-slash", + """\\back-slash""" to "\\back-slash", + """\bbackspace""" to "\bbackspace", + 
"""\fformfeed""" to "\u000cformfeed", + """\nlinefeed""" to "\nlinefeed", + """\rcarriage-return""" to "\rcarriage-return", + """\ttab""" to "\ttab", + // Unicode + ) + + tests.forEach { + val actual = """ + { + "foo": "${it.first}" + } + """.trimIndent().allTokens() + + actual.shouldContainExactly( + JsonToken.BeginObject, + JsonToken.Name("foo"), + JsonToken.String(it.second), + JsonToken.EndObject, + JsonToken.EndDocument + ) + } + } + + @Test + fun testUnescapeControls(): Unit = runSuspendTest { + assertEquals("\"test\"", """"\"test\""""".decodeJsonStringToken()) + assertEquals("foo\rbar", """"foo\rbar"""".decodeJsonStringToken()) + assertEquals("foo\r\n", """"foo\r\n"""".decodeJsonStringToken()) + assertEquals("\r\nbar", """"\r\nbar"""".decodeJsonStringToken()) + assertEquals("\bf\u000Co\to\r\n", """"\bf\fo\to\r\n"""".decodeJsonStringToken()) + } + + @Test + fun testUnicodeUnescape(): Unit = runSuspendTest { + assertFailsWith { + """"\uD801\nasdf"""".allTokens() + }.message.shouldContain("Expected surrogate pair") + + assertFailsWith { + """"\uD801\u00"""".allTokens() + }.message.shouldContain("Unexpected EOF") + + assertFailsWith { + """"\uD801\u+04D"""".allTokens() + }.message.shouldContain("Invalid unicode escape: `\\u+04D`") + + assertFailsWith { + """"\u00"""".allTokens() + }.message.shouldContain("Unexpected EOF") + + assertFailsWith { + """"\uD801\uC501"""".allTokens() + }.message.shouldContain("Invalid surrogate pair: (${0xD801}, ${0xC501})") + + assertFailsWith { + """"\zD801\uC501"""".allTokens() + }.message.shouldContain("Invalid escape character: `z`") + + assertEquals("\uD801\uDC37", """"\uD801\udc37"""".decodeJsonStringToken(), "surrogate pair") + assertEquals("\u0000", """"\u0000"""".decodeJsonStringToken()) + assertEquals("\u001f", """"\u001f"""".decodeJsonStringToken()) + } + + @Test + fun testUnescapedControlChars() = runSuspendTest { + assertFailsWith { + """["new +line"]""".allTokens() + }.message.shouldContain("Unexpected control 
character") + + assertFailsWith { + val tokens = """["foo tab"]""".trimIndent().allTokens() + println(tokens) + }.message.shouldContain("Unexpected control character") + + // whitespace should be fine + assertEquals("foo space", """"foo space"""".decodeJsonStringToken()) + // delete should be fine + assertEquals("\u007F", """""""".decodeJsonStringToken()) + } + + @Test + fun testUnicodeTokens() = runSuspendTest { + + val languages = listOf( + "こんにちは世界", + "مرحبا بالعالم", + "Привет, мир", + "Γειά σου Κόσμε", + "नमस्ते दुनिया", + "you have summoned ZA̡͊͠͝LGΌ" + ) + + languages.forEach { lang -> + val actual = """ + { + "foo": "$lang", + "$lang": "bar" + } + """.trimIndent().allTokens() + + actual.shouldContainExactly( + JsonToken.BeginObject, + JsonToken.Name("foo"), + JsonToken.String(lang), + JsonToken.Name(lang), + JsonToken.String("bar"), + JsonToken.EndObject, + JsonToken.EndDocument + ) + } + } +} + +private suspend fun String.decodeJsonStringToken(): String { + val reader = newReader(this) + val tokens = mutableListOf() + while (true) { + val token = reader.nextToken() + tokens.add(token) + if (token is JsonToken.EndDocument) { + break + } + } + + // element + end doc + assertEquals(2, tokens.size) + val token = tokens.first() + assertTrue(token is JsonToken.String) + return token.value +} + +private suspend fun String.allTokens(): List { + val reader = newReader(this) + val tokens = mutableListOf() + while (true) { + val token = reader.nextToken() + tokens.add(token) + if (token is JsonToken.EndDocument) { + return tokens + } + } } + +private fun newReader(contents: String): JsonStreamReader = JsonLexer(contents.encodeToByteArray()) diff --git a/runtime/serde/serde-json/common/test/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriterTest.kt b/runtime/serde/serde-json/common/test/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriterTest.kt index 02d4f3bd5d..db3c888205 100644 --- 
a/runtime/serde/serde-json/common/test/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriterTest.kt +++ b/runtime/serde/serde-json/common/test/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriterTest.kt @@ -6,142 +6,306 @@ package aws.smithy.kotlin.runtime.serde.json import kotlin.test.Test import kotlin.test.assertEquals +import kotlin.test.assertFailsWith @OptIn(ExperimentalStdlibApi::class) class JsonStreamWriterTest { - @Test - fun checkJsonSerializesCorrectlyWithWrapper() { - val msg1 = Message( - 912345678901, - "How do I stream JSON in Java?", - null, - user1 - ) - val msg2 = Message( - 912345678902, - "@json_newb just use JsonWriter!", - arrayOf(50.454722, -104.606667), - user2 - ) - assertEquals( - expected, - writeJsonStream( - listOf(msg1, msg2) - )?.decodeToString() - ) + fun testArrayOfObjects() { + // language=JSON + val expected = """[ + { + "id": "msg1", + "meta": { + "description": "first message" + }, + "nestedArray": [ + 1, + 2.2, + -3.3333 + ], + "integerId": 1 + }, + { + "id": "msg2", + "meta": { + "description": "second message" + }, + "nestedArray": [ + 4, + 5 + ], + "integerId": 2 } +]""" - @Test - fun checkCloseIsIdempotent() { - val writer = jsonStreamWriter(true) - writer.beginObject() - writer.writeName("id") - writer.writeValue(912345678901) - writer.endObject() - val expectedIdempotent = """{ - "id": 912345678901 -}""" - assertEquals(expectedIdempotent, writer.bytes?.decodeToString()) + val writer = jsonStreamWriter(true).apply { + beginArray() + + beginObject() + writeName("id") + writeValue("msg1") + writeName("meta") + beginObject() + writeName("description") + writeValue("first message") + endObject() + writeName("nestedArray") + beginArray() + writeValue(1) + writeValue(2.2f) + writeValue(-3.3333) + endArray() + writeName("integerId") + writeValue(1) + endObject() + + beginObject() + writeName("id") + writeValue("msg2") + writeName("meta") + beginObject() + writeName("description") + writeValue("second message") + endObject() + 
writeName("nestedArray") + beginArray() + writeValue(4) + writeValue(5) + endArray() + writeName("integerId") + writeValue(2) + endObject() + + endArray() + } + val actual = writer.bytes?.decodeToString() + + assertEquals(expected, actual) } @Test - fun checkNonHumanReadable() { + fun testObject() { val writer = jsonStreamWriter() writer.beginObject() writer.writeName("id") writer.writeValue(912345678901) writer.endObject() - val expectedNoIndent = """{"id":912345678901}""" - assertEquals(expectedNoIndent, writer.bytes?.decodeToString()) + // language=JSON + val expected = """{"id":912345678901}""" + assertEquals(expected, writer.bytes?.decodeToString()) } @Test - fun itAllowsRawValues() { + fun testWriteRawValue() { val writer = jsonStreamWriter() + // language=JSON val expected = """{"foo":1234.5678}""" writer.writeRawValue(expected) assertEquals(expected, writer.bytes?.decodeToString()) } -} -fun writeJsonStream(messages: List): ByteArray? { - val writer = jsonStreamWriter(true) - return writeMessagesArray(writer, messages) -} + @Test + fun testPretty() { + // language=JSON + val expected = """{ + "foo": "bar", + "nested": { + "array": [ + 1, + 2, + 3 + ], + "bool": true + }, + "baz": -1.23 +}""" + val writer = jsonStreamWriter(true).apply { + beginObject() + writeName("foo") + writeValue("bar") + writeName("nested") + beginObject() + writeName("array") + beginArray() + writeValue(1) + writeValue(2) + writeValue(3) + endArray() + writeName("bool") + writeValue(true) + endObject() + writeName("baz") + writeValue(-1.23) + endObject() + } + val actual = writer.bytes?.decodeToString() + assertEquals(expected, actual) + } -fun writeMessagesArray(writer: JsonStreamWriter, messages: List): ByteArray? 
{ - writer.beginArray() - for (message in messages) { - writeMessage(writer, message) + @Test + fun testBoolean() { + val actual = jsonStreamWriter().apply { + beginArray() + writeValue(true) + writeValue(false) + endArray() + }.bytes?.decodeToString() + assertEquals("[true,false]", actual) } - writer.endArray() - return writer.bytes -} -fun writeMessage(writer: JsonStreamWriter, message: Message) { - writer.beginObject() - writer.writeName("id") - writer.writeValue(message.id) - writer.writeName("text") - writer.writeValue(message.text) - if (message.geo != null) { - writer.writeName("geo") - writeDoublesArray(writer, message.geo) - } else { - writer.writeName("geo") - writer.writeNull() + @Test + fun testNull() { + val actual = jsonStreamWriter().apply { + writeNull() + }.bytes?.decodeToString() + assertEquals("null", actual) } - writer.writeName("user") - writeUser(writer, message.user) - writer.endObject() -} -fun writeUser(writer: JsonStreamWriter, user: User) { - writer.beginObject() - writer.writeName("name") - writer.writeValue(user.name) - writer.writeName("followers_count") - writer.writeValue(user.followersCount) - writer.endObject() -} + @Test + fun testEmpty() { + val actualEmptyArray = jsonStreamWriter().apply { + beginArray() + endArray() + }.bytes?.decodeToString() + assertEquals("[]", actualEmptyArray) -fun writeDoublesArray(writer: JsonStreamWriter, doubles: Array?) 
{ - writer.beginArray() - if (doubles != null) { - for (value in doubles) { - writer.writeValue(value) + val actualEmptyObject = jsonStreamWriter().apply { + beginObject() + endObject() + }.bytes?.decodeToString() + assertEquals("{}", actualEmptyObject) + } + + @Test + fun testObjectInsideArray() { + val actual = jsonStreamWriter().apply { + beginArray() + repeat(3) { + beginObject() + endObject() + } + endArray() + }.bytes?.decodeToString() + assertEquals("[{},{},{}]", actual) + } + + @Test + fun testObjectInsideObject() { + val actual = jsonStreamWriter().apply { + beginObject() + writeName("nested") + beginObject() + writeName("foo") + writeValue("bar") + endObject() + endObject() + }.bytes?.decodeToString() + assertEquals("""{"nested":{"foo":"bar"}}""", actual) + } + + @Test + fun testArrayInsideObject() { + val actual = jsonStreamWriter().apply { + beginObject() + writeName("foo") + beginArray() + endArray() + + writeName("b\nar") + beginArray() + endArray() + endObject() + }.bytes?.decodeToString() + assertEquals("""{"foo":[],"b\nar":[]}""", actual) + } + + @Test + fun testArrayInsideArray() { + val actual = jsonStreamWriter().apply { + beginArray() + beginArray() + writeValue(5) + endArray() + beginArray() + endArray() + endArray() + }.bytes?.decodeToString() + assertEquals("""[[5],[]]""", actual) + } + + @Test + fun testEscape() { + val tests = listOf( + // sanity check values that shouldn't be escaped + "" to "", + "foo" to "foo", + // surrogate pair + "\uD801\uDC37" to "\uD801\uDC37", + + // escaped + "foo\r\n" to "foo\\r\\n", + "foo\r\nbar" to "foo\\r\\nbar", + "foo\bar" to "foo\\bar", + "\u000Coobar" to "\\foobar", + "\u0008f\u000Co\to\r\n" to "\\bf\\fo\\to\\r\\n", + "\"test\"" to "\\\"test\\\"", + "\u0000" to "\\u0000", + "\u001f" to "\\u001f", + ) + + tests.forEachIndexed { idx, test -> + assertEquals(test.second, test.first.escape(), "[idx=$idx] escaped value not equal") } } - writer.endArray() -} -val user1 = User("json_newb", 41) -val user2 = 
User("jesse", 2) + @Test + fun testInvalidClose() { + assertFailsWith("end empty array") { + jsonStreamWriter().apply { + beginArray() + endObject() + } + } -class Message(val id: Long, val text: String, val geo: Array?, val user: User) -data class User(val name: String, val followersCount: Int) + assertFailsWith("end non-empty array") { + jsonStreamWriter().apply { + beginArray() + writeValue(1) + endObject() + } + } -val expected: String = """[ - { - "id": 912345678901, - "text": "How do I stream JSON in Java?", - "geo": null, - "user": { - "name": "json_newb", - "followers_count": 41 + assertFailsWith("end empty object") { + jsonStreamWriter().apply { + beginObject() + endArray() + } } - }, - { - "id": 912345678902, - "text": "@json_newb just use JsonWriter!", - "geo": [ - 50.454722, - -104.606667 - ], - "user": { - "name": "jesse", - "followers_count": 2 + + assertFailsWith("end object key no value") { + jsonStreamWriter().apply { + beginObject() + writeName("foo") + endObject() + } + } + + assertFailsWith("end non empty object") { + jsonStreamWriter().apply { + beginObject() + writeName("foo") + writeValue(1) + endArray() + } + } + + assertFailsWith("end array without start") { + jsonStreamWriter().apply { endObject() } + } + + assertFailsWith("end object without start") { + jsonStreamWriter().apply { endArray() } } } -]""" +} diff --git a/runtime/serde/serde-json/jvm/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamReaderGson.kt b/runtime/serde/serde-json/jvm/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamReaderGson.kt deleted file mode 100644 index 60765ac7d6..0000000000 --- a/runtime/serde/serde-json/jvm/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamReaderGson.kt +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * SPDX-License-Identifier: Apache-2.0. 
- */ -package aws.smithy.kotlin.runtime.serde.json - -import aws.smithy.kotlin.runtime.serde.DeserializationException -import com.google.gson.stream.JsonReader -import java.nio.charset.Charset -import com.google.gson.stream.JsonToken as RawToken - -private class JsonStreamReaderGson(payload: ByteArray, charset: Charset = Charsets.UTF_8) : JsonStreamReader { - private val reader = JsonReader(payload.inputStream().reader(charset)) - - override suspend fun nextToken(): JsonToken = when (peek()) { - RawJsonToken.BeginArray -> { - reader.beginArray() - JsonToken.BeginArray - } - RawJsonToken.EndArray -> { - reader.endArray() - JsonToken.EndArray - } - RawJsonToken.BeginObject -> { - reader.beginObject() - JsonToken.BeginObject - } - RawJsonToken.EndObject -> { - reader.endObject() - JsonToken.EndObject - } - RawJsonToken.Name -> JsonToken.Name(reader.nextName()) - RawJsonToken.String -> JsonToken.String(reader.nextString()) - RawJsonToken.Number -> JsonToken.Number(reader.nextString()) - RawJsonToken.Bool -> JsonToken.Bool(reader.nextBoolean()) - RawJsonToken.Null -> { - reader.nextNull() - JsonToken.Null - } - RawJsonToken.EndDocument -> JsonToken.EndDocument - } - - override suspend fun skipNext() = reader.skipValue() - - override suspend fun peek(): RawJsonToken = when (reader.peek()) { - RawToken.BEGIN_ARRAY -> RawJsonToken.BeginArray - RawToken.END_ARRAY -> RawJsonToken.EndArray - RawToken.BEGIN_OBJECT -> RawJsonToken.BeginObject - RawToken.END_OBJECT -> RawJsonToken.EndObject - RawToken.NAME -> RawJsonToken.Name - RawToken.STRING -> RawJsonToken.String - RawToken.NUMBER -> RawJsonToken.Number - RawToken.BOOLEAN -> RawJsonToken.Bool - RawToken.NULL -> RawJsonToken.Null - RawToken.END_DOCUMENT -> RawJsonToken.EndDocument - else -> throw DeserializationException("unknown JSON token encountered during deserialization") - } -} - -/* -* Creates a [JsonStreamReader] instance -*/ -actual fun jsonStreamReader(payload: ByteArray): JsonStreamReader = 
JsonStreamReaderGson(payload) diff --git a/runtime/serde/serde-json/jvm/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriterGson.kt b/runtime/serde/serde-json/jvm/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriterGson.kt deleted file mode 100644 index d33a862857..0000000000 --- a/runtime/serde/serde-json/jvm/src/aws/smithy/kotlin/runtime/serde/json/JsonStreamWriterGson.kt +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * SPDX-License-Identifier: Apache-2.0. - */ -package aws.smithy.kotlin.runtime.serde.json - -import aws.smithy.kotlin.runtime.serde.SerializationException -import com.google.gson.stream.JsonWriter -import java.io.BufferedWriter -import java.io.ByteArrayOutputStream -import java.io.IOException -import java.io.OutputStreamWriter - -/** - * Thin wrapper around Gson's JSON generator. Uses the gson.stream library's JsonWriter. - * https://www.javadoc.io/doc/com.google.code.gson/gson/latest/com.google.gson/com/google/gson/stream/JsonWriter.html - */ -private class JsonStreamWriterGson(pretty: Boolean) : JsonStreamWriter { - private val DEFAULT_BUFFER_SIZE = 1024 - private val baos: ByteArrayOutputStream = ByteArrayOutputStream(DEFAULT_BUFFER_SIZE) - - private var jsonStreamWriter: JsonWriter - - init { - try { - /** - * A [JsonWriter] created is by default enabled with UTF-8 encoding - */ - val bufferedWriter = BufferedWriter(OutputStreamWriter(baos, "UTF-8")) - var jsonWriter = JsonWriter(bufferedWriter) - if (pretty) { - jsonWriter.setIndent(" ") - } - jsonStreamWriter = jsonWriter - } catch (e: IOException) { - throw SerializationException(e) - } - } - - /** - * Closes the jsonStreamWriter and flushes to write. Must be called when finished writing JSON - * content. - */ - private fun close() { - try { - jsonStreamWriter.close() - } catch (e: IOException) { - throw SerializationException(e) - } - } - - /** - * Get the JSON content as a UTF-8 encoded byte array. 
It is recommended to hold onto the array - * reference rather then making repeated calls to this method as a new array will be created - * each time. - * - * @return Array of UTF-8 encoded bytes that make up the generated JSON. - */ - override val bytes: ByteArray - get() { - close() - return baos.toByteArray() - } - - override fun beginArray() { - try { - jsonStreamWriter.beginArray() - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun endArray() { - try { - jsonStreamWriter.endArray() - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeNull() { - try { - jsonStreamWriter.nullValue() - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun beginObject() { - try { - jsonStreamWriter.beginObject() - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun endObject() { - try { - jsonStreamWriter.endObject() - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeName(name: String) { - try { - jsonStreamWriter.name(name) - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeValue(value: String) { - try { - jsonStreamWriter.value(value) - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeValue(bool: Boolean) { - try { - jsonStreamWriter.value(bool) - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeValue(value: Long) { - try { - jsonStreamWriter.value(value) - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeValue(value: Double) { - try { - jsonStreamWriter.value(value) - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeValue(value: Float) { - try { - jsonStreamWriter.value(value) - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeValue(value: Short) { - 
try { - jsonStreamWriter.value(value) - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeValue(value: Int) { - try { - jsonStreamWriter.value(value) - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeValue(value: Byte) { - try { - jsonStreamWriter.value(value.toLong()) - } catch (e: IOException) { - throw SerializationException(e) - } - } - - override fun writeRawValue(value: String) { - try { - jsonStreamWriter.jsonValue(value) - } catch (e: IOException) { - throw SerializationException(e) - } - } -} - -/* -* Creates a [JsonStreamWriter] instance to write JSON -*/ -internal actual fun jsonStreamWriter(pretty: Boolean): JsonStreamWriter = JsonStreamWriterGson(pretty) diff --git a/runtime/serde/serde-xml/common/src/aws/smithy/kotlin/runtime/serde/xml/XmlSerializer.kt b/runtime/serde/serde-xml/common/src/aws/smithy/kotlin/runtime/serde/xml/XmlSerializer.kt index ef67a908e5..78c7e90af6 100644 --- a/runtime/serde/serde-xml/common/src/aws/smithy/kotlin/runtime/serde/xml/XmlSerializer.kt +++ b/runtime/serde/serde-xml/common/src/aws/smithy/kotlin/runtime/serde/xml/XmlSerializer.kt @@ -8,6 +8,7 @@ import aws.smithy.kotlin.runtime.serde.* import aws.smithy.kotlin.runtime.serde.xml.dom.* import aws.smithy.kotlin.runtime.time.Instant import aws.smithy.kotlin.runtime.time.TimestampFormat +import aws.smithy.kotlin.runtime.util.* /** * Provides serialization for the XML message format. @@ -17,8 +18,8 @@ import aws.smithy.kotlin.runtime.time.TimestampFormat class XmlSerializer(private val xmlWriter: XmlStreamWriter = xmlStreamWriter()) : Serializer, StructSerializer { // FIXME - clean up stack to distinguish between mutable/immutable and move to utils? (e.g. 
MutableStack = mutableStackOf()) - private var nodeStack: Stack = mutableListOf() - internal var parentDescriptorStack: Stack = mutableListOf() + private var nodeStack: ListStack = mutableListOf() + internal var parentDescriptorStack: ListStack = mutableListOf() override fun toByteArray(): ByteArray = xmlWriter.bytes @@ -28,7 +29,7 @@ class XmlSerializer(private val xmlWriter: XmlStreamWriter = xmlStreamWriter()) // use the parent descriptor instead of the object descriptor passed to us. // The object descriptor is for root nodes, nested structures have their own field descriptor // that describes the referred to struct - val structDescriptor = parentDescriptorStack.peekOrNull() ?: descriptor + val structDescriptor = parentDescriptorStack.topOrNull() ?: descriptor // Serialize top-level (root node) ns declarations and non-default declarations. val isRoot = nodeStack.isEmpty() @@ -74,7 +75,7 @@ class XmlSerializer(private val xmlWriter: XmlStreamWriter = xmlStreamWriter()) check(nodeStack.isNotEmpty()) { "Expected nodeStack to have a value, but was empty." 
} val tagName = nodeStack.pop() - if (parentDescriptorStack.isNotEmpty() && !parentDescriptorStack.peek().isMapOrList) { + if (parentDescriptorStack.isNotEmpty() && !parentDescriptorStack.top().isMapOrList) { xmlWriter.endTag(tagName) } } diff --git a/runtime/serde/serde-xml/common/src/aws/smithy/kotlin/runtime/serde/xml/dom/XmlNode.kt b/runtime/serde/serde-xml/common/src/aws/smithy/kotlin/runtime/serde/xml/dom/XmlNode.kt index 75e7d12269..9a33d671fa 100644 --- a/runtime/serde/serde-xml/common/src/aws/smithy/kotlin/runtime/serde/xml/dom/XmlNode.kt +++ b/runtime/serde/serde-xml/common/src/aws/smithy/kotlin/runtime/serde/xml/dom/XmlNode.kt @@ -9,7 +9,7 @@ import aws.smithy.kotlin.runtime.serde.DeserializationException import aws.smithy.kotlin.runtime.serde.xml.XmlStreamReader import aws.smithy.kotlin.runtime.serde.xml.XmlToken import aws.smithy.kotlin.runtime.serde.xml.xmlStreamReader -import aws.smithy.kotlin.runtime.util.InternalApi +import aws.smithy.kotlin.runtime.util.* /** * DOM representation of an XML document @@ -60,14 +60,14 @@ class XmlNode { // parse a string into a dom representation suspend fun parseDom(reader: XmlStreamReader): XmlNode { - val nodeStack: Stack = mutableListOf() + val nodeStack: ListStack = mutableListOf() loop@while (true) { when (val token = reader.nextToken()) { is XmlToken.BeginElement -> { val newNode = XmlNode.fromToken(token) if (nodeStack.isNotEmpty()) { - val curr = nodeStack.peek() + val curr = nodeStack.top() curr.addChild(newNode) newNode.parent = curr } @@ -75,7 +75,7 @@ suspend fun parseDom(reader: XmlStreamReader): XmlNode { nodeStack.push(newNode) } is XmlToken.EndElement -> { - val curr = nodeStack.peek() + val curr = nodeStack.top() if (curr.name != token.name) { throw DeserializationException("expected end of element: `${curr.name}`, found: `${token.name}`") @@ -87,7 +87,7 @@ suspend fun parseDom(reader: XmlStreamReader): XmlNode { } } is XmlToken.Text -> { - val curr = nodeStack.peek() + val curr = nodeStack.top() 
curr.text = token.value } null, @@ -100,13 +100,6 @@ suspend fun parseDom(reader: XmlStreamReader): XmlNode { return nodeStack.pop() } -fun MutableList.push(item: T) = add(item) -fun MutableList.pop(): T = removeLast() -fun MutableList.popOrNull(): T? = removeLastOrNull() -fun MutableList.peek(): T = this[count() - 1] -fun MutableList.peekOrNull(): T? = if (isNotEmpty()) peek() else null -typealias Stack = MutableList - fun XmlNode.toXmlString(pretty: Boolean = false): String { val sb = StringBuilder() formatXmlNode(this, 0, sb, pretty) diff --git a/runtime/serde/serde-xml/jvm/src/aws/smithy/kotlin/runtime/serde/xml/XmlStreamReaderXmlPull.kt b/runtime/serde/serde-xml/jvm/src/aws/smithy/kotlin/runtime/serde/xml/XmlStreamReaderXmlPull.kt index 07cab47bdf..86ea4cf099 100644 --- a/runtime/serde/serde-xml/jvm/src/aws/smithy/kotlin/runtime/serde/xml/XmlStreamReaderXmlPull.kt +++ b/runtime/serde/serde-xml/jvm/src/aws/smithy/kotlin/runtime/serde/xml/XmlStreamReaderXmlPull.kt @@ -7,7 +7,7 @@ package aws.smithy.kotlin.runtime.serde.xml import aws.smithy.kotlin.runtime.logging.Logger import aws.smithy.kotlin.runtime.serde.DeserializationException -import aws.smithy.kotlin.runtime.serde.xml.dom.push +import aws.smithy.kotlin.runtime.util.push import org.xmlpull.mxp1.MXParser import org.xmlpull.v1.XmlPullParser import org.xmlpull.v1.XmlPullParserException diff --git a/runtime/utils/common/src/aws/smithy/kotlin/runtime/util/Stack.kt b/runtime/utils/common/src/aws/smithy/kotlin/runtime/util/Stack.kt new file mode 100644 index 0000000000..cf9e3ce24d --- /dev/null +++ b/runtime/utils/common/src/aws/smithy/kotlin/runtime/util/Stack.kt @@ -0,0 +1,45 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. 
+ */
+
+package aws.smithy.kotlin.runtime.util
+
+/**
+ * Convenience type for treating a [MutableList] like a stack; the last element of the list is the top
+ */
+typealias ListStack<T> = MutableList<T>
+
+/**
+ * Push [item] onto the top of the stack; the result is whatever [MutableList.add] returns
+ */
+fun <T> ListStack<T>.push(item: T) = add(item)
+
+/**
+ * Pop the top of the stack or throw [NoSuchElementException] if the stack is empty
+ */
+fun <T> ListStack<T>.pop(): T = removeLast()
+
+/**
+ * Pop the top of the stack or return null if the stack is empty
+ */
+fun <T> ListStack<T>.popOrNull(): T? = removeLastOrNull()
+
+/**
+ * Return the top of the stack without removing it; indexing fails with an exception if the stack is empty
+ */
+fun <T> ListStack<T>.top(): T = this[lastIndex]
+
+/**
+ * Return the top of the stack without removing it, or null if the stack is empty
+ */
+fun <T> ListStack<T>.topOrNull(): T? = if (isNotEmpty()) top() else null
+
+/**
+ * Replace the top of the stack with [item]; returns the previous top, or null if the stack was empty
+ */
+fun <T> ListStack<T>.replaceTop(item: T): T? {
+    val lastTop = popOrNull()
+    push(item)
+    return lastTop
+}
diff --git a/runtime/utils/common/src/aws/smithy/kotlin/runtime/util/text/Utf8.kt b/runtime/utils/common/src/aws/smithy/kotlin/runtime/util/text/Utf8.kt
new file mode 100644
index 0000000000..360eb3b111
--- /dev/null
+++ b/runtime/utils/common/src/aws/smithy/kotlin/runtime/util/text/Utf8.kt
@@ -0,0 +1,64 @@
+/*
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
+ */
+
+package aws.smithy.kotlin.runtime.util.text
+
+import aws.smithy.kotlin.runtime.util.InternalApi
+
+/**
+ * Returns the length in bytes (1 to 4) of the UTF-8 sequence whose first byte is [start].
+ *
+ * Only the leading bits of [start] are inspected, so lead bytes that could only begin an overlong or
+ * out-of-range encoding (`0xC0`, `0xC1`, `0xF5`..`0xF7`) are still reported as 2 or 4 byte sequences.
+ *
+ * @param start the first byte of an encoded code point
+ * @return the total number of bytes in the sequence, including [start]
+ * @throws IllegalStateException if [start] is a continuation byte (`10xx xxxx`) or matches `1111 1xxx`
+ */
+@InternalApi
+@Suppress("NOTHING_TO_INLINE")
+inline fun byteCountUtf8(start: Byte): Int {
+    // toUInt() sign-extends: bytes >= 0x80 become large values, and the masks below only test the low 8 bits
+    val x = start.toUInt()
+    return when {
+        x <= 0x7fu -> 1 // 0xxx xxxx one byte
+        x and 0xe0u == 0xc0u -> 2 // 110x xxxx two bytes
+        x and 0xf0u == 0xe0u -> 3 // 1110 xxxx three bytes
+        x and 0xf8u == 0xf0u -> 4 // 1111 0xxx 4 bytes
+        else -> throw IllegalStateException("$start is not a valid UTF-8 start sequence")
+    }
+}
+
+/**
+ * The minimum value of a supplementary code point, `U+10000`.
+ */
+private const val SUPPLEMENTARY_PLANE_LOW: Int = 0x10000
+
+/**
+ * Largest value that is a valid Unicode code point (`U+10FFFF`)
+ */
+private const val MAX_CODEPOINT: Int = 0x10FFFF
+
+/**
+ * Returns true when [codePoint] lies in the supplementary planes, i.e. needs a UTF-16 surrogate pair
+ */
+@InternalApi
+fun Char.Companion.isSupplementaryCodePoint(codePoint: Int): Boolean =
+    codePoint >= SUPPLEMENTARY_PLANE_LOW && codePoint <= MAX_CODEPOINT
+
+/**
+ * Converts [codePoint] to its UTF-16 chars: a single-element array when it is below the supplementary
+ * planes, otherwise a two-element array holding the high surrogate at index 0 and the low surrogate at
+ * index 1. Throws [IllegalArgumentException] when [codePoint] is negative or above the maximum code point.
+ */
+@InternalApi
+fun Char.Companion.codePointToChars(codePoint: Int): CharArray {
+    require(codePoint in 0..MAX_CODEPOINT) { "invalid codepoint $codePoint" }
+    if (codePoint < SUPPLEMENTARY_PLANE_LOW) return charArrayOf(codePoint.toChar())
+    val offset = codePoint - 0x10000 // 20-bit value that is split across the two surrogates
+    val highSurrogate = MIN_HIGH_SURROGATE.code + ((offset ushr 10) and 0x3FF)
+    val lowSurrogate = MIN_LOW_SURROGATE.code + (offset and 0x3FF)
+    return charArrayOf(highSurrogate.toChar(), lowSurrogate.toChar())
+}
diff --git a/runtime/utils/common/test/aws/smithy/kotlin/runtime/util/text/Utf8Test.kt b/runtime/utils/common/test/aws/smithy/kotlin/runtime/util/text/Utf8Test.kt
new file mode 100644
index 0000000000..1380e5e79e
--- /dev/null
+++ b/runtime/utils/common/test/aws/smithy/kotlin/runtime/util/text/Utf8Test.kt
@@ -0,0 +1,45 @@
+/*
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
+ */
+
+package aws.smithy.kotlin.runtime.util.text
+
+import kotlin.test.*
+
+class Utf8Test {
+    // Covers byteCountUtf8, Char.isSupplementaryCodePoint and Char.codePointToChars
+    @Test
+    fun testUtf8ByteCount() {
+        assertEquals(1, byteCountUtf8("$".encodeToByteArray()[0]))
+        assertEquals(2, byteCountUtf8("¢".encodeToByteArray()[0]))
+        assertEquals(3, byteCountUtf8("€".encodeToByteArray()[0]))
+        assertEquals(4, byteCountUtf8("\uD834\uDD22".encodeToByteArray()[0]))
+    }
+
+    @Test
+    fun testIsSupplementaryCodePoint() {
+        assertFalse(Char.isSupplementaryCodePoint(-1))
+        for (c in 0..0xFFFF) { // every code point of the Basic Multilingual Plane
+            assertFalse(Char.isSupplementaryCodePoint(c))
+        }
+        for (c in 0xFFFF + 1..0x10FFFF) { // every supplementary code point
+            assertTrue(Char.isSupplementaryCodePoint(c))
+        }
+        assertFalse(Char.isSupplementaryCodePoint(0x10FFFF + 1))
+    }
+
+    @Test
+    fun testCodePointToChars() {
+        assertContentEquals(charArrayOf('\uD800', '\uDC00'), Char.codePointToChars(0x010000))
+        assertContentEquals(charArrayOf('\uD800', '\uDC01'), Char.codePointToChars(0x010001))
+        assertContentEquals(charArrayOf('\uD801', '\uDC01'), Char.codePointToChars(0x010401))
+        assertContentEquals(charArrayOf('\uDBFF', '\uDFFF'), Char.codePointToChars(0x10FFFF))
+
+        assertContentEquals(charArrayOf('A'), Char.codePointToChars(65))
+
+        // anything outside 0..0x10FFFF is not a code point and must be rejected
+        assertFailsWith<IllegalArgumentException> { Char.codePointToChars(Int.MAX_VALUE) }
+        assertFailsWith<IllegalArgumentException> { Char.codePointToChars(-1) }
+    }
+}