Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
5efac86
Add reatUtf8Char and import CharStream
aajtodd Jul 14, 2021
6ea21d9
Import KMP compatible stream reader from smithy-kotlin#42
aajtodd Jul 14, 2021
e9c599c
fix invalid number parsing
aajtodd Jul 15, 2021
289b089
wip comma handling
aajtodd Jul 16, 2021
4da6a6d
note to self
aajtodd Jul 20, 2021
adb40c6
Merge remote-tracking branch 'origin/main' into kmp-json
aajtodd Jul 26, 2021
cd1f752
rename to readLiteral
aajtodd Jul 26, 2021
dbc4a10
refactor: remove utf8 char handling from SdkByteReadChannel
aajtodd Jul 26, 2021
5201698
fix: handle surrogates in CharStream
aajtodd Jul 26, 2021
ab208a7
fix: handle escapes
aajtodd Jul 26, 2021
656a757
fix reading escaped unicode and control chars
aajtodd Jul 27, 2021
7131dd1
add failing tests
aajtodd Jul 27, 2021
74a13f4
exception hygiene
aajtodd Jul 27, 2021
336074c
add unescaped control character handling
aajtodd Jul 27, 2021
4b88f50
add instructions for testing against JSONTestSuite
aajtodd Jul 27, 2021
2838994
Merge remote-tracking branch 'origin/main' into kmp-json
aajtodd Aug 23, 2021
c111e64
Merge remote-tracking branch 'origin/main' into kmp-json
aajtodd Aug 31, 2021
ee50ae0
Merge remote-tracking branch 'origin/main' into kmp-json
aajtodd Sep 1, 2021
8ba6147
update expected exception
aajtodd Sep 1, 2021
e27566c
use more meaningful states for handling errors
aajtodd Sep 1, 2021
377037b
remove RawJsonToken
aajtodd Sep 1, 2021
7d76fcf
fix lexer to support peek operations
aajtodd Sep 3, 2021
bffb4a4
cleanup state management
aajtodd Sep 3, 2021
c2fdf90
Merge remote-tracking branch 'origin/main' into kmp-json
aajtodd Sep 9, 2021
9270970
cleanup
aajtodd Sep 9, 2021
b67e0ba
Merge branch 'kmp-json-refactor' into kmp-json
aajtodd Sep 9, 2021
2519012
update testing readme
aajtodd Sep 9, 2021
bb966a6
replace gson with hand rolled encoder
aajtodd Sep 9, 2021
fb0cbff
share same state definition
aajtodd Sep 9, 2021
f780452
reset sdk version
aajtodd Sep 10, 2021
7892ddc
Merge remote-tracking branch 'origin/main' into kmp-json
aajtodd Sep 10, 2021
22e2ad9
fix multibyte unicode order
aajtodd Sep 13, 2021
bf711fe
wip microbenchmarking
aajtodd Sep 10, 2021
fb2d23f
optimize lexer
aajtodd Sep 10, 2021
0889cf9
cleanup
aajtodd Sep 13, 2021
93eed44
remove CharStream and cleanup
aajtodd Sep 14, 2021
6bb1a3f
cleanup error handling and include position when possible
aajtodd Sep 14, 2021
54d989a
cleanup encoder
aajtodd Sep 14, 2021
f45a4a1
Merge remote-tracking branch 'origin/main' into kmp-json
aajtodd Sep 14, 2021
b41fac3
refactor: remove unnecessary type from consts
aajtodd Sep 15, 2021
be5a6b9
Merge remote-tracking branch 'origin/main' into kmp-json
aajtodd Sep 20, 2021
dacca0d
feedback and cleanup
aajtodd Sep 20, 2021
a2f600c
Merge remote-tracking branch 'origin/main' into kmp-json
aajtodd Oct 6, 2021
2affe95
fix surrounding backticks
aajtodd Oct 6, 2021
78e8be7
include better offset info in exceptions; fix backticks
aajtodd Oct 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*/
package aws.smithy.kotlin.runtime.io

import aws.smithy.kotlin.runtime.util.text.byteCountUtf8
import io.ktor.utils.io.*
import io.ktor.utils.io.core.*

Expand Down Expand Up @@ -160,3 +161,33 @@ internal suspend fun SdkByteReadChannel.readAvailableFallback(dest: SdkByteBuffe
dest.writeFully(tmp)
return tmp.size.toLong()
}

/**
 * Decode the next UTF-8 encoded code point from this channel.
 *
 * @return the decoded code point, or `null` when the channel is closed for reading with no bytes remaining
 * @throws IllegalStateException when [byteCountUtf8] reports a length outside 1..4 for the leading byte,
 * when a trailing byte is not of the form `10xxxxxx`, or when the channel closes part way through a
 * multi-byte sequence
 */
suspend fun SdkByteReadChannel.readUtf8CodePoint(): Int? {
    awaitContent()
    if (availableForRead == 0 && isClosedForRead) return null

    val lead = readByte()
    val length = byteCountUtf8(lead)
    val leadBits = lead.toInt()

    // Strip the length-marker bits from the lead byte; a single byte sequence is used verbatim.
    var codePoint = when (length) {
        1 -> leadBits
        2 -> leadBits and 0x1f
        3 -> leadBits and 0x0f
        4 -> leadBits and 0x07
        else -> throw IllegalStateException("Invalid UTF-8 start sequence: $lead")
    }

    // Fold in the trailing bytes, six payload bits at a time.
    var remaining = length - 1
    while (remaining > 0) {
        awaitContent()
        if (availableForRead == 0 && isClosedForRead) {
            throw IllegalStateException("unexpected EOF: expected $remaining bytes")
        }

        val next = readByte()
        val nextBits = next.toInt()
        if ((nextBits and 0xc0) != 0x80) {
            throw IllegalStateException("invalid UTF-8 successor byte: $next")
        }

        codePoint = (codePoint shl 6) or (nextBits and 0x3f)
        remaining--
    }

    return codePoint
}
Original file line number Diff line number Diff line change
Expand Up @@ -131,4 +131,36 @@ class SdkByteChannelOpsTest {
yield()
assertFalse(awaitingContent)
}

@Test
fun testReadUtf8Chars() = runSuspendTest {
    val text = "hello"
    val channel = SdkByteReadChannel(text.encodeToByteArray())

    // each single-byte character must come back as its own code point, in order
    for (expected in text) {
        assertEquals(expected, channel.readUtf8CodePoint()?.toChar())
    }

    // nothing is left to read once every byte has been consumed
    assertNull(channel.readUtf8CodePoint())
}

@Test
fun testReadMultibyteUtf8Chars(): Unit = runSuspendTest {
    // One character for each UTF-8 encoded length:
    //   dollar sign - 1 byte, cent sign - 2 bytes, euro sign - 3 bytes,
    //   musical F clef - 4 bytes (a surrogate pair in a Kotlin string)
    // https://www.fileformat.info/info/unicode/char/1d122/index.htm
    val chan = SdkByteReadChannel("$¢€\uD834\uDD22".encodeToByteArray())

    val expected = listOf(
        36, // $
        162, // ¢
        8364, // €
        119074, // musical F clef
    )

    for ((i, exp) in expected.withIndex()) {
        val code = chan.readUtf8CodePoint()
        assertEquals(exp, code, "[i=$i] expected $exp, found $code ")
    }

    // all four code points consumed; nothing further to read
    assertNull(chan.readUtf8CodePoint())
}
}
2 changes: 1 addition & 1 deletion runtime/serde/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ kotlin {
}
}

subprojects {
allprojects {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Why was this change necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might not be now. It was because we added code to serde/common that needed it. I'll check if it's still required.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was added for CharStream tests (since removed). I think I'll keep it since it allows tests to be added to serde common if needed.

kotlin {
sourceSets {
commonTest {
Expand Down
178 changes: 178 additions & 0 deletions runtime/serde/serde-json/TESTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
How to run JSONTestSuite against serde-json deserialize
========================================================

When making changes to the lexer it is a good idea to run the
changes against the [JSONTestSuite](https://github.com/nst/JSONTestSuite) and manually examine the test results.

### How to set up the JSONTestSuite
Comment on lines +1 to +7
Copy link
Contributor

@ianbotsf ianbotsf Sep 14, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Why not just add an integration test that uses JSONTestSuite directly into the code?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

JSONTestSuite is a bit interesting/onerous to setup and use. It does not lend itself to this kind of integration and I see it as an operational cost not worth paying.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this line of thinking, can this code as-is serve as a general purpose JSON parser? If so perhaps it would make sense to break out as a separate dependency at some point in the future such that JSONTestSuite could run against it as the others they have in their repo. Obviously not something for this PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless we are planning to support it as a general purpose parser I would think it probably isn't in our interest (or anyone else) to do that. Perhaps we fork it and set it up so that it's easier to run...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "good idea to run the changes against JSONTestSuite" sounds like best intentions to me, is there a way we can mechanize it? When I was playing around with the testing, I just copy/pasted all of the parser scenarios from their input files into a (admittedly massive) test class.


1. Clone the [JSONTestSuite](https://github.com/nst/JSONTestSuite) repository.
2. In `JSONTestSuite/parsers`, create a new Gradle JVM application project named `test_smithy_kotlin`.
3. Add the following `build.gradle.kts` file

```kotlin
plugins {
kotlin("jvm") version "1.5.30"
application
id("com.github.johnrengelman.shadow") version "7.0.0"
}

application {
mainClass.set("aws.smithy.kotlin.jsontest.MainKt")
}

allprojects {
repositories {
mavenLocal()
mavenCentral()
}
}


// NOTE: set to whatever locally published version you are working on
val smithyKotlinVersion: String = "0.4.1-kmp-json"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: The versions users have will almost always be something like 0.4.0-alpha or 0.4.0-snapshot. We should make the example something that looks familiar.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This readme is targeted towards developers (i.e. us). Not end users.

dependencies {
implementation(kotlin("stdlib"))
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:1.5.0")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Do we need to include a note about matching the coroutines version used by the rest of the project?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure that's important, again this is targeted at developers of smithy-kotlin though so I would expect us to make changes here as necessary.

implementation("aws.smithy.kotlin:serde-json:$smithyKotlinVersion")
implementation("aws.smithy.kotlin:utils:$smithyKotlinVersion")
}

tasks.jar {
manifest {
attributes["Main-Class"] = "aws.smithy.kotlin.jsontest.MainKt"
}
}
```

4. Create `src/main/kotlin/Main.kt` with the following code:

```kotlin
package aws.smithy.kotlin.jsontest
Comment on lines +48 to +51
Copy link
Contributor

@ianbotsf ianbotsf Sep 14, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Source file path and package directive do not match.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It isn't really important since this is creating an application jar. Also technically it would be in line with the recommended directory structure if aws.smithy.kotlin.jsontest is the "root".


import kotlinx.coroutines.runBlocking
import kotlin.system.exitProcess
import java.io.IOException
import java.nio.file.Files
import java.nio.file.Paths
import aws.smithy.kotlin.runtime.serde.json.JsonToken
import aws.smithy.kotlin.runtime.serde.json.jsonStreamReader
import aws.smithy.kotlin.runtime.util.InternalApi


/**
 * Tokenize [bytes] with the serde-json stream reader and decide whether JSONTestSuite should
 * treat the input as valid.
 *
 * @param bytes raw contents of the test case file
 * @return `true` when lexing reaches [JsonToken.EndDocument] without throwing, at least one token
 * precedes it, and a document opening with an array/object also ends with the matching close token;
 * `false` otherwise (any exception raised while lexing is printed and reported as invalid)
 */
@OptIn(InternalApi::class)
suspend fun isValidJson(bytes: ByteArray): Boolean {
    val lexer = jsonStreamReader(bytes)
    println(lexer::class.qualifiedName)
    return try {
        val tokens = mutableListOf<JsonToken>()
        do {
            val token = lexer.nextToken()
            tokens.add(token)
        } while (token != JsonToken.EndDocument)

        // The loop only exits after EndDocument has been appended, so the last token is always
        // EndDocument; drop it to look at the document's own tokens.
        val pruned = tokens.dropLast(1)

        // An empty document is not valid JSON. Reject it explicitly rather than relying on
        // first() throwing NoSuchElementException into the catch block below.
        val first = pruned.firstOrNull() ?: return false
        val last = pruned.last()

        // The test suite includes incomplete objects and arrays (e.g. "[null,")
        // These are completely valid for this parser since it's just a tokenizer
        // and doesn't attempt to make semantic meaning from the input.
        // We'll just pretend to fail to satisfy the test suite
        when (first) {
            JsonToken.BeginArray -> last == JsonToken.EndArray
            JsonToken.BeginObject -> last == JsonToken.EndObject
            else -> true
        }
    } catch (ex: Exception) {
        println(ex)
        false
    }
}

/**
 * Command line entry point: validates the JSON file named by the first argument.
 *
 * Exit codes: `0` when [isValidJson] accepts the file, `1` when it rejects it, and `2` when no
 * argument was given or reading the file raised an [IOException].
 */
fun main(args: Array<String>): Unit = runBlocking {
    val path = args.firstOrNull()
    if (path == null) {
        println("Usage: java TestJSONParsing file.json")
        exitProcess(2)
    }

    val exitCode = try {
        val contents = Files.readAllBytes(Paths.get(path))
        if (isValidJson(contents)) {
            println("valid")
            0
        } else {
            println("invalid")
            1
        }
    } catch (ex: IOException) {
        println(ex)
        println("not found")
        2
    }

    exitProcess(exitCode)
}
```

5. Compile this program with `./gradlew build`.
NOTE: Be sure to publish all of `smithy-kotlin` "runtime" to maven local. It is helpful to just choose a unique version
to be sure that everything is wired up correctly.
6. Modify `JSONTestSuite/run_tests.py` so that the `programs` dictionary only contains this one entry:

```
programs = {
"SmithyKotlin":
{
"url":"",
"commands":["java" , "-jar", os.path.join(PARSERS_DIR, "test_smithy_kotlin/build/libs/test_smithy_kotlin-all.jar")]
}
}
```

7. Run `run_tests.py` and examine the output with a web browser by opening `JSONTestSuite/results/parsing.html`.

### Examining the results

When looking at `JSONTestSuite/results/parsing.html`, there is a matrix of test cases against their
results with a legend at the top.

Any test result marked with blue or light blue is for a test case where correct behavior isn't specified,
so use your best judgement to decide if it should have succeeded or failed.

The other colors are bad and should be carefully examined. At time of writing, the following test cases
succeed when they should fail, and we intentionally left it that way since we're not currently concerned
about being more lenient in the number parsing:

```
n_number_-01.json [-01]
n_number_-2..json [-2.]
n_number_.2e-3.json [.2e-3]
n_number_0.3e+.json [0.3e+]
n_number_0.3e.json [0.3e]
n_number_0.e1.json [0.e1]
n_number_0_capital_E+.json [0E+]
n_number_0_capital_E.json [0E]
n_number_0e+.json [0e+]
n_number_0e.json [0e]
n_number_1.0e+.json [1.0e+]
n_number_1.0e-.json [1.0e-]
n_number_1.0e.json [1.0e]
n_number_2.e+3.json [2.e+3]
n_number_2.e-3.json [2.e-3]
n_number_2.e3.json [2.e3]
n_number_9.e+.json [9.e+]
n_number_neg_int_starting_with_zero.json [-012]
n_number_neg_real_without_int_part.json [-.123]
n_number_real_without_fractional_part.json [1.]
n_number_starting_with_dot.json [.123]
n_number_with_leading_zero.json [012]
```



These test cases succeed with our parser and that's OK since we're
a token streaming parser (multiple values are allowed):
```
n_array_just_minus.json [-]
n_structure_double_array.json [][]
n_structure_whitespace_formfeed.json [0C] <=> [ ]
```
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ package aws.smithy.kotlin.runtime.serde.json
import aws.smithy.kotlin.runtime.serde.*

/**
* Provides a deserialiser for JSON documents
* Provides a deserializer for JSON documents
*
* @param payload underlying document from which tokens are read
*/
Expand Down Expand Up @@ -65,11 +65,11 @@ class JsonDeserializer(payload: ByteArray) : Deserializer, Deserializer.ElementI

override suspend fun deserializeStruct(descriptor: SdkObjectDescriptor): Deserializer.FieldIterator =
when (reader.peek()) {
RawJsonToken.BeginObject -> {
JsonToken.BeginObject -> {
reader.nextTokenOf<JsonToken.BeginObject>()
JsonFieldIterator(reader, descriptor, this)
}
RawJsonToken.Null -> JsonNullFieldIterator(this)
JsonToken.Null -> JsonNullFieldIterator(this)
else -> throw DeserializationException("Unexpected token type ${reader.peek()}")
}

Expand All @@ -88,28 +88,28 @@ class JsonDeserializer(payload: ByteArray) : Deserializer, Deserializer.ElementI
return token.value
}

override suspend fun nextHasValue(): Boolean = reader.peek() != RawJsonToken.Null
override suspend fun nextHasValue(): Boolean = reader.peek() != JsonToken.Null

override suspend fun hasNextEntry(): Boolean =
when (reader.peek()) {
RawJsonToken.EndObject -> {
JsonToken.EndObject -> {
// consume the token
reader.nextTokenOf<JsonToken.EndObject>()
false
}
RawJsonToken.Null,
RawJsonToken.EndDocument -> false
JsonToken.Null,
JsonToken.EndDocument -> false
else -> true
}

override suspend fun hasNextElement(): Boolean =
when (reader.peek()) {
RawJsonToken.EndArray -> {
JsonToken.EndArray -> {
// consume the token
reader.nextTokenOf<JsonToken.EndArray>()
false
}
RawJsonToken.EndDocument -> false
JsonToken.EndDocument -> false
else -> true
}
}
Expand All @@ -131,13 +131,13 @@ private class JsonFieldIterator(

override suspend fun findNextFieldIndex(): Int? {
val candidate = when (reader.peek()) {
RawJsonToken.EndObject -> {
JsonToken.EndObject -> {
// consume the token
reader.nextTokenOf<JsonToken.EndObject>()
null
}
RawJsonToken.EndDocument -> null
RawJsonToken.Null -> {
JsonToken.EndDocument -> null
JsonToken.Null -> {
reader.nextTokenOf<JsonToken.Null>()
null
}
Expand All @@ -151,7 +151,7 @@ private class JsonFieldIterator(

if (candidate != null) {
// found a field
if (reader.peek() == RawJsonToken.Null) {
if (reader.peek() == JsonToken.Null) {
// skip explicit nulls
reader.nextTokenOf<JsonToken.Null>()
return findNextFieldIndex()
Expand All @@ -166,17 +166,3 @@ private class JsonFieldIterator(
reader.skipNext()
}
}

/**
 * Read the next token from this reader and hand it back typed as [TExpected].
 * The token is validated with [requireToken] first, which throws when the type does not match.
 */
private suspend inline fun <reified TExpected : JsonToken> JsonStreamReader.nextTokenOf(): TExpected =
    nextToken().also { requireToken<TExpected>(it) } as TExpected

/**
 * Verify that the runtime class of [token] is exactly [TExpected].
 *
 * @throws DeserializationException when the two classes differ
 */
private inline fun <reified TExpected> requireToken(token: JsonToken) {
    val actual = token::class
    val wanted = TExpected::class
    if (actual == wanted) return

    throw DeserializationException("expected ${TExpected::class}; found ${token::class}")
}
Loading