Skip to content

Commit

Permalink
Introduce a non-recursive JSON parser
Browse files Browse the repository at this point in the history
This parser uses an explicit prediction stack, unlike the present
recursive descent parser where the parser state is represented on the
call stack. This difference makes the new parser suitable for use in
incremental parsing of huge JSON documents that cannot be conveniently
handled piece-wise by the recursive descent parser. One potential use
for this will be in parsing large backup manifests associated with
incremental backups.

Because this parser is somewhat slower than the recursive descent
parser, it is not replacing that parser, but is an additional parser
available to callers.

For testing purposes, if the build is done with -DFORCE_JSON_PSTACK, all
JSON parsing is done with the non-recursive parser, in which case only
trivial regression differences in error messages should be observed.

Author: Andrew Dunstan
Reviewed-By: Jacob Champion

Discussion: https://postgr.es/m/7b0a51d6-0d9d-7366-3a1a-f74397a02f55@dunslane.net
  • Loading branch information
adunstan committed Apr 4, 2024
1 parent 585df02 commit 3311ea8
Show file tree
Hide file tree
Showing 16 changed files with 21,563 additions and 9 deletions.
954 changes: 945 additions & 9 deletions src/common/jsonapi.c

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions src/include/common/jsonapi.h
Expand Up @@ -36,6 +36,9 @@ typedef enum JsonTokenType
typedef enum JsonParseErrorType
{
JSON_SUCCESS,
JSON_INCOMPLETE,
JSON_INVALID_LEXER_TYPE,
JSON_NESTING_TOO_DEEP,
JSON_ESCAPING_INVALID,
JSON_ESCAPING_REQUIRED,
JSON_EXPECTED_ARRAY_FIRST,
Expand All @@ -57,6 +60,9 @@ typedef enum JsonParseErrorType
JSON_SEM_ACTION_FAILED, /* error should already be reported */
} JsonParseErrorType;

/* Parser state private to jsonapi.c */
typedef struct JsonParserStack JsonParserStack;
typedef struct JsonIncrementalState JsonIncrementalState;

/*
* All the fields in this structure should be treated as read-only.
Expand All @@ -71,6 +77,11 @@ typedef enum JsonParseErrorType
* AFTER the end of the token, i.e. where there would be a nul byte
* if we were using nul-terminated strings.
*
* The prev_token_terminator field should not be used when incremental is
* true, as the previous token might have started in a previous piece of input,
* and thus it can't be used in any pointer arithmetic or other operations in
* conjunction with token_start.
*
* JSONLEX_FREE_STRUCT/STRVAL are used to drive freeJsonLexContext.
*/
#define JSONLEX_FREE_STRUCT (1 << 0)
Expand All @@ -83,11 +94,14 @@ typedef struct JsonLexContext
char *token_start;
char *token_terminator;
char *prev_token_terminator;
bool incremental;
JsonTokenType token_type;
int lex_level;
bits32 flags;
int line_number; /* line number, starting from 1 */
char *line_start; /* where that line starts within input */
JsonParserStack *pstack;
JsonIncrementalState *inc_state;
StringInfo strval;
StringInfo errormsg;
} JsonLexContext;
Expand Down Expand Up @@ -141,6 +155,12 @@ typedef struct JsonSemAction
extern JsonParseErrorType pg_parse_json(JsonLexContext *lex,
JsonSemAction *sem);

extern JsonParseErrorType pg_parse_json_incremental(JsonLexContext *lex,
JsonSemAction *sem,
char *json,
int len,
bool is_last);

/* the null action object used for pure validation */
extern PGDLLIMPORT JsonSemAction nullSemAction;

Expand Down Expand Up @@ -176,6 +196,16 @@ extern JsonLexContext *makeJsonLexContextCstringLen(JsonLexContext *lex,
int len,
int encoding,
bool need_escapes);

/*
* make a JsonLexContext suitable for incremental parsing.
* the string chunks will be handed to pg_parse_json_incremental,
* so there's no need for them here.
*/
extern JsonLexContext *makeJsonLexContextIncremental(JsonLexContext *lex,
int encoding,
bool need_escapes);

extern void freeJsonLexContext(JsonLexContext *lex);

/* lex one token */
Expand Down
7 changes: 7 additions & 0 deletions src/include/pg_config_manual.h
Expand Up @@ -240,6 +240,13 @@
*------------------------------------------------------------------------
*/

/*
* Force use of the non-recursive JSON parser in all cases. This is useful
* to validate the working of the parser, and the regression tests should
* pass except for some different error messages about the stack limit.
*/
/* #define FORCE_JSON_PSTACK */

/*
* Include Valgrind "client requests", mostly in the memory allocator, so
* Valgrind understands PostgreSQL memory contexts. This permits detecting
Expand Down
1 change: 1 addition & 0 deletions src/test/modules/Makefile
Expand Up @@ -22,6 +22,7 @@ SUBDIRS = \
test_extensions \
test_ginpostinglist \
test_integerset \
test_json_parser \
test_lfind \
test_misc \
test_oat_hooks \
Expand Down
1 change: 1 addition & 0 deletions src/test/modules/meson.build
Expand Up @@ -21,6 +21,7 @@ subdir('test_dsm_registry')
subdir('test_extensions')
subdir('test_ginpostinglist')
subdir('test_integerset')
subdir('test_json_parser')
subdir('test_lfind')
subdir('test_misc')
subdir('test_oat_hooks')
Expand Down
36 changes: 36 additions & 0 deletions src/test/modules/test_json_parser/Makefile
@@ -0,0 +1,36 @@

# Builds the two standalone JSON parser test programs,
# test_json_parser_incremental and test_json_parser_perf, and provides a
# speed-check target that times both parsers.

PGFILEDESC = "standalone json parser tester"
PGAPPICON = win32

TAP_TESTS = 1

# One object file per test program.
OBJS = test_json_parser_incremental.o test_json_parser_perf.o

# Pull in the shared build rules: through pg_config when USE_PGXS is set,
# otherwise from the enclosing source tree.
ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = src/test/modules/test_json_parser
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

# The default target builds both programs.
all: test_json_parser_incremental$(X) test_json_parser_perf$(X)

# Each object depends on the same-named C file in this module's source
# directory.
%.o: $(top_srcdir)/$(subdir)/%.c

# Static libraries that both programs are linked against.
PARSER_LIBS = $(top_builddir)/src/common/libpgcommon.a $(top_builddir)/src/port/libpgport.a

# Link each program from its object file plus PARSER_LIBS.
test_json_parser_incremental$(X): test_json_parser_incremental.o $(PARSER_LIBS)
$(CC) $(CFLAGS) $^ -o $@

test_json_parser_perf$(X): test_json_parser_perf.o $(PARSER_LIBS)
$(CC) $(CFLAGS) $^ -o $@

# Time test_json_parser_perf on tiny.json with an argument of 10000, first
# without and then with the -i flag.
speed-check: test_json_parser_perf$(X)
@echo Standard parser:
time ./test_json_parser_perf 10000 $(top_srcdir)/$(subdir)/tiny.json
@echo Incremental parser:
time ./test_json_parser_perf -i 10000 $(top_srcdir)/$(subdir)/tiny.json
25 changes: 25 additions & 0 deletions src/test/modules/test_json_parser/README
@@ -0,0 +1,25 @@
Module `test_json_parser`
=========================

This module contains two programs for testing the json parsers.

- `test_json_parser_incremental` is for testing the incremental parser. It
reads in a file and passes it in very small chunks (60 bytes at a time) to
the incremental parser. It's not meant to be a speed test but to test the
accuracy of the incremental parser. It takes one argument: the name of the
input file.
- `test_json_parser_perf` is for speed testing both the standard
recursive descent parser and the non-recursive incremental
parser. If given the `-i` flag it uses the non-recursive parser,
otherwise the standard parser. The remaining arguments are the number of
parsing iterations and the file containing the input. Even when
using the non-recursive parser, the input is passed to the parser in a
single chunk. The results are thus comparable to those of the
standard parser.

The easiest way to use these is to run `make check` and `make speed-check`.

The sample input file is a small extract from a list of `delicious`
bookmarks taken some years ago, all wrapped in a single json
array. 10,000 iterations of parsing this file gives a reasonable
benchmark, and that is what the `speed-check` target does.
52 changes: 52 additions & 0 deletions src/test/modules/test_json_parser/meson.build
@@ -0,0 +1,52 @@
# Copyright (c) 2024, PostgreSQL Global Development Group

# Source list for the test_json_parser_incremental program.
test_json_parser_incremental_sources = files(
'test_json_parser_incremental.c',
)

# On Windows, also add the output of rc_bin_gen (fed win32ver_rc with this
# program's name and description) to the source list.
if host_system == 'windows'
test_json_parser_incremental_sources += rc_bin_gen.process(win32ver_rc, extra_args: [
'--NAME', 'test_json_parser_incremental',
'--FILEDESC', 'standalone json parser tester',
])
endif

# Build the program against frontend_code; it is not installed.
test_json_parser_incremental = executable('test_json_parser_incremental',
test_json_parser_incremental_sources,
dependencies: [frontend_code],
kwargs: default_bin_args + {
'install': false,
},
)

# Source list for the test_json_parser_perf program.
test_json_parser_perf_sources = files(
'test_json_parser_perf.c',
)

# Same Windows-only addition as above, for the perf program.
if host_system == 'windows'
test_json_parser_perf_sources += rc_bin_gen.process(win32ver_rc, extra_args: [
'--NAME', 'test_json_parser_perf',
'--FILEDESC', 'standalone json parser tester',
])
endif

# Build the program against frontend_code; it is not installed.
test_json_parser_perf = executable('test_json_parser_perf',
test_json_parser_perf_sources,
dependencies: [frontend_code],
kwargs: default_bin_args + {
'install': false,
},
)

# Register this module's three TAP test scripts.
tests += {
'name': 'test_json_parser',
'sd': meson.current_source_dir(),
'bd': meson.current_build_dir(),
'tap': {
'tests': [
't/001_test_json_parser_incremental.pl',
't/002_inline.pl',
't/003_test_semantic.pl'
],
},
}
@@ -0,0 +1,23 @@

use strict;
use warnings;

use PostgreSQL::Test::Utils;
use Test::More;
use FindBin;

use File::Temp qw(tempfile);

# Sample document, located one directory above this script.
my $test_file = "$FindBin::RealBin/../tiny.json";

my $exe = "test_json_parser_incremental";

# Run the test program on the sample once per chunk size, from 64 down to 1,
# expecting SUCCESS on stdout and nothing on stderr each time.
foreach my $size (reverse(1 .. 64))
{
	my @command = ($exe, "-c", $size, $test_file);
	my ($stdout, $stderr) = run_command(\@command);

	like($stdout, qr/SUCCESS/, "chunk size $size: test succeeds");
	is($stderr, "", "chunk size $size: no error output");
}

done_testing();
83 changes: 83 additions & 0 deletions src/test/modules/test_json_parser/t/002_inline.pl
@@ -0,0 +1,83 @@
use strict;
use warnings;

use PostgreSQL::Test::Utils;
use Test::More;

use File::Temp qw(tempfile);

# Run the incremental parser test program over one JSON document, once per
# chunk size from min(64, length of the document) down to 1.
#
#   $name    label used as the prefix of every test description
#   $json    document text, written verbatim to a scratch file
#   %params  optional; if 'error' is given it is a pattern that stderr must
#            match, and stdout must then not contain SUCCESS
#
# Without 'error', every run must print SUCCESS and leave stderr empty.
sub test
{
	# Report failures at the caller's line rather than inside this helper.
	local $Test::Builder::Level = $Test::Builder::Level + 1;

	my ($name, $json, %params) = @_;
	my $exe = "test_json_parser_incremental";

	# Largest chunk size to try: the whole document, capped at 64.
	my $chunk = length($json) > 64 ? 64 : length($json);

	# Scratch file holding the document; UNLINK => 1 asks File::Temp to
	# remove it when the script exits.
	my ($fh, $fname) = tempfile(UNLINK => 1);
	print $fh $json;
	close($fh);

	for (my $size = $chunk; $size >= 1; $size--)
	{
		my ($stdout, $stderr) = run_command([ $exe, "-c", $size, $fname ]);
		my $label = "$name, chunk size $size";

		unless (defined($params{error}))
		{
			like($stdout, qr/SUCCESS/, "$label: test succeeds");
			is($stderr, "", "$label: no error output");
			next;
		}

		unlike($stdout, qr/SUCCESS/, "$label: test fails");
		like($stderr, $params{error}, "$label: correct error output");
	}
}

# Documents the parser must accept. test() runs each one at every chunk
# size it tries and expects SUCCESS with no error output.
test("number", "12345");
test("string", '"hello"');
test("false", "false");
test("true", "true");
test("null", "null");
test("empty object", "{}");
test("empty array", "[]");
test("array with number", "[12345]");
test("array with numbers", "[12345,67890]");
test("array with null", "[null]");
test("array with string", '["hello"]');
test("array with boolean", '[false]');
test("single pair", '{"key": "value"}');
test("heavily nested array", "[" x 3200 . "]" x 3200);
test("serial escapes", '"\\\\\\\\\\\\\\\\"');
test("interrupted escapes", '"\\\\\\"\\\\\\\\\\"\\\\"');
test("whitespace", ' "" ');

# Documents the parser must reject. Each 'error' pattern has to match the
# program's stderr at every chunk size.
test("unclosed empty object", "{", error => qr/input string ended unexpectedly/);
test("bad key", "{{", error => qr/Expected string or "}", but found "\{"/);
# Named differently from the case above so a failure identifies its input.
test("bad key with closed inner object", "{{}", error => qr/Expected string or "}", but found "\{"/);
test("numeric key", "{1234: 2}", error => qr/Expected string or "}", but found "1234"/);
test("second numeric key", '{"a": "a", 1234: 2}', error => qr/Expected string, but found "1234"/);
test("unclosed object with pair", '{"key": "value"', error => qr/input string ended unexpectedly/);
test("missing key value", '{"key": }', error => qr/Expected JSON value, but found "}"/);
test("missing colon", '{"key" 12345}', error => qr/Expected ":", but found "12345"/);
test("missing comma", '{"key": 12345 12345}', error => qr/Expected "," or "}", but found "12345"/);
test("overnested array", "[" x 6401, error => qr/maximum permitted depth is 6400/);
test("overclosed array", "[]]", error => qr/Expected end of input, but found "]"/);
test("unexpected token in array", "[ }}} ]", error => qr/Expected array element or "]", but found "}"/);
# The pipe is escaped on purpose: a bare | is regex alternation, which would
# accept any message containing 'Token "' or '" is invalid'.
test("junk punctuation", "[ ||| ]", error => qr/Token "\|" is invalid/);
test("missing comma in array", "[123 123]", error => qr/Expected "," or "]", but found "123"/);
test("misspelled boolean", "tru", error => qr/Token "tru" is invalid/);
test("misspelled boolean in array", "[tru]", error => qr/Token "tru" is invalid/);
test("smashed top-level scalar", "12zz", error => qr/Token "12zz" is invalid/);
test("smashed scalar in array", "[12zz]", error => qr/Token "12zz" is invalid/);
test("unknown escape sequence", '"hello\vworld"', error => qr/Escape sequence "\\v" is invalid/);
test("unescaped control", "\"hello\tworld\"", error => qr/Character with value 0x09 must be escaped/);
test("incorrect escape count", '"\\\\\\\\\\\\\\"', error => qr/Token ""\\\\\\\\\\\\\\"" is invalid/);

done_testing();
36 changes: 36 additions & 0 deletions src/test/modules/test_json_parser/t/003_test_semantic.pl
@@ -0,0 +1,36 @@
use strict;
use warnings;

use PostgreSQL::Test::Utils;
use Test::More;
use FindBin;

use File::Temp qw(tempfile);

# Sample input and the expected output for it, both located one directory
# above this script.
my $test_file = "$FindBin::RealBin/../tiny.json";
my $test_out = "$FindBin::RealBin/../tiny.out";

my $exe = "test_json_parser_incremental";

# Run the test program with -s on the sample input.
my ($stdout, $stderr) = run_command( [$exe, "-s", $test_file] );

is($stderr, "", "no error output");

# Save the captured output to a scratch file so it can be diffed against the
# expected output. UNLINK => 1 asks File::Temp to remove the file when the
# script exits, so repeated runs do not leave files behind.
my ($fh, $fname) = tempfile(UNLINK => 1);

# A newline is written after the captured output; presumably run_command
# strips the trailing one -- confirm against PostgreSQL::Test::Utils.
# Write errors are fatal so they are not mistaken for a parser mismatch.
print {$fh} $stdout, "\n" or die "could not write to $fname: $!";

close($fh) or die "could not close $fname: $!";

# Compare using the external diff program; any output means a mismatch.
($stdout, $stderr) = run_command(["diff", "-u", $fname, $test_out]);

is($stdout, "", "no output diff");
is($stderr, "", "no diff error");

done_testing();






0 comments on commit 3311ea8

Please sign in to comment.