Skip to content

Commit

Permalink
Update @base64, utf8bytelength and fromjson to handle binary strings
Browse files Browse the repository at this point in the history
  • Loading branch information
Maxdamantus committed May 20, 2021
1 parent 0b2cff4 commit 26f3d81
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 21 deletions.
1 change: 0 additions & 1 deletion docs/content/manual/manual.yml
Expand Up @@ -1916,7 +1916,6 @@ sections:
* `@base64d`:
The inverse of `@base64`, input is decoded as specified by RFC 4648.
Note\: If the decoded string is not UTF-8, the results are undefined.
This syntax can be combined with string interpolation in a
useful way. You can follow a `@foo` token with a string
Expand Down
107 changes: 92 additions & 15 deletions src/builtin.c
Expand Up @@ -464,10 +464,55 @@ static jv f_dump(jq_state *jq, jv input) {
/*
 * fromjson: parse the input string as exactly one JSON value.
 *
 * The string is walked with jvp_utf8_extended_next() and re-encoded into a
 * small stack buffer that is fed, chunk by chunk, to an incremental
 * jv_parser. Code points reported in the range [-0xFF, -0x80] stand for a
 * single invalid UTF-8 byte (the negated byte value) and are copied through
 * as that raw byte; every other code point is written with
 * jvp_utf8_encode().
 *
 * Returns:
 *   - the parsed value when the text yields exactly one JSON value;
 *   - the parser's own invalid value when it carries an error message;
 *   - an "Unexpected extra JSON values" error when a second value appears;
 *   - an "Expected JSON value" error when nothing was produced at all;
 *   - type_error(...) when the input is not a string.
 *
 * `input` is released with jv_free() on every path below except the
 * type_error path, where it is handed to type_error() (presumably consumed
 * there -- confirm against its definition).
 */
static jv f_json_parse(jq_state *jq, jv input) {
  if (jv_get_kind(input) != JV_KIND_STRING)
    return type_error(input, "only strings can be parsed");

  /* Integer constant expressions, so `buf` below is a plain array, not a VLA. */
  enum { MAX_UTF8_LEN = 4, CHUNK_LEN = 100 };

  const char* i = jv_string_value(input);
  const char* end = i + jv_string_length_bytes(jv_copy(input));

  struct jv_parser* parser = jv_parser_new(0);
  int count = 0;              /* values AND errors seen so far */
  jv value = jv_invalid();
  while (i != NULL) {
    unsigned char buf[CHUNK_LEN + MAX_UTF8_LEN];
    int buflen = 0;
    int c;
    /* Stop filling while there is still room for one more full encoding. */
    while ((buflen + MAX_UTF8_LEN < (int)sizeof(buf)) &&
           (i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
      if (c >= -0xFF && c <= -0x80) {
        // Invalid UTF-8 byte, pass through
        buf[buflen++] = -c;
      } else {
        buflen += jvp_utf8_encode(c, buf + buflen);
      }
    }
    /* i != NULL means the inner loop stopped for space, so more input follows. */
    jv_parser_set_buf(parser, buf, buflen, i != NULL);
    for (;;) {
      jv next = jv_parser_next(parser);
      if (!jv_is_valid(next)) {
        if (jv_invalid_has_msg(jv_copy(next))) {
          /* Parser error: keep it as the result and stop reading input.
           * Counting it keeps the "Expected JSON value" fallback from
           * overwriting the more specific parser message. */
          count++;
          jv_free(value);
          value = next;
          i = NULL;
        }
        /* Invalid without a message: this chunk is drained. */
        break;
      }
      jv_free(value);
      if (count++ == 0) {
        value = next;
      } else {
        /* A second complete value: fromjson accepts only one. */
        jv_free(next);
        value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
        i = NULL;
        break;
      }
    }
  }
  jv_parser_free(parser);
  jv_free(input);
  if (count == 0) {
    jv_free(value);
    value = jv_invalid_with_msg(jv_string("Expected JSON value"));
  }
  return value;
}

static jv f_tonumber(jq_state *jq, jv input) {
Expand Down Expand Up @@ -514,7 +559,19 @@ static jv f_tostring(jq_state *jq, jv input) {
/*
 * utf8bytelength: number of bytes the input string occupies once written
 * out.
 *
 * The string is walked with jvp_utf8_extended_next(). A code point reported
 * in the range [-0xFF, -0x80] stands for one invalid UTF-8 byte that is
 * passed through unchanged on output, so it counts as a single byte; every
 * other code point contributes jvp_utf8_encode_length(c) bytes.
 *
 * Returns a jv number holding the total, or type_error(...) when the input
 * is not a string. `input` is released with jv_free() on the success path;
 * on the error path it is handed to type_error() (presumably consumed there
 * -- confirm against its definition).
 */
static jv f_utf8bytelength(jq_state *jq, jv input) {
  if (jv_get_kind(input) != JV_KIND_STRING)
    return type_error(input, "only strings have UTF-8 byte length");

  const char* i = jv_string_value(input);
  const char* end = i + jv_string_length_bytes(jv_copy(input));
  int len = 0;
  int c;
  while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_ERRORS_UTF8, &c))) {
    if (c >= -0xFF && c <= -0x80) {
      // Invalid UTF-8 byte, will be passed through
      len++;
    } else {
      len += jvp_utf8_encode_length(c);
    }
  }
  jv_free(input);
  return jv_number(len);
}

#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
Expand Down Expand Up @@ -689,21 +746,41 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
jv_free(fmt);
input = f_tostring(jq, input);
jv line = jv_string("");
const unsigned char* data = (const unsigned char*)jv_string_value(input);
int len = jv_string_length_bytes(jv_copy(input));
for (int i=0; i<len; i+=3) {
uint32_t code = 0;
int n = len - i >= 3 ? 3 : len-i;
for (int j=0; j<3; j++) {
const char* i = jv_string_value(input);
const char* end = i + jv_string_length_bytes(jv_copy(input));
uint32_t code = 0;
int n = 0;
int c;
while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_ERRORS_UTF8, &c))) {
unsigned char ubuf[4];
int len = 0;
if (c >= -0xFF && c <= -0x80) {
// Invalid UTF-8 byte, pass through
ubuf[len++] = -c;
} else
len += jvp_utf8_encode(c, ubuf);
for (int x = 0; x < len; x++) {
code <<= 8;
code |= j < n ? (unsigned)data[i+j] : 0;
code |= ubuf[x];
if (++n == 3) {
char buf[4];
for (int j = 0; j < 4; j++)
buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
line = jv_string_append_buf(line, buf, sizeof(buf));
n = 0;
code = 0;
}
}
}
if (n > 0) {
assert(n < 3);
code <<= 8*(3 - n);
char buf[4];
for (int j=0; j<4; j++) {
for (int j = 0; j < 4; j++)
buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
}
if (n < 3) buf[3] = '=';
if (n < 2) buf[2] = '=';
buf[3] = '=';
if (n < 2)
buf[2] = '=';
line = jv_string_append_buf(line, buf, sizeof(buf));
}
jv_free(input);
Expand Down
5 changes: 5 additions & 0 deletions tests/base64.test
Expand Up @@ -33,3 +33,8 @@
. | try @base64d catch .
"QUJDa"
"string (\"QUJDa\") trailing base64 byte found"

# random binary data
(. | @base64d | @base64) == .
"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q=="
true
19 changes: 14 additions & 5 deletions tests/shtest
Expand Up @@ -123,11 +123,20 @@ cmp $d/out $d/expected


clean=false
# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings
# The whole group runs only if dd managed to produce the random input file.
if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then
  # Raw bytes -> one JSON string -> raw bytes again.
  $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
  $VALGRIND $Q $JQ -j . $d/out.json >$d/out
  cmp $d/out $d/rand
  # Same JSON text, but decoded through the fromjson builtin.
  $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out
  cmp $d/out $d/rand
  # Round-trip through @base64 / @base64d inside jq.
  $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out
  cmp $d/out $d/rand
  # Decode externally produced base64 line by line, then concatenate the pieces.
  base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out
  cmp $d/out $d/rand
  # Load the raw file through --rawfile and emit it unchanged.
  $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out
  cmp $d/out $d/rand
fi
clean=true


Expand Down

0 comments on commit 26f3d81

Please sign in to comment.