From 1336ada66b4e4f64c359028716a43e006818d1db Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 14:12:01 +0000 Subject: [PATCH 01/12] Add b64decode and some test coverage (not full) --- mypyc/lib-rt/librt_base64.c | 85 +++++++++++++++++++++++++++++++++ mypyc/test-data/run-base64.test | 31 +++++++++++- 2 files changed, 115 insertions(+), 1 deletion(-) diff --git a/mypyc/lib-rt/librt_base64.c b/mypyc/lib-rt/librt_base64.c index 020a56e412f4..326303dca2a7 100644 --- a/mypyc/lib-rt/librt_base64.c +++ b/mypyc/lib-rt/librt_base64.c @@ -63,11 +63,96 @@ b64encode(PyObject *self, PyObject *const *args, size_t nargs) { return b64encode_internal(args[0]); } +static PyObject * +b64decode_internal(PyObject *arg) { + // Expect a bytes object + if (!PyBytes_Check(arg)) { + PyErr_SetString(PyExc_TypeError, "b64decode expects a 'bytes' object"); + return NULL; + } + + // Get input pointer and length + const char *src = PyBytes_AS_STRING(arg); // returns char*, safe to treat as const + Py_ssize_t srclen_ssz = PyBytes_GET_SIZE(arg); + + // Fast-path: empty input + if (srclen_ssz == 0) { + return PyBytes_FromStringAndSize(NULL, 0); + } + + // Compute an output capacity that's at least 3/4 of input, without overflow: + // ceil(3/4 * N) == N - floor(N/4) + size_t srclen = (size_t)srclen_ssz; + size_t max_out = srclen - (srclen / 4); + if (max_out == 0) { + max_out = 1; // defensive (srclen > 0 implies >= 1 anyway) + } + if (max_out > (size_t)PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_OverflowError, "input too large"); + return NULL; + } + + // Allocate output bytes (uninitialized) of the max capacity + PyObject *out_bytes = PyBytes_FromStringAndSize(NULL, (Py_ssize_t)max_out); + if (out_bytes == NULL) { + return NULL; // Propagate memory error + } + + char *outbuf = PyBytes_AS_STRING(out_bytes); + size_t outlen = max_out; + + // Decode (flags = 0 for plain input) + int ret = base64_decode(src, srclen, outbuf, &outlen, 0); + + if (ret != 1) { + Py_DECREF(out_bytes); + 
if (ret == 0) { + PyErr_SetString(PyExc_ValueError, "invalid base64 input"); + } else if (ret == -1) { + PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build"); + } else { + PyErr_SetString(PyExc_RuntimeError, "base64_decode failed"); + } + return NULL; + } + + // Sanity-check contract (decoder must not overflow our buffer) + if (outlen > max_out) { + Py_DECREF(out_bytes); + PyErr_SetString(PyExc_RuntimeError, "decoder wrote past output buffer"); + return NULL; + } + +#ifndef Py_LIMITED_API + // Shrink in place to the actual decoded length + if (_PyBytes_Resize(&out_bytes, (Py_ssize_t)outlen) < 0) { + // _PyBytes_Resize sets an exception and may free the old object + return NULL; + } + return out_bytes; +#else + // PEP 384 limited-API fallback: copy into a right-sized bytes object + PyObject *res = PyBytes_FromStringAndSize(outbuf, (Py_ssize_t)outlen); + Py_DECREF(out_bytes); + return res; // may be NULL if allocation failed (exception set) +#endif +} + +static PyObject* +b64decode(PyObject *self, PyObject *const *args, size_t nargs) { + if (nargs != 1) { + PyErr_SetString(PyExc_TypeError, "b64decode() takes exactly one argument"); + return 0; + } + return b64decode_internal(args[0]); +} + #endif static PyMethodDef librt_base64_module_methods[] = { #ifdef MYPYC_EXPERIMENTAL {"b64encode", (PyCFunction)b64encode, METH_FASTCALL, PyDoc_STR("Encode bytes-like object using Base64.")}, + {"b64decode", (PyCFunction)b64decode, METH_FASTCALL, PyDoc_STR("Decode bytes-like object using Base64.")}, #endif {NULL, NULL, 0, NULL} }; diff --git a/mypyc/test-data/run-base64.test b/mypyc/test-data/run-base64.test index 0f9151c2b00b..c62950a0ed04 100644 --- a/mypyc/test-data/run-base64.test +++ b/mypyc/test-data/run-base64.test @@ -2,7 +2,7 @@ from typing import Any import base64 -from librt.base64 import b64encode +from librt.base64 import b64encode, b64decode from testutil import assertRaises @@ -44,6 +44,35 @@ def test_encode_wrapper() -> None: with 
assertRaises(TypeError): enc(b"x", b"y") +def test_decode_basic() -> None: + assert b64decode(b"eA==") == b"x" + + with assertRaises(TypeError): + b64decode(bytearray(b"eA==")) + +def check_decode(b: bytes) -> None: + enc = b64encode(b) + assert b64decode(enc) == getattr(base64, "b64decode")(enc) + +def test_decode_different_strings() -> None: + for i in range(256): + check_decode(bytes([i])) + check_decode(bytes([i]) + b"x") + check_decode(bytes([i]) + b"xy") + check_decode(bytes([i]) + b"xyz") + check_decode(bytes([i]) + b"xyza") + check_decode(b"x" + bytes([i])) + check_decode(b"xy" + bytes([i])) + check_decode(b"xyz" + bytes([i])) + check_decode(b"xyza" + bytes([i])) + + b = b"a\x00\xb7" * 1000 + for i in range(1000): + check_decode(b[:i]) + + for b in b"", b"ab", b"bac", b"1234", b"xyz88", b"abc" * 200: + check_decode(b) + [case testBase64FeaturesNotAvailableInNonExperimentalBuild_librt_base64] # This also ensures librt.base64 can be built without experimental features import librt.base64 From b7643e60f7d61cf1e257a1ac0f5c7521e899b543 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 14:16:13 +0000 Subject: [PATCH 02/12] Update stub --- mypy/typeshed/stubs/librt/librt/base64.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/mypy/typeshed/stubs/librt/librt/base64.pyi b/mypy/typeshed/stubs/librt/librt/base64.pyi index 36366f5754ce..26524add3cea 100644 --- a/mypy/typeshed/stubs/librt/librt/base64.pyi +++ b/mypy/typeshed/stubs/librt/librt/base64.pyi @@ -1 +1,2 @@ def b64encode(s: bytes) -> bytes: ... +def b64decode(s: bytes) -> bytes: ... 
From e4501f9dc384c4e046bd54aa4b93f6f184399319 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 15:34:54 +0000 Subject: [PATCH 03/12] Improve tests --- mypyc/lib-rt/librt_base64.c | 2 +- mypyc/test-data/run-base64.test | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/mypyc/lib-rt/librt_base64.c b/mypyc/lib-rt/librt_base64.c index 326303dca2a7..1c4dfeb0fc00 100644 --- a/mypyc/lib-rt/librt_base64.c +++ b/mypyc/lib-rt/librt_base64.c @@ -107,7 +107,7 @@ b64decode_internal(PyObject *arg) { if (ret != 1) { Py_DECREF(out_bytes); if (ret == 0) { - PyErr_SetString(PyExc_ValueError, "invalid base64 input"); + PyErr_SetString(PyExc_ValueError, "Only base64 data is allowed"); } else if (ret == -1) { PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build"); } else { diff --git a/mypyc/test-data/run-base64.test b/mypyc/test-data/run-base64.test index c62950a0ed04..7ab0e0c32538 100644 --- a/mypyc/test-data/run-base64.test +++ b/mypyc/test-data/run-base64.test @@ -50,8 +50,11 @@ def test_decode_basic() -> None: with assertRaises(TypeError): b64decode(bytearray(b"eA==")) -def check_decode(b: bytes) -> None: - enc = b64encode(b) +def check_decode(b: bytes, encoded: bool = False) -> None: + if encoded: + enc = b + else: + enc = b64encode(b) assert b64decode(enc) == getattr(base64, "b64decode")(enc) def test_decode_different_strings() -> None: @@ -73,6 +76,16 @@ def test_decode_different_strings() -> None: for b in b"", b"ab", b"bac", b"1234", b"xyz88", b"abc" * 200: check_decode(b) +def test_decode_wrapper() -> None: + dec: Any = b64decode + assert dec(b"eA==") == b"x" + + with assertRaises(TypeError): + dec() + + with assertRaises(TypeError): + dec(b"x", b"y") + [case testBase64FeaturesNotAvailableInNonExperimentalBuild_librt_base64] # This also ensures librt.base64 can be built without experimental features import librt.base64 From 3655416499bbe59b501e803320101db494a86d99 Mon Sep 17 00:00:00 2001 
From: Jukka Lehtosalo Date: Tue, 18 Nov 2025 15:56:36 +0000 Subject: [PATCH 04/12] Accept ascii str arguments --- mypyc/lib-rt/librt_base64.c | 25 ++++++++++++++++++------- mypyc/test-data/run-base64.test | 6 ++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/mypyc/lib-rt/librt_base64.c b/mypyc/lib-rt/librt_base64.c index 1c4dfeb0fc00..94a8e89eca03 100644 --- a/mypyc/lib-rt/librt_base64.c +++ b/mypyc/lib-rt/librt_base64.c @@ -65,15 +65,26 @@ b64encode(PyObject *self, PyObject *const *args, size_t nargs) { static PyObject * b64decode_internal(PyObject *arg) { - // Expect a bytes object - if (!PyBytes_Check(arg)) { - PyErr_SetString(PyExc_TypeError, "b64decode expects a 'bytes' object"); - return NULL; - } + const char *src; + Py_ssize_t srclen_ssz; // Get input pointer and length - const char *src = PyBytes_AS_STRING(arg); // returns char*, safe to treat as const - Py_ssize_t srclen_ssz = PyBytes_GET_SIZE(arg); + if (PyBytes_Check(arg)) { + src = PyBytes_AS_STRING(arg); + srclen_ssz = PyBytes_GET_SIZE(arg); + } else if (PyUnicode_Check(arg)) { + if (!PyUnicode_IS_ASCII(arg)) { + PyErr_SetString(PyExc_ValueError, + "string argument should contain only ASCII characters"); + return NULL; + } + src = (const char *)PyUnicode_1BYTE_DATA(arg); + srclen_ssz = PyUnicode_GET_LENGTH(arg); + } else { + PyErr_SetString(PyExc_TypeError, + "argument should be a bytes-like object or ASCII string"); + return NULL; + } // Fast-path: empty input if (srclen_ssz == 0) { diff --git a/mypyc/test-data/run-base64.test b/mypyc/test-data/run-base64.test index 7ab0e0c32538..3a0973d3aba8 100644 --- a/mypyc/test-data/run-base64.test +++ b/mypyc/test-data/run-base64.test @@ -50,12 +50,18 @@ def test_decode_basic() -> None: with assertRaises(TypeError): b64decode(bytearray(b"eA==")) + for non_ascii in "\x80", "foo\u100bar", "foo\ua1234bar": + with assertRaises(ValueError): + b64decode(non_ascii) + def check_decode(b: bytes, encoded: bool = False) -> None: if encoded: enc = b 
else: enc = b64encode(b) assert b64decode(enc) == getattr(base64, "b64decode")(enc) + enc_str = enc.decode("ascii") + assert b64decode(enc_str) == getattr(base64, "b64decode")(enc_str) def test_decode_different_strings() -> None: for i in range(256): From aeca40a2bafdee660d3628fe0470aa913618eaa0 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Tue, 18 Nov 2025 16:23:11 +0000 Subject: [PATCH 05/12] Update b64decode stub --- mypy/typeshed/stubs/librt/librt/base64.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypy/typeshed/stubs/librt/librt/base64.pyi b/mypy/typeshed/stubs/librt/librt/base64.pyi index 26524add3cea..1cea838505d6 100644 --- a/mypy/typeshed/stubs/librt/librt/base64.pyi +++ b/mypy/typeshed/stubs/librt/librt/base64.pyi @@ -1,2 +1,2 @@ def b64encode(s: bytes) -> bytes: ... -def b64decode(s: bytes) -> bytes: ... +def b64decode(s: bytes | str) -> bytes: ... From 836c5f3255d59e09d8cc41fc437f1dc59d5b4ecf Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Tue, 18 Nov 2025 16:49:40 +0000 Subject: [PATCH 06/12] Filter out invalid base64 characters --- mypyc/lib-rt/librt_base64.c | 62 ++++++++++++++++++++++++++++++--- mypyc/test-data/run-base64.test | 11 ++++++ 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/mypyc/lib-rt/librt_base64.c b/mypyc/lib-rt/librt_base64.c index 94a8e89eca03..e94d46f214d2 100644 --- a/mypyc/lib-rt/librt_base64.c +++ b/mypyc/lib-rt/librt_base64.c @@ -6,6 +6,9 @@ #ifdef MYPYC_EXPERIMENTAL +static PyObject * +b64decode_handle_invalid(PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen); + #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2) #define STACK_BUFFER_SIZE 1024 @@ -63,6 +66,12 @@ b64encode(PyObject *self, PyObject *const *args, size_t nargs) { return b64encode_internal(args[0]); } +static inline int +is_valid_base64_char(char c) { + return ((c >= 'A' && c <= 'Z') | (c >= 'a' && c <= 'z') | + (c >= '0' && c <= '9') | (c == '+') | (c == '/') | (c == '=')); +} + 
static PyObject * b64decode_internal(PyObject *arg) { const char *src; @@ -91,6 +100,12 @@ b64decode_internal(PyObject *arg) { return PyBytes_FromStringAndSize(NULL, 0); } + // Quickly ignore invalid characters at the end. Other invalid characters + // are also accepted, but they need a slow path. + while (srclen_ssz > 0 && !is_valid_base64_char(src[srclen_ssz - 1])) { + srclen_ssz--; + } + // Compute an output capacity that's at least 3/4 of input, without overflow: // ceil(3/4 * N) == N - floor(N/4) size_t srclen = (size_t)srclen_ssz; @@ -112,14 +127,14 @@ b64decode_internal(PyObject *arg) { char *outbuf = PyBytes_AS_STRING(out_bytes); size_t outlen = max_out; - // Decode (flags = 0 for plain input) int ret = base64_decode(src, srclen, outbuf, &outlen, 0); if (ret != 1) { - Py_DECREF(out_bytes); if (ret == 0) { - PyErr_SetString(PyExc_ValueError, "Only base64 data is allowed"); - } else if (ret == -1) { + return b64decode_handle_invalid(out_bytes, outbuf, max_out, src, srclen); + } + Py_DECREF(out_bytes); + if (ret == -1) { PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build"); } else { PyErr_SetString(PyExc_RuntimeError, "base64_decode failed"); @@ -149,6 +164,45 @@ b64decode_internal(PyObject *arg) { #endif } +static PyObject * +b64decode_handle_invalid(PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen) +{ + size_t i; + char *newbuf = PyMem_Malloc(srclen); + size_t newbuf_len = 0; + for (i = 0; i < srclen; i++) { + char c = src[i]; + if (is_valid_base64_char(c)) { + newbuf[newbuf_len++] = c; + } + } + + size_t outlen = max_out; + int ret = base64_decode(newbuf, newbuf_len, outbuf, &outlen, 0); + PyMem_Free(newbuf); + + if (ret != 1) { + Py_DECREF(out_bytes); + if (ret == 0) { + PyErr_SetString(PyExc_ValueError, "Only base64 data is allowed"); + } + else if (ret == -1) { + PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build"); + } else { + 
PyErr_SetString(PyExc_RuntimeError, "base64_decode failed"); + } + return NULL; + } + + // Shrink in place to the actual decoded length + if (_PyBytes_Resize(&out_bytes, (Py_ssize_t)outlen) < 0) { + // _PyBytes_Resize sets an exception and may free the old object + return NULL; + } + return out_bytes; +} + + static PyObject* b64decode(PyObject *self, PyObject *const *args, size_t nargs) { if (nargs != 1) { diff --git a/mypyc/test-data/run-base64.test b/mypyc/test-data/run-base64.test index 3a0973d3aba8..62e6edd39f59 100644 --- a/mypyc/test-data/run-base64.test +++ b/mypyc/test-data/run-base64.test @@ -82,6 +82,17 @@ def test_decode_different_strings() -> None: for b in b"", b"ab", b"bac", b"1234", b"xyz88", b"abc" * 200: check_decode(b) +def test_decode_with_non_base64_chars() -> None: + # For stdlib compatibility, non-base64 characters should be ignored. + + # Invalid characters as a suffix use a fast path. + check_decode(b"eA== ", encoded=True) + check_decode(b"eA==\n", encoded=True) + check_decode(b"eA== \t\n", encoded=True) + check_decode(b"\n", encoded=True) + + check_decode(b" e A = = ", encoded=True) + def test_decode_wrapper() -> None: dec: Any = b64decode assert dec(b"eA==") == b"x" From cc45fdea47cc345834ea51c3130ad5e0d60f10f5 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 19 Nov 2025 14:25:11 +0000 Subject: [PATCH 07/12] Check for invalid padding --- mypyc/lib-rt/librt_base64.c | 36 +++++++++++++++++++++++++++++---- mypyc/test-data/run-base64.test | 34 +++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/mypyc/lib-rt/librt_base64.c b/mypyc/lib-rt/librt_base64.c index e94d46f214d2..2230ca917d7d 100644 --- a/mypyc/lib-rt/librt_base64.c +++ b/mypyc/lib-rt/librt_base64.c @@ -1,5 +1,6 @@ #define PY_SSIZE_T_CLEAN #include +#include #include "librt_base64.h" #include "libbase64.h" #include "pythoncapi_compat.h" @@ -67,9 +68,9 @@ b64encode(PyObject *self, PyObject *const *args, size_t nargs) { } static inline 
int -is_valid_base64_char(char c) { +is_valid_base64_char(char c, bool allow_padding) { return ((c >= 'A' && c <= 'Z') | (c >= 'a' && c <= 'z') | - (c >= '0' && c <= '9') | (c == '+') | (c == '/') | (c == '=')); + (c >= '0' && c <= '9') | (c == '+') | (c == '/') | (allow_padding && c == '=')); } static PyObject * @@ -102,7 +103,7 @@ b64decode_internal(PyObject *arg) { // Quickly ignore invalid characters at the end. Other invalid characters // are also accepted, but they need a slow path. - while (srclen_ssz > 0 && !is_valid_base64_char(src[srclen_ssz - 1])) { + while (srclen_ssz > 0 && !is_valid_base64_char(src[srclen_ssz - 1], true)) { srclen_ssz--; } @@ -172,11 +173,38 @@ b64decode_handle_invalid(PyObject *out_bytes, char *outbuf, size_t max_out, cons size_t newbuf_len = 0; for (i = 0; i < srclen; i++) { char c = src[i]; - if (is_valid_base64_char(c)) { + if (is_valid_base64_char(c, false)) { newbuf[newbuf_len++] = c; + } else if (c == '=') { + // Copy necessary amount of padding + int remainder = newbuf_len % 4; + if (remainder == 0) { + // No padding needed -- ignore padding + break; + } + int numpad = 4 - remainder; + // Check that there is at least the required amount padding (CPython ignores + // extra padding) + while (numpad > 0) { + if (i == srclen || src[i] != '=') { + break; + } + newbuf[newbuf_len++] = '='; + i++; + numpad--; + while (i < srclen && !is_valid_base64_char(src[i], true)) { + i++; + } + } + break; } } + if (newbuf_len % 4 != 0) { + PyErr_SetString(PyExc_ValueError, "Incorrect padding"); + return NULL; + } + size_t outlen = max_out; int ret = base64_decode(newbuf, newbuf_len, outbuf, &outlen, 0); PyMem_Free(newbuf); diff --git a/mypyc/test-data/run-base64.test b/mypyc/test-data/run-base64.test index 62e6edd39f59..e02eb521e737 100644 --- a/mypyc/test-data/run-base64.test +++ b/mypyc/test-data/run-base64.test @@ -1,6 +1,7 @@ [case testAllBase64Features_librt_experimental] from typing import Any import base64 +import binascii from 
librt.base64 import b64encode, b64decode @@ -93,6 +94,39 @@ def test_decode_with_non_base64_chars() -> None: check_decode(b" e A = = ", encoded=True) + # Special case: Two different encodings of the same data + check_decode(b"eAa=", encoded=True) + check_decode(b"eAY=", encoded=True) + +def check_decode_error(b: bytes, ignore_stdlib: bool = False) -> None: + if not ignore_stdlib: + with assertRaises(binascii.Error): + getattr(base64, "b64decode")(b) + + # The raised error is different, since librt shouldn't depend on binascii + with assertRaises(ValueError): + b64decode(b) + +def test_decode_with_invalid_padding() -> None: + check_decode_error(b"eA") + check_decode_error(b"eA=") + check_decode_error(b"eHk") + check_decode_error(b"eA = ") + + # Here stdlib behavior seems nonsensical, so we don't try to duplicate it + check_decode_error(b"eA=a=", ignore_stdlib=True) + +def test_decode_with_extra_data_after_padding() -> None: + check_decode(b"=", encoded=True) + check_decode(b"==", encoded=True) + check_decode(b"===", encoded=True) + check_decode(b"====", encoded=True) + check_decode(b"eA===", encoded=True) + check_decode(b"eHk==", encoded=True) + check_decode(b"eA==x", encoded=True) + check_decode(b"eHk=x", encoded=True) + check_decode(b"eA==abc=======efg", encoded=True) + def test_decode_wrapper() -> None: dec: Any = b64decode assert dec(b"eA==") == b"x" From 62666f43242025c979bb6400ca26388972d14155 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 19 Nov 2025 15:23:03 +0000 Subject: [PATCH 08/12] Test more --- mypyc/test-data/run-base64.test | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/mypyc/test-data/run-base64.test b/mypyc/test-data/run-base64.test index e02eb521e737..8d7eb7c13482 100644 --- a/mypyc/test-data/run-base64.test +++ b/mypyc/test-data/run-base64.test @@ -61,8 +61,9 @@ def check_decode(b: bytes, encoded: bool = False) -> None: else: enc = b64encode(b) assert b64decode(enc) == getattr(base64, 
"b64decode")(enc) - enc_str = enc.decode("ascii") - assert b64decode(enc_str) == getattr(base64, "b64decode")(enc_str) + if getattr(enc, "isascii")(): # Test stub has no "isascii" + enc_str = enc.decode("ascii") + assert b64decode(enc_str) == getattr(base64, "b64decode")(enc_str) def test_decode_different_strings() -> None: for i in range(256): @@ -83,6 +84,10 @@ def test_decode_different_strings() -> None: for b in b"", b"ab", b"bac", b"1234", b"xyz88", b"abc" * 200: check_decode(b) +def is_base64_char(x: int) -> bool: + c = chr(x) + return ('a' <= c <= 'z') or ('A' <= c <= 'Z') or ('0' <= c <= '9') or c in '+/=' + def test_decode_with_non_base64_chars() -> None: # For stdlib compatibility, non-base64 characters should be ignored. @@ -98,6 +103,14 @@ def test_decode_with_non_base64_chars() -> None: check_decode(b"eAa=", encoded=True) check_decode(b"eAY=", encoded=True) + for x in range(256): + if not is_base64_char(x): + b = bytes([x]) + check_decode(b, encoded=True) + check_decode(b"eA==" + b, encoded=True) + check_decode(b"e" + b + b"A==", encoded=True) + check_decode(b"eA=" + b + b"=", encoded=True) + def check_decode_error(b: bytes, ignore_stdlib: bool = False) -> None: if not ignore_stdlib: with assertRaises(binascii.Error): From 65fd3054e9f7f806803009ba8f939efe9a0829b2 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 19 Nov 2025 15:31:06 +0000 Subject: [PATCH 09/12] Clean up implementation --- mypyc/lib-rt/librt_base64.c | 39 +++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/mypyc/lib-rt/librt_base64.c b/mypyc/lib-rt/librt_base64.c index 2230ca917d7d..f64258dcf476 100644 --- a/mypyc/lib-rt/librt_base64.c +++ b/mypyc/lib-rt/librt_base64.c @@ -8,7 +8,8 @@ #ifdef MYPYC_EXPERIMENTAL static PyObject * -b64decode_handle_invalid(PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen); +b64decode_handle_invalid_input( + PyObject *out_bytes, char *outbuf, size_t max_out, const char 
*src, size_t srclen); #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2) @@ -132,7 +133,8 @@ b64decode_internal(PyObject *arg) { if (ret != 1) { if (ret == 0) { - return b64decode_handle_invalid(out_bytes, outbuf, max_out, src, srclen); + // Slow path: handle non-base64 input + return b64decode_handle_invalid_input(out_bytes, outbuf, max_out, src, srclen); } Py_DECREF(out_bytes); if (ret == -1) { @@ -150,36 +152,39 @@ b64decode_internal(PyObject *arg) { return NULL; } -#ifndef Py_LIMITED_API // Shrink in place to the actual decoded length if (_PyBytes_Resize(&out_bytes, (Py_ssize_t)outlen) < 0) { // _PyBytes_Resize sets an exception and may free the old object return NULL; } return out_bytes; -#else - // PEP 384 limited-API fallback: copy into a right-sized bytes object - PyObject *res = PyBytes_FromStringAndSize(outbuf, (Py_ssize_t)outlen); - Py_DECREF(out_bytes); - return res; // may be NULL if allocation failed (exception set) -#endif } +// Process non-base64 input by ignoring non-base64 characters, for compatiblity +// with stdlib b64decode. 
static PyObject * -b64decode_handle_invalid(PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen) +b64decode_handle_invalid_input( + PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen) { - size_t i; - char *newbuf = PyMem_Malloc(srclen); + // Copy input to a temporary buffer, with non-base64 characters and extra suffix + // characters removed size_t newbuf_len = 0; - for (i = 0; i < srclen; i++) { + char *newbuf = PyMem_Malloc(srclen); + if (newbuf == NULL) { + Py_DECREF(out_bytes); + return PyErr_NoMemory(); + } + + // Copy base64 characters and some padding to the new buffer + for (size_t i = 0; i < srclen; i++) { char c = src[i]; if (is_valid_base64_char(c, false)) { newbuf[newbuf_len++] = c; } else if (c == '=') { - // Copy necessary amount of padding + // Copy a necessary amount of padding int remainder = newbuf_len % 4; if (remainder == 0) { - // No padding needed -- ignore padding + // No padding needed break; } int numpad = 4 - remainder; @@ -192,6 +197,7 @@ b64decode_handle_invalid(PyObject *out_bytes, char *outbuf, size_t max_out, cons newbuf[newbuf_len++] = '='; i++; numpad--; + // Skip non-base64 alphabet characters within padding while (i < srclen && !is_valid_base64_char(src[i], true)) { i++; } @@ -200,7 +206,10 @@ b64decode_handle_invalid(PyObject *out_bytes, char *outbuf, size_t max_out, cons } } + // Stdlib always performs a non-strict padding check if (newbuf_len % 4 != 0) { + Py_DECREF(out_bytes); + PyMem_Free(newbuf); PyErr_SetString(PyExc_ValueError, "Incorrect padding"); return NULL; } From 6349a39880358c543088350af8745f1ebd6c376f Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 19 Nov 2025 15:54:29 +0000 Subject: [PATCH 10/12] Update docstrings --- mypyc/lib-rt/librt_base64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mypyc/lib-rt/librt_base64.c b/mypyc/lib-rt/librt_base64.c index f64258dcf476..18011cee82fe 100644 --- 
a/mypyc/lib-rt/librt_base64.c +++ b/mypyc/lib-rt/librt_base64.c @@ -253,8 +253,8 @@ b64decode(PyObject *self, PyObject *const *args, size_t nargs) { static PyMethodDef librt_base64_module_methods[] = { #ifdef MYPYC_EXPERIMENTAL - {"b64encode", (PyCFunction)b64encode, METH_FASTCALL, PyDoc_STR("Encode bytes-like object using Base64.")}, - {"b64decode", (PyCFunction)b64decode, METH_FASTCALL, PyDoc_STR("Decode bytes-like object using Base64.")}, + {"b64encode", (PyCFunction)b64encode, METH_FASTCALL, PyDoc_STR("Encode bytes object using Base64.")}, + {"b64decode", (PyCFunction)b64decode, METH_FASTCALL, PyDoc_STR("Decode a Base64 encoded bytes object or ASCII string.")}, #endif {NULL, NULL, 0, NULL} }; @@ -298,7 +298,7 @@ static PyModuleDef_Slot librt_base64_module_slots[] = { static PyModuleDef librt_base64_module = { .m_base = PyModuleDef_HEAD_INIT, .m_name = "base64", - .m_doc = "base64 encoding and decoding optimized for mypyc", + .m_doc = "Fast base64 encoding and decoding optimized for mypyc", .m_size = 0, .m_methods = librt_base64_module_methods, .m_slots = librt_base64_module_slots, From 3d091041b5fae707279d668cbcd6d86ddb39ee4d Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 19 Nov 2025 16:11:31 +0000 Subject: [PATCH 11/12] Fix typo --- mypyc/lib-rt/librt_base64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypyc/lib-rt/librt_base64.c b/mypyc/lib-rt/librt_base64.c index 18011cee82fe..4bf98ccb41da 100644 --- a/mypyc/lib-rt/librt_base64.c +++ b/mypyc/lib-rt/librt_base64.c @@ -160,7 +160,7 @@ b64decode_internal(PyObject *arg) { return out_bytes; } -// Process non-base64 input by ignoring non-base64 characters, for compatiblity +// Process non-base64 input by ignoring non-base64 characters, for compatibility // with stdlib b64decode. 
static PyObject * b64decode_handle_invalid_input( From ca42d33eb2a5dd3889461749702db5069db424e5 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 19 Nov 2025 16:42:49 +0000 Subject: [PATCH 12/12] Refactor function based on review comment --- mypyc/lib-rt/librt_base64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mypyc/lib-rt/librt_base64.c b/mypyc/lib-rt/librt_base64.c index 4bf98ccb41da..1720359ef9a6 100644 --- a/mypyc/lib-rt/librt_base64.c +++ b/mypyc/lib-rt/librt_base64.c @@ -70,8 +70,8 @@ b64encode(PyObject *self, PyObject *const *args, size_t nargs) { static inline int is_valid_base64_char(char c, bool allow_padding) { - return ((c >= 'A' && c <= 'Z') | (c >= 'a' && c <= 'z') | - (c >= '0' && c <= '9') | (c == '+') | (c == '/') | (allow_padding && c == '=')); + return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || (c == '+') || (c == '/') || (allow_padding && c == '=')); } static PyObject *