From dd8fc07d277bd38b1c4ad031a8e69db39b8f9a8e Mon Sep 17 00:00:00 2001
From: p1c2u <maciag.artur@gmail.com>
Date: Sat, 16 May 2026 17:43:50 +0100
Subject: [PATCH] Fix multipart binary composed-schema matching

---
 openapi_core/validation/schemas/validators.py | 158 ++++++++++++++++-
 .../test_request_unmarshaller.py              | 159 ++++++++++++++++-
 .../test_media_types_deserializers.py         |  12 --
 .../unit/validation/test_schema_validators.py | 163 ++++++++++++++++++
 4 files changed, 471 insertions(+), 21 deletions(-)

diff --git a/openapi_core/validation/schemas/validators.py b/openapi_core/validation/schemas/validators.py
index c7a8f03b..4a504781 100644
--- a/openapi_core/validation/schemas/validators.py
+++ b/openapi_core/validation/schemas/validators.py
@@ -22,6 +22,12 @@
 from openapi_core.validation.schemas.exceptions import InvalidSchemaValue
 from openapi_core.validation.schemas.exceptions import ValidateError
 
+# OpenAPI ``format`` values for which a ``type: string`` schema may be
+# given a raw ``bytes`` payload -- ``binary`` for opaque file bodies
+# (multipart/form-data, application/octet-stream) and ``byte`` for
+# base64 content that callers may still hand in as ``bytes``.
+_BINARY_STRING_FORMATS = frozenset({"binary", "byte"})
+
 if TYPE_CHECKING:
     from openapi_core.casting.schemas.casters import SchemaCaster
 
@@ -41,12 +47,156 @@ def __contains__(self, schema_format: str) -> bool:
         return schema_format in self.validator.format_checker.checkers
 
     def validate(self, value: Any) -> None:
-        errors_iter = self.validator.iter_errors(value)
+        # OpenAPI allows ``bytes`` to flow through ``string`` schemas
+        # whose ``format`` is ``binary`` or ``byte`` (file uploads,
+        # base64-encoded blobs). jsonschema only accepts text for
+        # ``string``, so it is given a decoded view; the untouched
+        # ``value`` is what InvalidSchemaValue carries. NOTE(review):
+        # ``schema_errors`` presumably reference the decoded view.
+        normalized = self._normalize_for_validation(value)
+        errors_iter = self.validator.iter_errors(normalized)
         errors = tuple(errors_iter)
         if errors:
             schema_type = (self.schema / "type").read_str_or_list("any")
             raise InvalidSchemaValue(value, schema_type, schema_errors=errors)
 
+    @staticmethod
+    def _decode_binary_value(value: bytes) -> str:
+        """Decode raw ``bytes`` into the text view jsonschema expects.
+
+        UTF-8 is tried first; any payload that is not valid UTF-8 falls
+        back to Latin-1, which maps each byte to one code point and so
+        never raises. Unlike ``surrogateescape``, the result contains no
+        lone surrogates, so it can still be re-encoded (error reports).
+        """
+        try:
+            return value.decode("utf-8")
+        except UnicodeDecodeError:
+            return value.decode("latin-1")
+
+    def _accepts_binary_string(self, value: Any) -> bool:
+        """Tell whether a ``bytes`` value may stand in for a string here.
+
+        Needs a binary/byte ``format`` and an absent or string-capable type.
+        """
+        if not isinstance(value, bytes):
+            return False
+        declared_format = (self.schema / "format").read_str(None)
+        if declared_format not in _BINARY_STRING_FORMATS:
+            return False
+        declared_types = (self.schema / "type").read_str_or_list(None)
+        if declared_types is None:
+            # Untyped schema: the binary/byte format alone decides.
+            return True
+        if isinstance(declared_types, str):
+            declared_types = [declared_types]
+        return "string" in declared_types
+
+    def _normalize_for_validation(self, value: Any) -> Any:
+        """Return a view of ``value`` with ``bytes`` decoded to text
+        wherever the schema-at-this-position is a binary/byte string.
+
+        The original ``value`` is never mutated. Containers are only
+        copied when a descendant actually changes, so the unchanged
+        fast path returns ``value`` itself -- callers can use object
+        identity to detect a no-op.
+
+        Recursion is driven by the schema, not by introspecting the
+        value: a ``dict`` is only rewritten where the schema declares
+        ``properties`` or a schema-valued ``additionalProperties``, a
+        ``list`` only when it declares ``items``, and every ``oneOf``/
+        ``anyOf``/``allOf`` branch is applied in turn. Other keywords
+        (e.g. ``not``, ``patternProperties``) are not followed.
+        """
+        if self._accepts_binary_string(value):
+            return self._decode_binary_value(value)
+
+        normalized: Any
+        if isinstance(value, dict):
+            normalized = self._normalize_mapping_for_validation(value)
+        elif isinstance(value, list) and "items" in self.schema:
+            normalized = self._normalize_array_for_validation(value)
+        else:
+            normalized = value
+
+        # Composition keywords are where the binary branch actually
+        # lives in real specs (a multipart oneOf with a file branch and
+        # a non-file branch, for example). Each sub-schema's
+        # normalization is applied to the running result in turn.
+        # NOTE(review): the decoded view is shared by all branches, so a
+        # sibling branch typing the same position as a plain ``string``
+        # would also accept the decoded text -- confirm this is intended.
+        for keyword in ("oneOf", "anyOf", "allOf"):
+            if keyword not in self.schema:
+                continue
+            for subschema in self.schema / keyword:
+                normalized = self.evolve(subschema)._normalize_for_validation(
+                    normalized
+                )
+
+        return normalized
+
+    def _normalize_mapping_for_validation(
+        self, value: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Normalize the members of ``value`` against this object schema.
+
+        Keys listed under ``properties`` use their own sub-schema; all
+        other keys use ``additionalProperties`` when that is a schema
+        rather than a boolean. ``value`` itself is returned when nothing
+        changed, otherwise a shallow copy carrying the replacements.
+        """
+        result: dict[str, Any] = value
+        declared: set[str] = set()
+
+        if "properties" in self.schema:
+            for name, subschema in (self.schema / "properties").items():
+                if not isinstance(name, str):
+                    continue
+                declared.add(name)
+                if name not in value:
+                    continue
+                current = value[name]
+                updated = self.evolve(subschema)._normalize_for_validation(
+                    current
+                )
+                if updated is current:
+                    continue
+                if result is value:
+                    result = dict(value)
+                result[name] = updated
+
+        additional = self.schema.get("additionalProperties", True)
+        if additional in (True, False):
+            return result
+
+        extra_validator = self.evolve(self.schema / "additionalProperties")
+        for key, member in value.items():
+            if key in declared:
+                continue
+            replaced = extra_validator._normalize_for_validation(member)
+            if replaced is member:
+                continue
+            if result is value:
+                result = dict(value)
+            result[key] = replaced
+
+        return result
+
+    def _normalize_array_for_validation(self, value: list[Any]) -> list[Any]:
+        """Normalize each element against ``items``; copy only on change."""
+        element_validator = self.evolve(self.schema / "items")
+        result = value
+        for position, element in enumerate(value):
+            updated = element_validator._normalize_for_validation(element)
+            if updated is element:
+                continue
+            if result is value:
+                # First replacement: detach from the caller's list.
+                result = list(value)
+            result[position] = updated
+        return result
+
     # Cache the recursive "does this schema benefit from a ValidationState?"
     # check, keyed on the SchemaPath. SchemaPath is hashed by content, so
     # two SchemaPaths pointing at the same spec location share a cache
@@ -267,6 +417,12 @@ def get_primitive_type(self, value: Any) -> Optional[str]:
             schema_types = sorted(self.validator.TYPE_CHECKER._type_checkers)
         assert isinstance(schema_types, list)
         for schema_type in schema_types:
+            if schema_type == "string" and self._accepts_binary_string(value):
+                # Bytes value, binary/byte format, ``string`` is in the
+                # declared type list: treat it as string without asking
+                # jsonschema's type checker (which doesn't know about
+                # OpenAPI's binary convention).
+                return "string"
             result = self.type_validator(value, type_override=schema_type)
             if not result:
                 continue
diff --git a/tests/integration/unmarshalling/test_request_unmarshaller.py b/tests/integration/unmarshalling/test_request_unmarshaller.py
index 4ac3ac88..daddf02d 100644
--- a/tests/integration/unmarshalling/test_request_unmarshaller.py
+++ b/tests/integration/unmarshalling/test_request_unmarshaller.py
@@ -1,6 +1,5 @@
 import json
 from base64 import b64encode
-from email.generator import _make_boundary
 
 import pytest
 
@@ -469,16 +468,14 @@ def test_request_body_with_object_default(self):
         assert result.errors == []
         assert result.body == {"tags": []}
 
-    @pytest.mark.xfail(
-        reason=(
-            "multipart composed-schema branch selection is not binary-aware"
-        ),
-        strict=True,
-    )
     def test_request_body_multipart_oneof_binary_field(self):
         from openapi_core import OpenAPI
 
-        boundary = _make_boundary()
+        # email.generator._make_boundary() returns strings like
+        # ``===============1234==`` whose ``=`` chars trip the mimetype
+        # parameter parser. That's a separate bug; here we just want a
+        # legal boundary that round-trips the binary oneOf branch.
+        boundary = "openapicoreboundary1234567890"
         spec = OpenAPI.from_dict(
             {
                 "openapi": "3.1.0",
@@ -545,6 +542,152 @@ def test_request_body_multipart_oneof_binary_field(self):
         assert result.errors == []
         assert result.body == {"file": b"\xff\xfe"}
 
+    def test_request_body_multipart_anyof_binary_field(self):
+        # anyOf with a text-only branch and a binary branch: posting only
+        # the ``blob`` file part must satisfy the binary branch.
+        from openapi_core import OpenAPI
+
+        boundary = "openapicoreboundary1234567890"
+        spec = OpenAPI.from_dict(
+            {
+                "openapi": "3.1.0",
+                "info": {"version": "0", "title": "test"},
+                "paths": {
+                    "/test": {
+                        "post": {
+                            "requestBody": {
+                                "required": True,
+                                "content": {
+                                    "multipart/form-data": {
+                                        "schema": {
+                                            "anyOf": [
+                                                {
+                                                    "type": "object",
+                                                    "properties": {
+                                                        "note": {
+                                                            "type": "string"
+                                                        }
+                                                    },
+                                                    "required": ["note"],
+                                                },
+                                                {
+                                                    "type": "object",
+                                                    "properties": {
+                                                        "blob": {
+                                                            "type": "string",
+                                                            "format": "binary",
+                                                        }
+                                                    },
+                                                    "required": ["blob"],
+                                                },
+                                            ]
+                                        }
+                                    }
+                                },
+                            },
+                            "responses": {"200": {"description": ""}},
+                        }
+                    }
+                },
+            }
+        )
+        data = (
+            (
+                f"--{boundary}\n"
+                "Content-Type: application/octet-stream\n"
+                "MIME-Version: 1.0\n"
+                'Content-Disposition: form-data; name="blob"\n\n'
+            ).encode("ascii")
+            + b"\x00\x01\x02binary\xff"
+            + (f"\n--{boundary}--\n").encode("ascii")
+        )
+        request = MockRequest(
+            "http://localhost",
+            "post",
+            "/test",
+            content_type=f"multipart/form-data; boundary={boundary}",
+            data=data,
+        )
+
+        result = spec.unmarshal_request(request)
+
+        assert result.errors == []
+        assert result.body == {"blob": b"\x00\x01\x02binary\xff"}
+
+    def test_request_body_multipart_allof_binary_field(self):
+        # allOf: both branches must validate the same body, so the text
+        # ``label`` part and the binary ``file`` part are posted together.
+        from openapi_core import OpenAPI
+
+        boundary = "openapicoreboundary1234567890"
+        spec = OpenAPI.from_dict(
+            {
+                "openapi": "3.1.0",
+                "info": {"version": "0", "title": "test"},
+                "paths": {
+                    "/test": {
+                        "post": {
+                            "requestBody": {
+                                "required": True,
+                                "content": {
+                                    "multipart/form-data": {
+                                        "schema": {
+                                            "allOf": [
+                                                {
+                                                    "type": "object",
+                                                    "properties": {
+                                                        "label": {
+                                                            "type": "string"
+                                                        }
+                                                    },
+                                                    "required": ["label"],
+                                                },
+                                                {
+                                                    "type": "object",
+                                                    "properties": {
+                                                        "file": {
+                                                            "type": "string",
+                                                            "format": "binary",
+                                                        }
+                                                    },
+                                                    "required": ["file"],
+                                                },
+                                            ]
+                                        }
+                                    }
+                                },
+                            },
+                            "responses": {"200": {"description": ""}},
+                        }
+                    }
+                },
+            }
+        )
+        data = (
+            (
+                f"--{boundary}\n"
+                'Content-Disposition: form-data; name="label"\n\n'
+                "report"
+                f"\n--{boundary}\n"
+                "Content-Type: application/octet-stream\n"
+                'Content-Disposition: form-data; name="file"\n\n'
+            ).encode("ascii")
+            + b"\xff\xfe"
+            + (f"\n--{boundary}--\n").encode("ascii")
+        )
+        request = MockRequest(
+            "http://localhost",
+            "post",
+            "/test",
+            content_type=f"multipart/form-data; boundary={boundary}",
+            data=data,
+        )
+
+        result = spec.unmarshal_request(request)
+
+        assert result.errors == []
+        assert result.body == {"label": "report", "file": b"\xff\xfe"}
+
     def test_post_pets_validates_request_schema_once(
         self, request_unmarshaller
     ):
diff --git a/tests/unit/deserializing/test_media_types_deserializers.py b/tests/unit/deserializing/test_media_types_deserializers.py
index 05b1e532..38c65e64 100644
--- a/tests/unit/deserializing/test_media_types_deserializers.py
+++ b/tests/unit/deserializing/test_media_types_deserializers.py
@@ -657,12 +657,6 @@ def test_urlencoded_form_with_array_default(self, deserializer_factory):
 
         assert result == {"tags": []}
 
-    @pytest.mark.xfail(
-        reason=(
-            "multipart composed-schema branch selection is not binary-aware"
-        ),
-        strict=True,
-    )
     def test_multipart_oneof_binary_field(self, spec, deserializer_factory):
         mimetype = "multipart/form-data"
         schema_dict = {
@@ -757,12 +751,6 @@ def test_multipart_oneof_string_field(self, spec, deserializer_factory):
             "fieldA": "value",
         }
 
-    @pytest.mark.xfail(
-        reason=(
-            "multipart composed-schema branch selection is not binary-aware"
-        ),
-        strict=True,
-    )
     def test_multipart_anyof_binary_field(self, spec, deserializer_factory):
         mimetype = "multipart/form-data"
         schema_dict = {
diff --git a/tests/unit/validation/test_schema_validators.py b/tests/unit/validation/test_schema_validators.py
index 2dea1e10..83e2ed2d 100644
--- a/tests/unit/validation/test_schema_validators.py
+++ b/tests/unit/validation/test_schema_validators.py
@@ -356,3 +356,166 @@ def test_enforce_properties_required_applies_to_nested_composed_schemas(
                 schema,
                 enforce_properties_required=True,
             ).validate({"name": "openapi-core", "meta": {}})
+
+
+class TestSchemaValidateBinary:
+    """Bytes-aware validation for OpenAPI ``string`` schemas whose
+    ``format`` is ``binary`` or ``byte``.
+
+    OpenAPI lets multipart and octet-stream payloads flow end-to-end as
+    raw ``bytes``, but jsonschema's ``string`` type only accepts text.
+    The schema validator therefore hands jsonschema a decoded copy and
+    leaves the caller's value untouched for downstream unmarshalling.
+    All cases here run against the OAS 3.0 write validators.
+    """
+
+    @pytest.fixture
+    def spec(self):
+        return SchemaPath.from_dict({})
+
+    @pytest.fixture
+    def validator_factory(self, spec):
+        def create_validator(schema_dict):
+            schema = SchemaPath.from_dict(schema_dict)
+            return oas30_write_schema_validators_factory.create(spec, schema)
+
+        return create_validator
+
+    def test_bytes_against_string_binary(self, validator_factory):
+        validator_factory({"type": "string", "format": "binary"}).validate(
+            b"\xff\xfe\x00 binary payload"
+        )
+
+    def test_bytes_against_string_byte(self, validator_factory):
+        validator_factory({"type": "string", "format": "byte"}).validate(
+            b"aGVsbG8gd29ybGQ="
+        )
+
+    def test_bytes_against_string_without_format_still_rejected(
+        self, validator_factory
+    ):
+        # No binary/byte format -- the bytes-is-string convention does
+        # NOT apply, so the value is correctly rejected.
+        with pytest.raises(InvalidSchemaValue):
+            validator_factory({"type": "string"}).validate(b"hello")
+
+    def test_bytes_against_string_date_format_still_rejected(
+        self, validator_factory
+    ):
+        with pytest.raises(InvalidSchemaValue):
+            validator_factory({"type": "string", "format": "date"}).validate(
+                b"2024-01-01"
+            )
+
+    def test_bytes_nested_in_object_property(self, validator_factory):
+        schema = {
+            "type": "object",
+            "properties": {
+                "file": {"type": "string", "format": "binary"},
+                "name": {"type": "string"},
+            },
+            "required": ["file"],
+        }
+        validator_factory(schema).validate(
+            {"file": b"\xff\xfe", "name": "report"}
+        )
+
+    def test_bytes_nested_in_array_items(self, validator_factory):
+        schema = {
+            "type": "array",
+            "items": {"type": "string", "format": "binary"},
+        }
+        validator_factory(schema).validate([b"\xff", b"\x00\x01", b"a"])
+
+    def test_bytes_nested_in_object_under_array(self, validator_factory):
+        schema = {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {"blob": {"type": "string", "format": "binary"}},
+            },
+        }
+        validator_factory(schema).validate(
+            [{"blob": b"\xff"}, {"blob": b"\x00"}]
+        )
+
+    def test_bytes_under_oneof_binary_branch(self, validator_factory):
+        schema = {
+            "oneOf": [
+                {
+                    "type": "object",
+                    "properties": {"label": {"type": "string"}},
+                    "required": ["label"],
+                },
+                {
+                    "type": "object",
+                    "properties": {
+                        "file": {"type": "string", "format": "binary"}
+                    },
+                    "required": ["file"],
+                },
+            ]
+        }
+        validator_factory(schema).validate({"file": b"\xff\xfe"})
+
+    def test_bytes_under_anyof_binary_branch(self, validator_factory):
+        schema = {
+            "anyOf": [
+                {
+                    "type": "object",
+                    "properties": {"note": {"type": "string"}},
+                    "required": ["note"],
+                },
+                {
+                    "type": "object",
+                    "properties": {
+                        "blob": {"type": "string", "format": "binary"}
+                    },
+                    "required": ["blob"],
+                },
+            ]
+        }
+        validator_factory(schema).validate({"blob": b"\x00\x01\x02"})
+
+    def test_bytes_under_allof_binary_branch(self, validator_factory):
+        schema = {
+            "allOf": [
+                {
+                    "type": "object",
+                    "properties": {"label": {"type": "string"}},
+                    "required": ["label"],
+                },
+                {
+                    "type": "object",
+                    "properties": {
+                        "file": {"type": "string", "format": "binary"}
+                    },
+                    "required": ["file"],
+                },
+            ]
+        }
+        validator_factory(schema).validate({"label": "x", "file": b"\xff\xfe"})
+
+    def test_bytes_in_additionalproperties_schema(self, validator_factory):
+        schema = {
+            "type": "object",
+            "additionalProperties": {
+                "type": "string",
+                "format": "binary",
+            },
+        }
+        validator_factory(schema).validate({"a": b"\xff", "b": b"\x00\x01"})
+
+    def test_validate_does_not_mutate_input(self, validator_factory):
+        schema = {
+            "type": "object",
+            "properties": {
+                "file": {"type": "string", "format": "binary"},
+            },
+        }
+        value = {"file": b"\xff\xfe"}
+        validator_factory(schema).validate(value)
+        # The original mapping is unchanged; bytes value survives intact
+        # for downstream unmarshalling.
+        assert value == {"file": b"\xff\xfe"}
+        assert isinstance(value["file"], bytes)