-
-
Notifications
You must be signed in to change notification settings - Fork 9
perf: Replace unordered_map with bitset for vocabulary lookups #2040
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
jviotti
merged 2 commits into
sourcemeta:main
from
syedazeez337:feature/bitset-vocabulary-optimization
Nov 25, 2025
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
125 changes: 125 additions & 0 deletions
125
src/core/jsonschema/include/sourcemeta/core/jsonschema_vocabularies.h
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| #ifndef SOURCEMETA_CORE_JSONSCHEMA_VOCABULARIES_H_ | ||
| #define SOURCEMETA_CORE_JSONSCHEMA_VOCABULARIES_H_ | ||
|
|
||
| #ifndef SOURCEMETA_CORE_JSONSCHEMA_EXPORT | ||
| #include <sourcemeta/core/jsonschema_export.h> | ||
| #endif | ||
|
|
||
| #include <sourcemeta/core/json.h> | ||
|
|
||
| #include <bitset> // std::bitset | ||
| #include <cassert> // assert | ||
| #include <cstdint> // std::uint8_t, std::size_t | |
| #include <optional> // std::optional | ||
| #include <stdexcept> // std::out_of_range | ||
| #include <string> // std::string | ||
| #include <string_view> // std::string_view | ||
| #include <unordered_map> // std::unordered_map | ||
| #include <utility> // std::pair | ||
| #include <vector> // std::vector | ||
|
|
||
| namespace sourcemeta::core { | ||
|
|
||
| /// @ingroup jsonschema | |
| /// A set of JSON Schema vocabularies, each flagged as required (`true`) or | |
| /// optional (`false`). Known vocabularies are tracked in two fixed-size bitsets | |
| /// and any other (custom) URI falls back to a `std::unordered_map` keyed by string. | |
| /// TODO: To maximize performance gains, convert string-based vocabulary checks | |
| /// throughout the codebase to use enum-based methods. | |
| struct SOURCEMETA_CORE_JSONSCHEMA_EXPORT Vocabularies { | |
| enum class Known : std::uint8_t { | |
| // Pre-vocabulary dialects, Draft 0 through Draft 7 and their hyper-schema variants (treated as vocabularies) | |
| JSON_Schema_Draft_0 = 0, | |
| JSON_Schema_Draft_0_Hyper = 1, | |
| JSON_Schema_Draft_1 = 2, | |
| JSON_Schema_Draft_1_Hyper = 3, | |
| JSON_Schema_Draft_2 = 4, | |
| JSON_Schema_Draft_2_Hyper = 5, | |
| JSON_Schema_Draft_3 = 6, | |
| JSON_Schema_Draft_3_Hyper = 7, | |
| JSON_Schema_Draft_4 = 8, | |
| JSON_Schema_Draft_4_Hyper = 9, | |
| JSON_Schema_Draft_6 = 10, | |
| JSON_Schema_Draft_6_Hyper = 11, | |
| JSON_Schema_Draft_7 = 12, | |
| JSON_Schema_Draft_7_Hyper = 13, | |
| // 2019-09 vocabularies (enumerators 14 through 20) | |
| JSON_Schema_2019_09_Core = 14, | |
| JSON_Schema_2019_09_Applicator = 15, | |
| JSON_Schema_2019_09_Validation = 16, | |
| JSON_Schema_2019_09_Meta_Data = 17, | |
| JSON_Schema_2019_09_Format = 18, | |
| JSON_Schema_2019_09_Content = 19, | |
| JSON_Schema_2019_09_Hyper_Schema = 20, | |
| // 2020-12 vocabularies (enumerators 21 through 28) | |
| JSON_Schema_2020_12_Core = 21, | |
| JSON_Schema_2020_12_Applicator = 22, | |
| JSON_Schema_2020_12_Unevaluated = 23, | |
| JSON_Schema_2020_12_Validation = 24, | |
| JSON_Schema_2020_12_Meta_Data = 25, | |
| JSON_Schema_2020_12_Format_Annotation = 26, | |
| JSON_Schema_2020_12_Format_Assertion = 27, | |
| JSON_Schema_2020_12_Content = 28 | |
| }; | |
|
|
||
| // NOTE: Must be kept in sync with the Known enum above (highest enumerator + 1); it sizes both bitsets below | |
| static constexpr std::size_t KNOWN_VOCABULARY_COUNT = 29; | |
|
|
||
| public: | |
| Vocabularies() = default; | |
| Vocabularies(const Vocabularies &) = default; | |
| Vocabularies(Vocabularies &&) noexcept = default; | |
| auto operator=(const Vocabularies &) -> Vocabularies & = default; | |
| auto operator=(Vocabularies &&) noexcept -> Vocabularies & = default; | |
| ~Vocabularies() = default; | |
|
|
||
| /// Construct from an initializer list of (vocabulary URI, required flag) pairs | |
| Vocabularies(std::initializer_list<std::pair<JSON::String, bool>> init); | |
|
|
||
| /// Construct from an initializer list of (known vocabulary enumerator, required flag) pairs | |
| Vocabularies(std::initializer_list<std::pair<Known, bool>> init); | |
|
|
||
| /// Check if the vocabulary identified by the given URI is in this set (presumably whether required or optional; confirm against the implementation) | |
| [[nodiscard]] auto contains(const JSON::String &uri) const noexcept -> bool; | |
|
|
||
| /// Check if the given known vocabulary is in this set (presumably whether required or optional; confirm against the implementation) | |
| [[nodiscard]] auto contains(Known vocabulary) const noexcept -> bool; | |
|
|
||
| /// Insert a vocabulary by URI; `required` is `true` for required and `false` for optional. NOTE(review): declared `noexcept`, yet a custom URI presumably lands in the `std::unordered_map` member, whose insertion can allocate and throw; confirm this is intended | |
| auto insert(const JSON::String &uri, bool required) noexcept -> void; | |
|
|
||
| /// Insert a known vocabulary; `required` is `true` for required and `false` for optional | |
| auto insert(Known vocabulary, bool required) noexcept -> void; | |
|
|
||
| /// Get the required flag of the vocabulary with the given URI; presumably | |
| /// an empty optional when the vocabulary is not in the set (TODO confirm) | |
| [[nodiscard]] auto get(const JSON::String &uri) const noexcept | |
| -> std::optional<bool>; | |
|
|
||
| /// Get the required flag of a known vocabulary; presumably an empty optional when it is not in the set (TODO confirm) | |
| [[nodiscard]] auto get(Known vocabulary) const noexcept | |
| -> std::optional<bool>; | |
|
|
||
| /// Get the number of vocabularies (required + optional + custom) | |
| [[nodiscard]] auto size() const noexcept -> std::size_t; | |
|
|
||
| /// Check if there are no vocabularies at all (known or custom) | |
| [[nodiscard]] auto empty() const noexcept -> bool; | |
|
|
||
| private: | |
| // Invariant: required_known and optional_known must be mutually exclusive. | |
| // A known vocabulary is either required (true) OR optional (false), never both. | |
| #ifdef _MSC_VER | |
| #pragma warning(push) | |
| #pragma warning(disable : 4251) | |
| #endif | |
| std::bitset<KNOWN_VOCABULARY_COUNT> required_known{}; | |
| std::bitset<KNOWN_VOCABULARY_COUNT> optional_known{}; | |
| std::unordered_map<JSON::String, bool> custom; | |
| #ifdef _MSC_VER | |
| #pragma warning(pop) | |
| #endif | |
| }; | |
|
|
||
| } // namespace sourcemeta::core | ||
|
|
||
| #endif | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -259,18 +259,67 @@ auto sourcemeta::core::base_dialect( | |
| } | ||
|
|
||
| namespace { | ||
| auto core_vocabulary(std::string_view base_dialect) -> std::string { | ||
| auto core_vocabulary_known(std::string_view base_dialect) | ||
| -> sourcemeta::core::Vocabularies::Known { | ||
| if (base_dialect == "https://json-schema.org/draft/2020-12/schema" || | ||
| base_dialect == "https://json-schema.org/draft/2020-12/hyper-schema") { | ||
| return "https://json-schema.org/draft/2020-12/vocab/core"; | ||
| return sourcemeta::core::Vocabularies::Known::JSON_Schema_2020_12_Core; | ||
jviotti marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } else if (base_dialect == "https://json-schema.org/draft/2019-09/schema" || | ||
| base_dialect == | ||
| "https://json-schema.org/draft/2019-09/hyper-schema") { | ||
| return "https://json-schema.org/draft/2019-09/vocab/core"; | ||
| return sourcemeta::core::Vocabularies::Known::JSON_Schema_2019_09_Core; | ||
| } else { | ||
| throw sourcemeta::core::SchemaBaseDialectError(std::string{base_dialect}); | ||
| } | ||
| } | ||
|
|
||
| // Translate a pre-vocabulary dialect URI (Draft 0 through Draft 7, including | |
| // the hyper-schema variants) into its known vocabulary enumerator. Any other | |
| // input yields an empty optional. | |
| auto dialect_to_known(std::string_view dialect) | |
| -> std::optional<sourcemeta::core::Vocabularies::Known> { | |
| using Known = sourcemeta::core::Vocabularies::Known; | |
| // One row per supported dialect URI and the enumerator it corresponds to | |
| struct Mapping { | |
| std::string_view uri; | |
| Known vocabulary; | |
| }; | |
| static constexpr Mapping mappings[] = { | |
| {"http://json-schema.org/draft-07/schema#", Known::JSON_Schema_Draft_7}, | |
| {"http://json-schema.org/draft-07/hyper-schema#", | |
| Known::JSON_Schema_Draft_7_Hyper}, | |
| {"http://json-schema.org/draft-06/schema#", Known::JSON_Schema_Draft_6}, | |
| {"http://json-schema.org/draft-06/hyper-schema#", | |
| Known::JSON_Schema_Draft_6_Hyper}, | |
| {"http://json-schema.org/draft-04/schema#", Known::JSON_Schema_Draft_4}, | |
| {"http://json-schema.org/draft-04/hyper-schema#", | |
| Known::JSON_Schema_Draft_4_Hyper}, | |
| {"http://json-schema.org/draft-03/schema#", Known::JSON_Schema_Draft_3}, | |
| {"http://json-schema.org/draft-03/hyper-schema#", | |
| Known::JSON_Schema_Draft_3_Hyper}, | |
| {"http://json-schema.org/draft-02/schema#", Known::JSON_Schema_Draft_2}, | |
| {"http://json-schema.org/draft-02/hyper-schema#", | |
| Known::JSON_Schema_Draft_2_Hyper}, | |
| {"http://json-schema.org/draft-01/schema#", Known::JSON_Schema_Draft_1}, | |
| {"http://json-schema.org/draft-01/hyper-schema#", | |
| Known::JSON_Schema_Draft_1_Hyper}, | |
| {"http://json-schema.org/draft-00/schema#", Known::JSON_Schema_Draft_0}, | |
| {"http://json-schema.org/draft-00/hyper-schema#", | |
| Known::JSON_Schema_Draft_0_Hyper}, | |
| }; | |
| // The URIs are all distinct, so at most one row can match | |
| for (const auto &mapping : mappings) { | |
| if (dialect == mapping.uri) { | |
| return mapping.vocabulary; | |
| } | |
| } | |
| return std::nullopt; | |
| } | |
| } // namespace | ||
|
|
||
| auto sourcemeta::core::vocabularies( | ||
|
|
@@ -304,21 +353,22 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver, | |
| // As a performance optimization shortcut | ||
| if (base_dialect == dialect) { | ||
| if (dialect == "https://json-schema.org/draft/2020-12/schema") { | ||
| return {{"https://json-schema.org/draft/2020-12/vocab/core", true}, | ||
| {"https://json-schema.org/draft/2020-12/vocab/applicator", true}, | ||
| {"https://json-schema.org/draft/2020-12/vocab/unevaluated", true}, | ||
| {"https://json-schema.org/draft/2020-12/vocab/validation", true}, | ||
| {"https://json-schema.org/draft/2020-12/vocab/meta-data", true}, | ||
| {"https://json-schema.org/draft/2020-12/vocab/format-annotation", | ||
| true}, | ||
| {"https://json-schema.org/draft/2020-12/vocab/content", true}}; | ||
| return Vocabularies{ | ||
| {Vocabularies::Known::JSON_Schema_2020_12_Core, true}, | ||
| {Vocabularies::Known::JSON_Schema_2020_12_Applicator, true}, | ||
| {Vocabularies::Known::JSON_Schema_2020_12_Unevaluated, true}, | ||
| {Vocabularies::Known::JSON_Schema_2020_12_Validation, true}, | ||
| {Vocabularies::Known::JSON_Schema_2020_12_Meta_Data, true}, | ||
| {Vocabularies::Known::JSON_Schema_2020_12_Format_Annotation, true}, | ||
| {Vocabularies::Known::JSON_Schema_2020_12_Content, true}}; | ||
| } else if (dialect == "https://json-schema.org/draft/2019-09/schema") { | ||
| return {{"https://json-schema.org/draft/2019-09/vocab/core", true}, | ||
| {"https://json-schema.org/draft/2019-09/vocab/applicator", true}, | ||
| {"https://json-schema.org/draft/2019-09/vocab/validation", true}, | ||
| {"https://json-schema.org/draft/2019-09/vocab/meta-data", true}, | ||
| {"https://json-schema.org/draft/2019-09/vocab/format", false}, | ||
| {"https://json-schema.org/draft/2019-09/vocab/content", true}}; | ||
| return Vocabularies{ | ||
| {Vocabularies::Known::JSON_Schema_2019_09_Core, true}, | ||
| {Vocabularies::Known::JSON_Schema_2019_09_Applicator, true}, | ||
| {Vocabularies::Known::JSON_Schema_2019_09_Validation, true}, | ||
| {Vocabularies::Known::JSON_Schema_2019_09_Meta_Data, true}, | ||
| {Vocabularies::Known::JSON_Schema_2019_09_Format, false}, | ||
| {Vocabularies::Known::JSON_Schema_2019_09_Content, true}}; | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -336,7 +386,11 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver, | |
| dialect == "http://json-schema.org/draft-02/schema#" || | ||
| dialect == "http://json-schema.org/draft-01/schema#" || | ||
| dialect == "http://json-schema.org/draft-00/schema#") { | ||
| return {{dialect, true}}; | ||
| const auto known = dialect_to_known(dialect); | ||
| if (known.has_value()) { | ||
| return Vocabularies{{known.value(), true}}; | ||
| } | ||
| return Vocabularies{{dialect, true}}; | ||
| } | ||
|
|
||
| /* | ||
|
|
@@ -356,7 +410,11 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver, | |
| base_dialect == "http://json-schema.org/draft-02/hyper-schema#" || | ||
| base_dialect == "http://json-schema.org/draft-01/hyper-schema#" || | ||
| base_dialect == "http://json-schema.org/draft-00/hyper-schema#") { | ||
| return {{base_dialect, true}}; | ||
| const auto known = dialect_to_known(base_dialect); | ||
| if (known.has_value()) { | ||
| return Vocabularies{{known.value(), true}}; | ||
| } | ||
| return Vocabularies{{base_dialect, true}}; | ||
| } | ||
|
|
||
| /* | ||
|
|
@@ -384,25 +442,28 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver, | |
| */ | ||
|
|
||
| Vocabularies result; | ||
| const std::string core{core_vocabulary(base_dialect)}; | ||
| const auto core{core_vocabulary_known(base_dialect)}; | ||
| if (schema_dialect.defines("$vocabulary")) { | ||
| const sourcemeta::core::JSON &vocabularies{ | ||
| schema_dialect.at("$vocabulary")}; | ||
| assert(vocabularies.is_object()); | ||
| for (const auto &entry : vocabularies.as_object()) { | ||
| result.insert({entry.first, entry.second.to_boolean()}); | ||
| result.insert(entry.first, entry.second.to_boolean()); | ||
| } | ||
| } else { | ||
| result.insert({core, true}); | ||
| result.insert(core, true); | ||
| } | ||
|
|
||
| // The specification recommends these checks | ||
| if (!result.contains(core)) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we take advantage of your bitsets here? See that we get the core vocabulary for a dialect using |
||
| throw sourcemeta::core::SchemaError( | ||
| "The core vocabulary must always be present"); | ||
| } else if (!result.at(core)) { | ||
| throw sourcemeta::core::SchemaError( | ||
| "The core vocabulary must always be required"); | ||
| } else { | ||
| const auto core_status{result.get(core)}; | ||
| if (core_status.has_value() && !core_status.value()) { | ||
| throw sourcemeta::core::SchemaError( | ||
| "The core vocabulary must always be required"); | ||
| } | ||
| } | ||
|
|
||
| return result; | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.