✨ skip_input_error implementation

— ✨ constexpr Shift-JIS implementation — ✨ New skip_handler with grafting into UTF-8 and UTF-16 (and all derivatives) to work with it — 🛠 Improvements on all replacement handlers — 🛠 Separation of many error handlers from the core file — 📝✨ Sincere improvements to getting started and documentation (almost done!) — 📝 Benchmarks placed properly and live.
soasis · Dec 10, 2022 · e96e1e0 · e96e1e0
1 parent 4269245
commit e96e1e0
Show file tree

Hide file tree

Showing 343 changed files with 7,036 additions and 3,669 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -266,5 +266,10 @@ if(ZTD_TEXT_SCRATCH)
 		${--warn-pedantic}
 		${--warn-all}
 		${--warn-extra}
-		${--warn-errors})
+		${--warn-errors}
+	)
+	target_compile_definitions(scratch
+		PRIVATE
+		ZTD_ASSERT_CHECKS=1
+	)
 endif()
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -95,9 +95,9 @@ add_subdirectory(barrier)
 add_subdirectory(function_form)
 add_subdirectory(conversion_speed)
 
-add_custom_target(ztd.text.benchmarks.all_graphs
+add_custom_target(ztd.text.benchmarks.graphs.all
 	COMMENT "[ztd.text] graphing all benchmarks...")
-add_dependencies(ztd.text.benchmarks.all_graphs
+add_dependencies(ztd.text.benchmarks.graphs.all
 	ztd.tools.benchmark_grapher.function_form.large
 	ztd.tools.benchmark_grapher.function_form.small
 	ztd.tools.benchmark_grapher.conversion_speed.small

diff --git a/benchmarks/barrier/include/barrier/api.h b/benchmarks/barrier/include/barrier/api.h
@@ -89,4 +89,4 @@
 
 // clang-format on
 
-#endif // ZTD_TEXT_BENCHMARKS_BARRIER_API_H
+#endif
diff --git a/benchmarks/barrier/include/barrier/barrier.h b/benchmarks/barrier/include/barrier/barrier.h
@@ -37,4 +37,4 @@
 #include <barrier/convert.h>
 #include <barrier/data.h>
 
-#endif // ZTD_TEXT_BENCHMARKS_BARRIER_BARRIER_H
+#endif
diff --git a/benchmarks/barrier/include/barrier/convert.h b/benchmarks/barrier/include/barrier/convert.h
@@ -58,4 +58,4 @@ ZTD_C_LANGUAGE_LINKAGE_I_
 ZTD_TEXT_BENCHMARKS_BARRIER_API_LINKAGE_I_ error_ptr_ptr ptr_struct_ptr_ptr(ztd_char8_t* output_ptr,
      ztd_char8_t* output_ptr_last, const ztd_char32_t* input_ptr, const ztd_char32_t* input_ptr_last);
 
-#endif // ZTD_TEXT_BENCHMARKS_BARRIER_CONVERT_H
+#endif
diff --git a/benchmarks/barrier/include/barrier/data.h b/benchmarks/barrier/include/barrier/data.h
@@ -69,4 +69,4 @@ ZTD_TEXT_BENCHMARKS_BARRIER_API_LINKAGE_I_ c_span_char16_t u16_basic_source_data
 ZTD_C_LANGUAGE_LINKAGE_I_
 ZTD_TEXT_BENCHMARKS_BARRIER_API_LINKAGE_I_ c_span_char32_t u32_basic_source_data;
 
-#endif // ZTD_TEXT_BENCHMARKS_BARRIER_DATA_H
+#endif
diff --git a/benchmarks/barrier/include/barrier/version.h b/benchmarks/barrier/include/barrier/version.h
@@ -34,4 +34,4 @@
 
 #include <barrier/api.h>
 
-#endif // ZTD_TEXT_BENCHAMRKS_BARRIER_VERSION_H
+#endif
diff --git a/benchmarks/conversion_speed/CMakeLists.txt b/benchmarks/conversion_speed/CMakeLists.txt
@@ -104,5 +104,5 @@ function (generate_converion_speed_benchmark_targets name data_name title)
 	)
 endfunction()
 
-generate_converion_speed_benchmark_targets(large unicode "All Unicode Scalar Values (~4.4 MB)")
-generate_converion_speed_benchmark_targets(small basic_source "C Character Set (97 Bytes)")
+generate_converion_speed_benchmark_targets(large unicode "All Unicode Scalar Values")
+generate_converion_speed_benchmark_targets(small basic_source "C Character Set")
diff --git a/benchmarks/conversion_speed/graph_config.in.json b/benchmarks/conversion_speed/graph_config.in.json
@@ -9,109 +9,85 @@
 			"name": "UTF-16 to UTF-32 (Well-Formed)",
 			"pattern": "utf16_to_utf32_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-16 data to UTF-32 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-16 data to UTF-32 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf16_to_utf32_well_formed"
 		},
 		{
 			"name": "UTF-16 to UTF-8 (Well-Formed)",
 			"pattern": "utf16_to_utf8_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-16 data to UTF-8 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-16 data to UTF-8 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf16_to_utf8_well_formed"
 		},
 		{
 			"name": "UTF-8 to UTF-16 (Well-Formed)",
 			"pattern": "utf8_to_utf16_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-8 data to UTF-16 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-8 data to UTF-16 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf8_to_utf16_well_formed"
 		},
 		{
 			"name": "UTF-8 to UTF-32 (Well-Formed)",
 			"pattern": "utf8_to_utf32_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-8 data to UTF-32 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-8 data to UTF-32 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf8_to_utf32_well_formed"
 		},
 		{
 			"name": "UTF-32 to UTF-8 (Well-Formed)",
 			"pattern": "utf32_to_utf8_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-32 data to UTF-8 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-32 data to UTF-8 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf32_to_utf8_well_formed"
 		},
 		{
 			"name": "UTF-32 to UTF-16 (Well-Formed)",
 			"pattern": "utf32_to_utf16_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-32 data to UTF-16 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-32 data to UTF-16 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf32_to_utf16_well_formed"
 		},
 		{
 			"name": "UTF-16 to UTF-32 (Well-Formed, with Init)",
 			"pattern": "utf16_to_utf32_init_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-16 data to UTF-32 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-16 data to UTF-32 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf16_to_utf32_well_formed_init"
 		},
 		{
 			"name": "UTF-16 to UTF-8 (Well-Formed, with Init)",
 			"pattern": "utf16_to_utf8_init_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-16 data to UTF-8 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-16 data to UTF-8 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf16_to_utf8_well_formed_init"
 		},
 		{
 			"name": "UTF-8 to UTF-16 (Well-Formed, with Init)",
 			"pattern": "utf8_to_utf16_init_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-8 data to UTF-16 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-8 data to UTF-16 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf8_to_utf16_well_formed_init"
 		},
 		{
 			"name": "UTF-8 to UTF-32 (Well-Formed, with Init)",
 			"pattern": "utf8_to_utf32_init_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-8 data to UTF-32 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-8 data to UTF-32 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf8_to_utf32_well_formed_init"
 		},
 		{
 			"name": "UTF-32 to UTF-8 (Well-Formed, with Init)",
 			"pattern": "utf32_to_utf8_init_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-32 data to UTF-8 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-32 data to UTF-8 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf32_to_utf8_well_formed_init"
 		},
 		{
 			"name": "UTF-32 to UTF-16 (Well-Formed, with Init)",
 			"pattern": "utf32_to_utf16_init_well_formed_",
 			"ascending": false,
-			"description": "A conversion from UTF-32 data to UTF-16 data that contains no ill-formed or incomplete sequences."
-		},
-		{
-			"name": "UTF-16 to UTF-32 (Well-Formed, Assumed Valid)",
-			"pattern": "utf16_to_utf32_unchecked_well_formed_",
-			"ascending": false,
-			"description": "A conversion from UTF-16 data to UTF-32 data that contains no ill-formed or incomplete sequences."
-		},
-		{
-			"name": "UTF-16 to UTF-8 (Well-Formed, Assumed Valid)",
-			"pattern": "utf16_to_utf8_unchecked_well_formed_",
-			"ascending": false,
-			"description": "A conversion from UTF-16 data to UTF-8 data that contains no ill-formed or incomplete sequences."
-		},
-		{
-			"name": "UTF-8 to UTF-16 (Well-Formed, Assumed Valid)",
-			"pattern": "utf8_to_utf16_unchecked_well_formed_",
-			"ascending": false,
-			"description": "A conversion from UTF-8 data to UTF-16 data that contains no ill-formed or incomplete sequences."
-		},
-		{
-			"name": "UTF-8 to UTF-32 (Well-Formed, Assumed Valid)",
-			"pattern": "utf8_to_utf32_unchecked_well_formed_",
-			"ascending": false,
-			"description": "A conversion from UTF-8 data to UTF-32 data that contains no ill-formed or incomplete sequences."
-		},
-		{
-			"name": "UTF-32 to UTF-8 (Well-Formed, Assumed Valid)",
-			"pattern": "utf32_to_utf8_unchecked_well_formed_",
-			"ascending": false,
-			"description": "A conversion from UTF-32 data to UTF-8 data that contains no ill-formed or incomplete sequences."
-		},
-		{
-			"name": "UTF-32 to UTF-16 (Well-Formed, Assumed Valid)",
-			"pattern": "utf32_to_utf16_unchecked_well_formed_",
-			"ascending": false,
-			"description": "A conversion from UTF-32 data to UTF-16 data that contains no ill-formed or incomplete sequences."
+			"description": "A conversion from UTF-32 data to UTF-16 data that contains no ill-formed or incomplete sequences.",
+			"file_name": "utf32_to_utf16_well_formed_init"
 		}
 	],
 	"data_groups": [
@@ -136,12 +112,12 @@
 			"description": "Measures the ztd.cuneicode library from Shepherd's Oasis and its encoding routines, particularly the non-typed conversion routines from its Conversion Registry abstraction with all defaults left alone."
 		},
 		{
-			"name": "cuneicode registry (unchecked & unbounded)",
-			"pattern": "cuneicode_registry_unbounded_unchecked$",
+			"name": "cuneicode registry (unbounded, assume valid)",
+			"pattern": "cuneicode_registry_unchecked_unbounded$",
 			"description": "Measures the ztd.cuneicode library from Shepherd's Oasis and its encoding routines, particularly the non-typed conversion routines from its Conversion Registry abstraction with specific conversion pathways overridden (without providing an output size, and without checking input validity)."
 		},
 		{
-			"name": "cuneicode registry (unchecked)",
+			"name": "cuneicode registry (assume valid)",
 			"pattern": "cuneicode_registry_unchecked$",
 			"description": "Measures the ztd.cuneicode library from Shepherd's Oasis and its encoding routines, particularly the non-typed conversion routines from its Conversion Registry abstraction with all defaults left alone (without checking for the validity of the input)."
 		},
@@ -175,6 +151,11 @@
 			"pattern": "simdutf$",
 			"description": "Measures the performance of Daniel Lemire's simdutf library, which is meant to be highly optimized and performant under all unicode workloads."
 		},
+		{
+			"name": "simdutf (unbounded, assume valid)",
+			"pattern": "simdutf_unchecked$",
+			"description": "Measures the performance of Daniel Lemire's simdutf library, which is meant to be highly optimized and performant under all unicode workloads."
+		},
 		{
 			"name": "Rust's encoding_c",
 			"pattern": "encoding_c$",
@@ -183,17 +164,22 @@
 		{
 			"name": "Rust's encoding_c (manual)",
 			"pattern": "encoding_c_direct$",
-			"description": "Measures the performance of the Rust encoding_rs library, called through its C bindings encoding_c directly (typically statically linked in to achieve the same optimization potential). Used in the Gecko web engine."
+			"description": "Measures the performance of the Rust encoding_rs library, called through its C bindings encoding_c and not using the general-purpose conversion routes (typically statically linked in to achieve the same optimization potential). Used in the Gecko web engine."
 		},
 		{
-			"name": "ctre",
-			"pattern": "ctre$",
-			"description": "Measures the performance of the Compile-Time Regular Expression (CTRE) library's internal encoding conversion routines."
+			"name": "ctre (assume valid)",
+			"pattern": "ctre_unchecked$",
+			"description": "Measures the performance of the Compile-Time Regular Expression (CTRE) library's internal encoding conversion routines, which assume the input is valid (or produce invalid code points when it is not)."
 		},
 		{
 			"name": "utf8cpp (unbounded)",
 			"pattern": "utf8cpp$",
-			"description": "Measures the utf8cpp library and its conversion routines."
+			"description": "Measures the utf8cpp library and its conversion routines, which do not check for available output space."
+		},
+		{
+			"name": "utf8cpp (unbounded, assume valid)",
+			"pattern": "utf8cpp_unchecked$",
+			"description": "Measures the utf8cpp library and its conversion routines, which do not check for available output space and also assume the input is valid."
 		},
 		{
 			"name": "ICU (2 UConverter convertEx)",
@@ -255,11 +241,46 @@
 			"pattern": "ztd_text_view$",
 			"description": "Measures the ztd.text library conversion routine using the ztd::text::transcode_view range abstraction."
 		},
+		{
+			"name": "ztd.text (assume valid)",
+			"pattern": "ztd_text_unchecked$",
+			"description": "Measures the ztd.text library conversion routine using the ztd::text::transcode function."
+		},
+		{
+			"name": "ztd.text (unbounded, assume valid)",
+			"pattern": "ztd_text_unbounded_unchecked$",
+			"description": "Measures the ztd.text library conversion routine using the ztd::text::transcode function, with output bounds checking turned off."
+		},
+		{
+			"name": "ztd.text (single loop, assume valid)",
+			"pattern": "ztd_text_single_unchecked$",
+			"description": "Measures the ztd.text library conversion routine using the ztd::text::transcode_one function."
+		},
+		{
+			"name": "ztd.text (single loop, unbounded, assume valid)",
+			"pattern": "ztd_text_single_unbounded_unchecked$",
+			"description": "Measures the ztd.text library conversion routine using the ztd::text::transcode_one function in a loop, with output bounds checking turned off."
+		},
+		{
+			"name": "ztd.text view (assume valid)",
+			"pattern": "ztd_text_view_unchecked$",
+			"description": "Measures the ztd.text library conversion routine using the ztd::text::transcode_view range abstraction."
+		},
 		{
 			"name": "Win32",
 			"pattern": "windows_api$",
 			"description": "Measures the Win32 API conversion routine using MultibyteToWideChar function or the WideCharToMultibyte function."
 		},
+		{
+			"name": "Standard C",
+			"pattern": "standard_c$",
+			"description": "Measures the <uchar.h>, <wchar.h>, and similar functionality present in the C standard library."
+		},
+		{
+			"name": "Standard C++",
+			"pattern": "standard_cpp$",
+			"description": "Measures the deprecated std::codecvt_* functionality present in the C++ standard library."
+		},
 		{
 			"name": "noop",
 			"pattern": "noop$",

diff --git a/benchmarks/conversion_speed/include/ztd/text/benchmarks/cuneicode_helpers.hpp b/benchmarks/conversion_speed/include/ztd/text/benchmarks/cuneicode_helpers.hpp
@@ -39,45 +39,49 @@
 #include <vector>
 #include <memory_resource>
 
-struct conversion_deleter {
-	void operator()(cnc_conversion* conv) const noexcept {
-		cnc_conv_delete(conv);
-	}
-};
+inline namespace ztd_text_benchmarks_conversion_speed_cuneicode_help {
 
-struct registry_deleter {
-	void operator()(cnc_conversion_registry* registry) const noexcept {
-		cnc_registry_delete(registry);
-	}
-};
+	struct conversion_deleter {
+		void operator()(cnc_conversion* conv) const noexcept {
+			cnc_conv_delete(conv);
+		}
+	};
 
-struct conversion_closer {
-	void operator()(cnc_conversion* conv) const noexcept {
-		cnc_conv_close(conv);
-	}
-};
+	struct registry_deleter {
+		void operator()(cnc_conversion_registry* registry) const noexcept {
+			cnc_registry_delete(registry);
+		}
+	};
+
+	struct conversion_closer {
+		void operator()(cnc_conversion* conv) const noexcept {
+			cnc_conv_close(conv);
+		}
+	};
 
-struct registry_closer {
-	void operator()(cnc_conversion_registry* registry) const noexcept {
-		cnc_close_registry(registry);
+	struct registry_closer {
+		void operator()(cnc_conversion_registry* registry) const noexcept {
+			cnc_registry_close(registry);
+		}
+	};
+
+	inline void* mbr_allocate(size_t requested_size, size_t alignment, size_t* p_actual_size, void* user_data) {
+		std::pmr::monotonic_buffer_resource& mbr = *static_cast<std::pmr::monotonic_buffer_resource*>(user_data);
+		void* ptr                                = mbr.allocate(requested_size, alignment);
+		*p_actual_size                           = requested_size;
+		return ptr;
 	}
-};
 
-inline void* mbr_allocate(size_t requested_size, size_t alignment, size_t* p_actual_size, void* user_data) {
-	std::pmr::monotonic_buffer_resource& mbr = *static_cast<std::pmr::monotonic_buffer_resource*>(user_data);
-	void* ptr                                = mbr.allocate(requested_size, alignment);
-	*p_actual_size                           = requested_size;
-	return ptr;
-}
+	inline void mbr_deallocate(void* ptr, size_t ptr_size, size_t alignment, void* user_data) {
+		std::pmr::monotonic_buffer_resource& mbr = *static_cast<std::pmr::monotonic_buffer_resource*>(user_data);
+		mbr.deallocate(ptr, ptr_size, alignment);
+	}
 
-inline void mbr_deallocate(void* ptr, size_t ptr_size, size_t alignment, void* user_data) {
-	std::pmr::monotonic_buffer_resource& mbr = *static_cast<std::pmr::monotonic_buffer_resource*>(user_data);
-	mbr.deallocate(ptr, ptr_size, alignment);
-}
+	inline cnc_conversion_heap create_monotonic_buffer_heap(std::pmr::monotonic_buffer_resource& resource) {
+		cnc_conversion_heap mbr_heap = { &resource, mbr_allocate, nullptr, nullptr, nullptr, mbr_deallocate };
+		return mbr_heap;
+	}
 
-inline cnc_conversion_heap create_monotonic_buffer_heap(std::pmr::monotonic_buffer_resource& resource) {
-	cnc_conversion_heap mbr_heap = { &resource, mbr_allocate, nullptr, nullptr, nullptr, mbr_deallocate };
-	return mbr_heap;
-}
+} // namespace ztd_text_benchmarks_conversion_speed_cuneicode_help
 
 #endif
diff --git a/benchmarks/conversion_speed/include/ztd/text/benchmarks/fast_transcode.ztd.text.hpp b/benchmarks/conversion_speed/include/ztd/text/benchmarks/fast_transcode.ztd.text.hpp
@@ -126,6 +126,8 @@ TEXT_TRANSCODE_EXTENSION_POINTS(16, 32, be, le, , );
 TEXT_TRANSCODE_EXTENSION_POINTS(32, 8, , , , );
 /// This one is expanded below, as an example, with commentary!
 // TEXT_TRANSCODE_EXTENSION_POINTS(8, 32, , , , );
+// Macro hygiene!
+#undef TEXT_TRANSCODE_EXTENSION_POINTS
 
 template <typename FromErrorHandler, typename ToErrorHandler, typename FromState, typename ToState, typename PivotRange>
 auto text_transcode(::ztd::tag<ztd::text::utf8_t, ztd::text::utf32_t>, ztd::span<const ztd_char8_t> input,
@@ -214,7 +216,4 @@ auto text_transcode(::ztd::tag<ztd::text::utf8_t, ztd::text::utf32_t>, ztd::span
 	     from_state, to_state, __pivot);
 }
 
-// Macro hygiene!
-#undef TEXT_TRANSCODE_EXTENSION_POINTS
-
 #endif