Skip to content

Commit

Permalink
✨ Punycode encoding, normalization skeletons, and documentation updates.
Browse files Browse the repository at this point in the history
  • Loading branch information
ThePhD committed Sep 11, 2022
1 parent db86e29 commit e8dce10
Show file tree
Hide file tree
Showing 70 changed files with 2,176 additions and 1,214 deletions.
2 changes: 1 addition & 1 deletion .readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Contact: opensource@soasis.org
#
# Commercial License Usage
# Licensees holding valid commercial ztd.cuneicode licenses may use this file in
# Licensees holding valid commercial ztd.text licenses may use this file in
# accordance with the commercial license agreement provided with the
# Software or, alternatively, in accordance with the terms contained in
# a written agreement between you and Shepherd's Oasis, LLC.
Expand Down
91 changes: 13 additions & 78 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
#
# ============================================================================>

cmake_minimum_required(VERSION 3.16.0)
cmake_policy(VERSION 3.16)
cmake_minimum_required(VERSION 3.21.0)
cmake_policy(VERSION 3.21)

# # Project kickstart
# Includes a bunch of basic flags and utilities shared across projects
Expand Down Expand Up @@ -119,8 +119,6 @@ option(ZTD_TEXT_EXAMPLES "Enable build of examples" OFF)
option(ZTD_TEXT_BENCHMARKS "Enable build of benchmarks" OFF)
option(ZTD_TEXT_GENERATE_SINGLE "Enable generation of a single header and its target" OFF)
option(ZTD_TEXT_USE_CUNEICODE "Use ztd.cuneicode" OFF)
option(ZTD_TEXT_USE_LIBICONV "Use libiconv" OFF)
option(ZTD_TEXT_USE_STATIC_LIBICONV "Use libiconv and link against it even if it's a static library" OFF)

# # Dependencies
# ztd.idk
Expand All @@ -136,71 +134,16 @@ FetchContent_Declare(ztd.static_containers
FetchContent_MakeAvailable(ztd.static_containers)

# ztd.cuneicode
if(ZTD_TEXT_USE_CUNEICODE)
FetchContent_Declare(ztd.cuneicode
GIT_REPOSITORY https://github.com/soasis/cuneicode.git
GIT_TAG main)
FetchContent_MakeAvailable(ztd.cuneicode)
set(ztd-text-cuneicode ztd::cuneicode)
endif()

# iconv - static and dynamic
if(NOT TARGET Iconv::Iconv AND ZTD_TEXT_USE_LIBICONV)
# because of iconv's license, using it as a static dependency
# is tricky: therefore, it's only enabled for folks who explicitly asked for it,
# regardless of whether it is available or not
find_package(Iconv)
endif()
FetchContent_Declare(ztd.cuneicode
GIT_REPOSITORY https://github.com/soasis/cuneicode.git
GIT_TAG main)
FetchContent_MakeAvailable(ztd.cuneicode)

# define generator expressions for each moment
string(CONCAT ztd-text-libiconv-define
$<IF:$<BOOL:${ZTD_TEXT_USE_LIBICONV}>,
ZTD_LIBICONV=1,
ZTD_LIBICONV=0
>
)
string(CONCAT ztd-text-libiconv-load-define
$<IF:$<AND:$<BOOL:${Iconv_FOUND}>,$<BOOL:${ZTD_TEXT_USE_LIBICONV}>>,
ZTD_LIBICONV_LOAD=0,
ZTD_LIBICONV_LOAD=1
>
)
string(CONCAT ztd-text-libiconv-header-define
$<IF:$<AND:$<BOOL:${Iconv_FOUND}>,$<BOOL:${ZTD_TEXT_USE_LIBICONV}>>,
ZTD_ICONV_H=1,
ZTD_ICONV_H=0
>
)
string(CONCAT ztd-text-libiconv
$<TARGET_NAME_IF_EXISTS:Iconv::Iconv>
)
string(CONCAT ztd-text-libiconv-dl
$<$<BOOL:ztd-text-libiconv>:${CMAKE_DL_LIBS}>
)
string(CONCAT ztd-text-static-libiconv-define
$<IF:
$<AND:$<BOOL:${Iconv_FOUND}>, $<BOOL:${ZTD_TEXT_USE_STATIC_LIBICONV}>,
$<STREQUAL:
$<$<BOOL:$<TARGET_NAME_IF_EXISTS:Iconv::Iconv>>:$<TARGET_PROPERTY:Iconv::Iconv,TYPE>>,
STATIC_LIBRARY
>
>,
ZTD_LIBICONV_STATIC=1,
ZTD_LIBICONV_STATIC=0
>
)
string(CONCAT ztd-text-dynamic-libiconv-define
$<IF:
$<AND:$<BOOL:Iconv_FOUND>, $<BOOL:ZTD_TEXT_USE_LIBICONV>,
$<STREQUAL:
$<$<BOOL:$<TARGET_NAME_IF_EXISTS:Iconv::Iconv>>:$<TARGET_PROPERTY:Iconv::Iconv,TYPE>>,
SHARED_LIBRARY
>
>,
ZTD_LIBICONV_DYNAMIC=1,
ZTD_LIBICONV_DYNAMIC=0
>
)
# ztd.platform
FetchContent_Declare(ztd.platform
GIT_REPOSITORY https://github.com/soasis/platform.git
GIT_TAG main)
FetchContent_MakeAvailable(ztd.platform)

# Main library declarations
file(GLOB_RECURSE ztd.text.includes CONFIGURE_DEPENDS include/*.hpp)
Expand All @@ -212,20 +155,12 @@ target_include_directories(ztd.text
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_sources(ztd.text INTERFACE ${ztd.text.includes})
target_compile_definitions(ztd.text
INTERFACE
${ztd-text-libiconv-define}
${ztd-text-libiconv-load-define}
${ztd-text-libiconv-header-define}
${ztd-text-static-libiconv-define}
${ztd-text-dynamic-libiconv-define})
target_link_libraries(ztd.text
INTERFACE
${ztd-text-libiconv-dl}
ztd::idk
ztd::static_containers
${ztd-text-libiconv}
${ztd-text-cuneicode})
ztd::platform
ztd::cuneicode)
install(DIRECTORY include/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

Expand Down
17 changes: 1 addition & 16 deletions cmake/ztd.text-config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,5 @@ if (TARGET ztd::text)
get_target_property(ZTD_TEXT_INCLUDE_DIRS
ztd.text INTERFACE_INCLUDE_DIRECTORIES)
set_and_check(ZTD_TEXT_INCLUDE_DIRS "${ZTD_TEXT_INCLUDE_DIRS}")
if (TARGET ztd::cuneicode)
set(ZTD_TEXT_LIBRARIES ztd::text ztd::cuneicode)
else()
set(ZTD_TEXT_LIBRARIES ztd::text)
endif()
endif()

if(TARGET ztd::text::single)
get_target_property(ZTD_TEXT_SINGLE_INCLUDE_DIRS
ztd.text.single INTERFACE_INCLUDE_DIRECTORIES)
set_and_check(ZTD_TEXT_SINGLE_INCLUDE_DIRS "${ZTD_TEXT_SINGLE_INCLUDE_DIRS}")
if (TARGET ztd::cuneicode::single)
set(ZTD_TEXT_LIBRARIES_SINGLE ztd::text::single ztd::cuneicode::single)
else()
set(ZTD_TEXT_LIBRARIES_SINGLE ztd::text::single)
endif()
set(ZTD_TEXT_LIBRARIES ztd::text ztd::cuneicode)
endif()
2 changes: 1 addition & 1 deletion documentation/Doxyfile.in
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ NUM_PROC_THREADS = 1
# normally produced when WARNINGS is set to YES.
# The default value is: NO.

EXTRACT_ALL = NO
EXTRACT_ALL = YES

# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
# be included in the documentation.
Expand Down
1 change: 1 addition & 0 deletions documentation/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ Properties and Classifications
api/is_code_units_replaceable
api/is_code_points_replaceable
api/is_ignorable_error_handler
api/is_state_complete
api/is_unicode_encoding
api/contains_unicode_encoding
api/is_unicode_code_point
Expand Down
2 changes: 1 addition & 1 deletion documentation/source/api/encodings/basic_iconv.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
basic_iconv
===========

This encoding is only available if the :ref:`configuration macro/build option for ZTD__LIBICONV <config-ZTD_LIBICONV>` is turned on.
This encoding is only available if the configuration macro/build option for ``ZTD_PLATFORM_LIBICONV`` is turned on.

This encoding is tied to the `iconv library <https://www.gnu.org/software/libiconv/>`_. It will attempt to use the header and the functions directly and, if that is not possible, will bootstrap iconv on first use of the encoding through ``GetProcAddress``/``dlsym`` and related. If it cannot find iconv, it will either assert, abort, or loudly annoy the user in some way. The code is retrieved dynamically where possible unless the user explicitly defines the build option for ``ZTD_TEXT_USE_STATIC_LIBICONV`` (for CMake), as iconv is under an LGPL/GPL license and cannot be traditionally built / statically linked with application code (though in the future we may provide a way for software to do that if the software being made with this library is also GPL-compatible software).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,9 @@
..
.. =============================================================================>
🔨 cuneicode_encoding (In Progress)
===================================

.. warning::

|unfinished_warning|

This encoding is only available if the :ref:`configuration macro for ZTD_TEXT_USE_CUNEICODE <config-ZTD_TEXT_USE_CUNEICODE>` is turned on.
🔨 cuneicode_registry_encoding (In Progress)
============================================

This encoding is tied to the `cuneicode library <https://ztdcuneicode.rtfd.io>`_. The cuneicode library is a C library for validation, counting, and transcoding between a fixed set of encodings, with an additional plug for arbitrary encodings that can be added at run-time. This is in opposition to :doc:`iconv </api/encodings/basic_iconv>`, where additional encodings can only be added by-hand through recompiling the code or hooking specific system configuration points.

cuneicode has a variable number of encodings it can be compiled with to support. States are pre-constructed in the encoding itself and copied as necessary when ``encode_state`` or ``decode_state``\ s are being created to call the desired conversion functions. The user can inspect the output error parameter from the ``cuneicode_encoding`` constructor to know of failure, or not pass in the output error parameter and instead take one of a assert, thrown exception, or ``abort`` (preferred invocation in that order).
cuneicode can be compiled with support for a variable number of encodings. States are pre-constructed in the encoding itself and copied as necessary when ``encode_state`` or ``decode_state``\ s are being created to call the desired conversion functions. The user can inspect the output error parameter from the ``cuneicode_registry_encoding`` constructor to know of failure, or not pass in the output error parameter and instead take one of an assert, a thrown exception, or ``abort`` (preferred invocation in that order).
4 changes: 1 addition & 3 deletions documentation/source/api/encodings/execution.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@ This is the locale-based, runtime encoding. It uses a number of compile-time and
Currently, the hierarchy of behaviors is like so:

- If the platform is MacOS, then it assumes this is :doc:`UTF-8 </api/encodings/utf8>`;
- Otherwise, if libiconv is available, then it attempts to use :doc:`iconv </api/encodings/basic_iconv>` configured to the ``"char"``-identified encoding;
- Otherwise, if the headers ``<cuchar>`` or ``<uchar.h>`` are available, then it attempts to use a gnarly, lossy, and dangerous encoding that potentially traffics through the C Standard Library and Locale APIs;
- Otherwise, it produces a compile-time error.
- Otherwise, :term:`cuneicode` is used.

.. warning::

Expand Down
54 changes: 54 additions & 0 deletions documentation/source/api/encodings/punycode.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
.. =============================================================================
..
.. ztd.text
.. Copyright © 2022 JeanHeyd "ThePhD" Meneide and Shepherd's Oasis, LLC
.. Contact: opensource@soasis.org
..
.. Commercial License Usage
.. Licensees holding valid commercial ztd.text licenses may use this file in
.. accordance with the commercial license agreement provided with the
.. Software or, alternatively, in accordance with the terms contained in
.. a written agreement between you and Shepherd's Oasis, LLC.
.. For licensing terms and conditions see your agreement. For
.. further information contact opensource@soasis.org.
..
.. Apache License Version 2 Usage
.. Alternatively, this file may be used under the terms of Apache License
.. Version 2.0 (the "License") for non-commercial use; you may not use this
.. file except in compliance with the License. You may obtain a copy of the
.. License at
..
.. https://www.apache.org/licenses/LICENSE-2.0
..
.. Unless required by applicable law or agreed to in writing, software
.. distributed under the License is distributed on an "AS IS" BASIS,
.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
.. See the License for the specific language governing permissions and
.. limitations under the License.
..
.. =============================================================================>
punycode
========

Punycode is an encoding that turns Unicode-encoded text into a sequence of ASCII code units, making it readily inspectable by simple systems and human eyes to see differences between text even if confusable characters are used in the Unicode text. It is most frequently used in the Internationalized Domain Names in Applications (IDNA) system, specifically for DNS purposes. Despite its unfriendliness to stream-based, one-at-a-time interfaces, sufficiently heap-based state allows encoding and decoding in a useful manner.

.. doxygenvariable:: ztd::text::punycode

.. doxygenvariable:: ztd::text::punycode_idna

.. doxygentypedef:: ztd::text::punycode_t

.. doxygentypedef:: ztd::text::punycode_idna_t

.. doxygentypedef:: ztd::text::basic_punycode

.. doxygentypedef:: ztd::text::basic_punycode_idna



Base Template
-------------

.. doxygenclass:: ztd::text::basic_any_punycode
:members:
5 changes: 1 addition & 4 deletions documentation/source/api/encodings/wide_execution.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,7 @@ This is the locale-based, wide runtime encoding. It uses a number of compile-tim
Currently, the hierarchy of behaviors is like so:

- If the platform is Windows, then it assumes this is :doc:`UTF-16 </api/encodings/utf16>`;
- Otherwise, if libiconv is available, then it attempts to use :doc:`iconv </api/encodings/basic_iconv>` configured to the ``"wchar_t"``-identified encoding;
- Otherwise, if the platform is MacOS and ``WCHAR_MAX`` is greater than the maximum of an unsigned 21-bit number, or ``__STDC_ISO_10646__`` is defined, then it attempts to use :doc:`UTF-32 </api/encodings/utf32>`;
- Otherwise, if the headers ``<cwchar>`` or ``<wchar.h>`` are available, then it attempts to use a gnarly, lossy, and dangerous encoding that potentially traffics through the C Standard Library and Locale APIs in conjunction with a roundtrip through the :doc:`ztd::text::execution </api/encodings/execution>` encoding;
- Otherwise, it produces a compile-time error.
- Otherwise, cuneicode is used.

.. warning::

Expand Down
34 changes: 34 additions & 0 deletions documentation/source/api/is_state_complete.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
.. =============================================================================
..
.. ztd.text
.. Copyright © 2022 JeanHeyd "ThePhD" Meneide and Shepherd's Oasis, LLC
.. Contact: opensource@soasis.org
..
.. Commercial License Usage
.. Licensees holding valid commercial ztd.text licenses may use this file in
.. accordance with the commercial license agreement provided with the
.. Software or, alternatively, in accordance with the terms contained in
.. a written agreement between you and Shepherd's Oasis, LLC.
.. For licensing terms and conditions see your agreement. For
.. further information contact opensource@soasis.org.
..
.. Apache License Version 2 Usage
.. Alternatively, this file may be used under the terms of Apache License
.. Version 2.0 (the "License") for non-commercial use; you may not use this
.. file except in compliance with the License. You may obtain a copy of the
.. License at
..
.. https://www.apache.org/licenses/LICENSE-2.0
..
.. Unless required by applicable law or agreed to in writing, software
.. distributed under the License is distributed on an "AS IS" BASIS,
.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
.. See the License for the specific language governing permissions and
.. limitations under the License.
..
.. =============================================================================>
is_state_complete
=================

.. doxygenfunction:: ztd::text::is_state_complete
3 changes: 3 additions & 0 deletions documentation/source/bibliography.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ These are all the resources that this documentation links to, in alphabetical or
.. glossary::
:sorted:

cuneicode
JeanHeyd Meneide & Shepherd's Oasis, LLC. cuneicode. September 2022. URL: `https://github.com/soasis/cuneicode <https://github.com/soasis/cuneicode>`_. `A software library <https://ztdcuneicode.rtfd.io>`_ for working with and converting text.

iconv
Bruno Haible and Daiki Ueno. libiconv. August 2020. URL: `https://savannah.gnu.org/git/?group=libiconv <https://savannah.gnu.org/git/?group=libiconv>`_. `A software library <https://www.gnu.org/software/libiconv/>`_ for working with and converting text. Typically ships on most, if not all, POSIX and Linux systems.

Expand Down
10 changes: 0 additions & 10 deletions documentation/source/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,6 @@ There are various configuration macros and CMake/build-time switches that will c
- Default: off.
- Turned on if the special ``__has_include`` directive is present with the compiler and ``__has_include(<ztd/cuneicode/cuneicode.h>)`` works.

.. _config-ZTD_LIBICONV:

- ``ZTD_LIBICONV`` (CMake: ``ZTD_TEXT_USE_ICONV``)
- Enables use of the `iconv project <https://www.gnu.org/software/libiconv/>`_.
- Attempts to use the headers directly if possible.
- Otherwise, attempts to load it from the system at runtime using ``GetProcAddress``, or ``dlopen``/``dlsym``/``dlclose``.
- Makes the ``ztd::text::basic_iconv`` available (accessible directly VIA ``#include <ztd/text/basic_iconv.hpp>``).
- Default: off.
- Not turned on by-default under any conditions.

.. _config-ZTD_TEXT_UNICODE_CODE_POINT_DISTINCT_TYPE:

- ``ZTD_TEXT_UNICODE_CODE_POINT_DISTINCT_TYPE``
Expand Down
2 changes: 1 addition & 1 deletion documentation/source/encodings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ As a general point, we hope to support almost all of the encodings here in one f
* - ``cuneicode`` Encoding
- Yes
- Yes
- :doc:`WIP 🛠️ </api/encodings/cuneicode_encoding>`
- :doc:`WIP 🛠️ </api/encodings/cuneicode_registry_encoding>`
* - UTF-EBCDIC
- No
- Yes
Expand Down
14 changes: 0 additions & 14 deletions documentation/source/future.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,17 +92,3 @@ Normalization
- ☐ Insertion (Fast normalization-preserving splicing/inserting algorithm)
- ☐ Deletion
- ☐ Converting Constructors between compatible types (errors the same way :doc:`lossy conversion protection </design/error handling/lossy protection>` describes if they are not compatible, forcing a user to pass in an error handler.)



``iconv``
---------

There should be an encoding that loads ``iconv`` dynamically from the system, if it is present, before using it to do conversions.



``cuneicode``
-------------

There should be a cuneicode-based encoding, for the update C implementation of all of these things.
6 changes: 6 additions & 0 deletions examples/shift_jis/source/shift_jis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,12 @@ shift_jis::sjis_decode_result shift_jis::decode_one(ztd::span<const shift_jis::c
}
else if ((first_byte <= 0x9F && first_byte >= 0x81) || (first_byte <= 0xFC && first_byte >= 0xE0)) {
// Top-Level case 2: this is a double-byte sequence!
if (in_it == in_last) {
return error_handler(*this,
sjis_decode_result(std::move(input), std::move(output), current_state,
ztd::text::encoding_error::incomplete_sequence),
input_span(), output_span());
}
unsigned char second_byte = static_cast<unsigned char>(*in_it);
++in_it;
unsigned char lookup_offset = second_byte < 0x7F ? 0x40 : 0x41;
Expand Down

0 comments on commit e8dce10

Please sign in to comment.