In [None]:
import pandas as pd
from pathlib import Path
from typing import NamedTuple, Optional
import warnings

# Input/output locations, resolved against the directory the notebook is run from.
CWD = Path.cwd()
UCD_PATH = CWD / "third_party" / "ucd"  # folder holding the Unicode data text files read below
ICON_PATH = CWD / "icons" / "ucd"  # folder that receives the per-block TSV exports


def hex_format(n: int, /, digits: int = 4) -> str:
    """Render ``n`` as upper-case hexadecimal, zero-padded to at least ``digits`` characters."""
    return format(n, f"0{digits}X")


def hex2int(s: str, /) -> int:
    """Parse the hexadecimal string ``s`` into an integer.

    Raises:
        ValueError: if ``s`` cannot be parsed as base-16 text; the underlying
            parsing error is chained as the cause.
    """
    try:
        parsed = int(s, base=16)
    except ValueError as exc:
        raise ValueError(f"Invalid hex string: {s}") from exc
    return parsed

## Metadata, source: <https://www.unicode.org/L2/L1999/UnicodeData.html>

In [None]:
# One (field name, status, explanation) triple per column of the UnicodeData file,
# listed in column order; the field names double as column labels when the file is read.
raw_metadata = [
    ("Code value", "normative", "Code value in 4-digit hexadecimal format."),
    (
        "Character name",
        "normative",
        "These names match exactly the names published in Chapter 7 of the Unicode Standard, Version 2.0, except for the two additional characters.",
    ),
    (
        "General category",
        "normative / informative(see below)",
        'This is a useful breakdown into various "character types" which can be used as a default categorization in implementations. See below for a brief explanation.',
    ),
    (
        "Canonical combining classes",
        "normative",
        "The classes used for the Canonical Ordering Algorithm in the Unicode Standard. These classes are also printed in Chapter 4 of the Unicode Standard.",
    ),
    (
        "Bidirectional category",
        "normative",
        "See the list below for an explanation of the abbreviations used in this field. These are the categories required by the Bidirectional Behavior Algorithm in the Unicode Standard. These categories are summarized in Chapter 3 of the Unicode Standard.",
    ),
    (
        "Character decomposition mapping",
        "normative",
        "In the Unicode Standard, not all of the mappings are full (maximal) decompositions. Recursive application of look-up for decompositions will, in all cases, lead to a maximal decomposition. The decomposition mappings match exactly the decomposition mappings published with the character names in the Unicode Standard.",
    ),
    (
        "Decimal digit value",
        "normative",
        "This is a numeric field. If the character has the decimal digit property, as specified in Chapter 4 of the Unicode Standard, the value of that digit is represented with an integer value in this field",
    ),
    (
        "Digit value",
        "normative",
        "This is a numeric field. If the character represents a digit, not necessarily a decimal digit, the value is here. This covers digits which do not form decimal radix forms, such as the compatibility superscript digits",
    ),
    (
        "Numeric value",
        "normative",
        'This is a numeric field. If the character has the numeric property, as specified in Chapter 4 of the Unicode Standard, the value of that character is represented with an integer or rational number in this field. This includes fractions as, e.g., "1/5" for U+2155 VULGAR FRACTION ONE FIFTH Also included are numerical values for compatibility characters such as circled numbers.',
    ),
    (
        "Mirrored",
        "normative",
        'If the character has been identified as a "mirrored" character in bidirectional text, this field has the value "Y"; otherwise "N". The list of mirrored characters is also printed in Chapter 4 of the Unicode Standard.',
    ),
    (
        "Unicode 1.0 Name",
        "informative",
        "This is the old name as published in Unicode 1.0. This name is only provided when it is significantly different from the Unicode 3.0 name for the character.",
    ),
    (
        "10646 comment field",
        "informative",
        "This is the ISO 10646 comment field. It is in parantheses in the 10646 names list.",
    ),
    (
        "Uppercase mapping",
        "informative",
        "Upper case equivalent mapping. If a character is part of an alphabet with case distinctions, and has an upper case equivalent, then the upper case equivalent is in this field. See the explanation below on case distinctions. These mappings are always one-to-one, not one-to-many or many-to-one. This field is informative.",
    ),
    ("Lowercase mapping", "informative", "Similar to Uppercase mapping"),
    # The explanation previously carried a pasted copy of this whole tuple inside the string.
    ("Titlecase mapping", "informative", "Similar to Uppercase mapping"),
]

metadata = pd.DataFrame(raw_metadata, columns=["Field Name", "Status", "Explanation"])
metadata

## Read the UnicodeData file (`unicodedata.txt`)

In [None]:
# Parse the semicolon-separated UnicodeData file; column labels come from the
# "Field Name" column of the metadata table.
UCD = pd.read_csv(
    UCD_PATH / "unicodedata.txt",
    sep=";",
    names=metadata["Field Name"],
    # Keep code values as text so leading zeros survive and hex2int always receives
    # a string (int(<int>, 16) would raise an uncaught TypeError).
    dtype={"Code value": str},
    # Only empty fields count as missing. pandas' default NA markers would also turn
    # literal field values such as the Unicode 1.0 name "NULL" into NaN.
    keep_default_na=False,
    na_values=[""],
)
# Index the table by the integer code point for direct lookups.
UCD = UCD.assign(code_point=UCD["Code value"].map(hex2int)).set_index("code_point")
UCD

## Read blocks.txt file

In [None]:
# blocks.txt is read as two ";"-separated columns, with "#" starting a comment:
# a "<start>..<end>" range and the block name.
blocks_raw = pd.read_csv(
    UCD_PATH / "blocks.txt",
    names=["block", "block_name"],
    sep=";",
    comment="#",
)

# Split every range into its two hexadecimal bounds and convert them to integers.
block_bounds = (
    blocks_raw["block"]
    .str.split("..", regex=False, expand=True)
    .rename(columns={0: "start", 1: "end"})
    .map(hex2int)
)

# Resulting column order: block, block_name, start, end.
blocks = blocks_raw.assign(
    block_name=blocks_raw["block_name"].str.strip(),
    start=block_bounds["start"],
    end=block_bounds["end"],
)
blocks

## Preprocess the blocks data

In [None]:
class UnicodeEntry(NamedTuple):
    """One row of a per-block table, describing a single code point.

    The field order is the column order of the DataFrame built from these entries.
    """

    code_point: str  # code point rendered as hexadecimal text
    symbol: str  # the character itself; the caller wraps it in double quotes
    block: str  # raw "<start>..<end>" range string of the containing block
    block_name: str  # name of the containing block
    name: str  # character name
    tab_completion: Optional[str] = None  # never set by the visible code; stays None
    latex_code: Optional[str] = None  # never set by the visible code; stays None

def format_block(s: str, /, digits: int = 6) -> str:
    """Format a ``"<start>..<end>"`` block range as ``"[<START>-<END>]"``.

    Both bounds are parsed as hexadecimal and re-rendered in upper case,
    zero-padded to ``digits`` characters.

    Args:
        s: Range string with exactly one ``..`` separator, e.g. ``"0000..007F"``.
        digits: Minimum width of each formatted bound.

    Raises:
        ValueError: if a bound is missing or is not valid hexadecimal.
        AssertionError: if ``s`` has more than two ``..``-separated parts.
    """
    lower, upper, *rest = s.split("..")
    assert not rest, f"Expected exactly one '..' separator in block range: {s}"
    # Honor the caller-supplied width; it used to be ignored in favor of a hard-coded 6.
    bounds = [hex_format(hex2int(bound), digits=digits) for bound in (lower, upper)]
    return f"[{'-'.join(bounds)}]"


def format_name(s: str, /) -> str:
    """Lower-case ``s`` and replace every space with an underscore."""
    return "_".join(s.lower().split(" "))

In [None]:
# Maps (formatted block range, formatted block name) -> table of that block's entries.
processed_blocks = {}
for _, (block, block_name, start, end) in blocks.iterrows():
    rows = []
    for cp in range(start, end + 1):
        if cp not in UCD.index:
            # code points without a row in the data file are skipped
            continue
        # NOTE(review): for surrogate code points chr(cp) is a lone surrogate, which
        # cannot be encoded as UTF-8 - confirm how such blocks should be exported.
        rows.append(
            UnicodeEntry(
                code_point=hex_format(cp),
                symbol=f'"{chr(cp)}"',
                block=block,
                block_name=block_name,
                name=str(UCD.loc[cp, "Character name"]).capitalize(),
            )
        )
    key = (format_block(block), format_name(block_name))
    processed_blocks[key] = pd.DataFrame(rows)

# Preview the first processed block.
next(iter(processed_blocks.items()))

## Export the blocks to TSV files

In [None]:
# Make sure the output folder exists before writing anything.
ICON_PATH.mkdir(parents=True, exist_ok=True)

# Write one tab-separated file per block, named "<range>-<name>.tsv".
for (block, name), payload in processed_blocks.items():
    fname = f"{block}-{name}.tsv"
    target = ICON_PATH / fname

    try:
        payload.to_csv(target, sep="\t", index=False)
    except Exception as exc:
        # Best-effort export: report the failure and carry on with the next block.
        warnings.warn(f"❌ Failed to save {fname} due to {exc}")
    else:
        print(f"✅ Saved {fname}")