In [14]:
import pandas as pd


def hex_format(n: int, /, digits: int=4) -> str:
    """Render ``n`` as upper-case hexadecimal, zero-padded to ``digits`` characters.

    Values that need more than ``digits`` hex characters are not truncated;
    the padding width is only a minimum.
    """
    spec = f"0{digits}x"
    lowercase_hex = format(n, spec)
    return lowercase_hex.upper()


def hex2int(s: str, /) -> "int | None":
    """Parse a hexadecimal string into an integer.

    Parameters
    ----------
    s : the text to parse, e.g. ``"00A0"``.

    Returns
    -------
    The parsed integer, or ``None`` when ``s`` cannot be parsed (not valid
    hexadecimal, or not a string at all, such as a missing value). In that
    case the offending value and its type are printed so bad input rows can
    be spotted in the cell output.
    """
    try:
        return int(s, 16)
    except (TypeError, ValueError):
        # Only conversion failures are treated as "bad input"; a bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        print(s, type(s))
        return None

## Metadata, source: <https://www.unicode.org/L2/L1999/UnicodeData.html>

In [15]:
# One (field name, status, explanation) triple per column of UnicodeData.txt,
# listed in the order the fields appear in the data file.
raw_metadata = [
    ("Code value", "normative", "Code value in 4-digit hexadecimal format."),
    (
        "Character name",
        "normative",
        "These names match exactly the names published in Chapter 7 of the Unicode Standard, Version 2.0, except for the two additional characters.",
    ),
    (
        "General category",
        "normative / informative(see below)",
        'This is a useful breakdown into various "character types" which can be used as a default categorization in implementations. See below for a brief explanation.',
    ),
    (
        "Canonical combining classes",
        "normative",
        "The classes used for the Canonical Ordering Algorithm in the Unicode Standard. These classes are also printed in Chapter 4 of the Unicode Standard.",
    ),
    (
        "Bidirectional category",
        "normative",
        "See the list below for an explanation of the abbreviations used in this field. These are the categories required by the Bidirectional Behavior Algorithm in the Unicode Standard. These categories are summarized in Chapter 3 of the Unicode Standard.",
    ),
    (
        "Character decomposition mapping",
        "normative",
        "In the Unicode Standard, not all of the mappings are full (maximal) decompositions. Recursive application of look-up for decompositions will, in all cases, lead to a maximal decomposition. The decomposition mappings match exactly the decomposition mappings published with the character names in the Unicode Standard.",
    ),
    (
        "Decimal digit value",
        "normative",
        "This is a numeric field. If the character has the decimal digit property, as specified in Chapter 4 of the Unicode Standard, the value of that digit is represented with an integer value in this field",
    ),
    (
        "Digit value",
        "normative",
        "This is a numeric field. If the character represents a digit, not necessarily a decimal digit, the value is here. This covers digits which do not form decimal radix forms, such as the compatibility superscript digits",
    ),
    (
        "Numeric value",
        "normative",
        'This is a numeric field. If the character has the numeric property, as specified in Chapter 4 of the Unicode Standard, the value of that character is represented with an integer or rational number in this field. This includes fractions as, e.g., "1/5" for U+2155 VULGAR FRACTION ONE FIFTH Also included are numerical values for compatibility characters such as circled numbers.',
    ),
    (
        "Mirrored",
        "normative",
        'If the character has been identified as a "mirrored" character in bidirectional text, this field has the value "Y"; otherwise "N". The list of mirrored characters is also printed in Chapter 4 of the Unicode Standard.',
    ),
    (
        "Unicode 1.0 Name",
        "informative",
        "This is the old name as published in Unicode 1.0. This name is only provided when it is significantly different from the Unicode 3.0 name for the character.",
    ),
    (
        "10646 comment field",
        "informative",
        "This is the ISO 10646 comment field. It is in parantheses in the 10646 names list.",
    ),
    (
        "Uppercase mapping",
        "informative",
        "Upper case equivalent mapping. If a character is part of an alphabet with case distinctions, and has an upper case equivalent, then the upper case equivalent is in this field. See the explanation below on case distinctions. These mappings are always one-to-one, not one-to-many or many-to-one. This field is informative.",
    ),
    ("Lowercase mapping", "informative", "Similar to Uppercase mapping"),
    # The explanation below previously contained a pasted copy of the whole
    # tuple; it is meant to read the same as the Lowercase entry.
    ("Titlecase mapping", "informative", "Similar to Uppercase mapping"),
]

metadata = pd.DataFrame(raw_metadata, columns=["Field Name", "Status", "Explanation"])
metadata

Unnamed: 0,Field Name,Status,Explanation
0,Code value,normative,Code value in 4-digit hexadecimal format.
1,Character name,normative,These names match exactly the names published ...
2,General category,normative / informative(see below),"This is a useful breakdown into various ""chara..."
3,Canonical combining classes,normative,The classes used for the Canonical Ordering Al...
4,Bidirectional category,normative,See the list below for an explanation of the a...
5,Character decomposition mapping,normative,"In the Unicode Standard, not all of the mappin..."
6,Decimal digit value,normative,This is a numeric field. If the character has ...
7,Digit value,normative,This is a numeric field. If the character repr...
8,Numeric value,normative,This is a numeric field. If the character has ...
9,Mirrored,normative,"If the character has been identified as a ""mir..."


## Read data file

In [16]:
# UnicodeData.txt is a header-less, semicolon-separated file; column names are
# taken from the metadata table built above.
data = pd.read_csv(
    "ucd/unicodedata.txt",
    sep=";",
    names=metadata["Field Name"],
    # Keep the hex code values as text: without an explicit dtype the column
    # type is left to inference, and a value parsed as a number would make
    # hex2int fail and yield a missing code point.
    dtype={"Code value": str},
)
# Index the table by the integer code point so rows can be looked up by `ord`.
data = data.assign(code_point=data["Code value"].map(hex2int)).set_index("code_point")
data

Unnamed: 0_level_0,Code value,Character name,General category,Canonical combining classes,Bidirectional category,Character decomposition mapping,Decimal digit value,Digit value,Numeric value,Mirrored,Unicode 1.0 Name,10646 comment field,Uppercase mapping,Lowercase mapping,Titlecase mapping
code_point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0000,<control>,Cc,0,BN,,,,,N,,,,,
1,0001,<control>,Cc,0,BN,,,,,N,START OF HEADING,,,,
2,0002,<control>,Cc,0,BN,,,,,N,START OF TEXT,,,,
3,0003,<control>,Cc,0,BN,,,,,N,END OF TEXT,,,,
4,0004,<control>,Cc,0,BN,,,,,N,END OF TRANSMISSION,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917999,E01EF,VARIATION SELECTOR-256,Mn,0,NSM,,,,,N,,,,,
983040,F0000,"<Plane 15 Private Use, First>",Co,0,L,,,,,N,,,,,
1048573,FFFFD,"<Plane 15 Private Use, Last>",Co,0,L,,,,,N,,,,,
1048576,100000,"<Plane 16 Private Use, First>",Co,0,L,,,,,N,,,,,


## Read blocks.txt file

In [17]:
# Load the block table ("START..END; Block Name" lines, '#' comments ignored)
# and add the numeric start/end code points parsed from the range column.
blocks = pd.read_csv(
    "ucd/blocks.txt",
    sep=";",
    comment="#",
    names=["block", "block_name"],
).pipe(
    lambda frame: frame.assign(
        block_name=frame["block_name"].str.strip(),
        # Split "XXXX..YYYY" into two columns and convert both from hex.
        **(
            frame["block"]
            .str.split("..", regex=False, expand=True)
            .rename(columns={0: "start", 1: "end"})
            .map(hex2int)
        ),
    )
)
blocks

## Preprocess the blocks data

In [19]:
from typing import NamedTuple, Optional


class UnicodeEntry(NamedTuple):
    """One row of a per-block character table.

    The two trailing fields default to ``None``; nothing in the visible
    notebook cells fills them in.
    """

    # Hex text of the code point (built with hex_format where entries are created).
    code_point: str
    # The character itself, wrapped in double quotes where entries are created.
    symbol: str
    # Block range text as read from the blocks table, e.g. "0000..007F".
    block: str
    # Human-readable block name, e.g. "Basic Latin".
    block_name: str
    # Character name (capitalized where entries are created).
    name: str
    # NOTE(review): presumably a tab-completion alias for the character — confirm.
    tab_completion: Optional[str] = None
    # NOTE(review): presumably the LaTeX command for the character — confirm.
    latex_code: Optional[str] = None

In [20]:
def format_block(s: str, /, digits: int = 6) -> str:
    """Normalize a block range such as ``"0000..007F"`` to ``"[000000-00007F]"``.

    Parameters
    ----------
    s : range text with exactly two hexadecimal bounds separated by ``".."``.
    digits : minimum number of hex digits for each bound (zero-padded).
        Previously this argument was accepted but ignored and the width was
        always 6; the default keeps that output unchanged.

    Raises
    ------
    ValueError : if ``s`` contains no ``".."`` separator.
    AssertionError : if ``s`` contains more than one ``".."`` separator.
    """
    lower, upper, *rest = s.split('..')
    assert not rest
    lower = hex_format(hex2int(lower), digits=digits)
    upper = hex_format(hex2int(upper), digits=digits)
    return f"[{'-'.join([lower, upper])}]"


def format_name(s: str, /) -> str:
    """Turn a display name into a lower-case, underscore-separated slug.

    Every single space becomes one underscore; no other characters change.
    """
    words = s.lower().split(" ")
    return "_".join(words)

In [21]:
def _entries_for_block(block, block_name, start, end) -> pd.DataFrame:
    """Build the table of UnicodeEntry rows for one block.

    Only code points in ``start..end`` (inclusive) that have a row in ``data``
    are included.
    """
    rows = []
    for cp in range(start, end + 1):
        if cp not in data.index:
            continue
        character_name = data.loc[cp, "Character name"]
        rows.append(
            UnicodeEntry(
                hex_format(cp),
                f'"{chr(cp)}"',
                block,
                block_name,
                character_name.capitalize(),
            )
        )
    return pd.DataFrame(rows)


# Map (formatted range, slug name) -> table of the characters in that block.
processed_blocks = {
    (format_block(block), format_name(block_name)): _entries_for_block(
        block, block_name, start, end
    )
    for _, (block, block_name, start, end) in blocks.iterrows()
}
next(iter(processed_blocks.items()))

(('[000000-00007F]', 'basic_latin'),
     code_point symbol       block   block_name                 name  \
 0         0000    " "  0000..007F  Basic Latin            <control>   
 1         0001    ""  0000..007F  Basic Latin            <control>   
 2         0002    ""  0000..007F  Basic Latin            <control>   
 3         0003    ""  0000..007F  Basic Latin            <control>   
 4         0004    ""  0000..007F  Basic Latin            <control>   
 ..         ...    ...         ...          ...                  ...   
 123       007B    "{"  0000..007F  Basic Latin   Left curly bracket   
 124       007C    "|"  0000..007F  Basic Latin        Vertical line   
 125       007D    "}"  0000..007F  Basic Latin  Right curly bracket   
 126       007E    "~"  0000..007F  Basic Latin                Tilde   
 127       007F    ""  0000..007F  Basic Latin            <control>   
 
     tab_completion latex_code  
 0             None       None  
 1             None       None 

## Export the blocks to tab-separated CSV files

In [22]:
# Write one tab-separated file per block. Export is best-effort: a block that
# cannot be written is reported and the loop moves on to the next one.
for (block, name), payload in processed_blocks.items():
    try:
        payload.to_csv(f"icons/{block}-{name}.csv", index=False, sep="\t")
    except Exception as exc:
        # Catch Exception rather than using a bare except so Ctrl-C still
        # interrupts the loop, and show the cause so failures can be diagnosed.
        # NOTE(review): surrogate blocks presumably fail because lone
        # surrogates cannot be encoded as UTF-8 — confirm from this output.
        print(f"Failed to save {name}: {exc!r}")

Failed to save basic_latin
Failed to save high_surrogates
Failed to save high_private_use_surrogates
Failed to save low_surrogates
