# Analysis of Unicode character names

In [1]:
import unicodedata
import collections
import sys

# Display the version string of the running Python interpreter.
sys.version

'3.7.1 (default, Dec 14 2018, 13:28:58) \n[Clang 4.0.1 (tags/RELEASE_401/final)]'

## Largest Unicode code point

In [2]:
# Largest code point value; the loops below scan range(sys.maxunicode + 1).
sys.maxunicode

1114111

## Top 20 words used in character names

In [3]:
# Tally every whitespace-separated word found in the names of all code points.
# Passing '' as the default to unicodedata.name() makes unnamed code points
# contribute no words, instead of raising ValueError.
c = collections.Counter(
    token
    for code_point in range(sys.maxunicode + 1)
    for token in unicodedata.name(chr(code_point), '').split()
)

# Last expression of the cell: the 20 most frequent words with their counts.
c.most_common(20)

[('CJK', 89107),
 ('UNIFIED', 87942),
 ('SYLLABLE', 13393),
 ('HANGUL', 11735),
 ('LETTER', 10017),
 ('SIGN', 3156),
 ('WITH', 2676),
 ('SMALL', 2595),
 ('CAPITAL', 1967),
 ('HIEROGLYPH', 1654),
 ('LATIN', 1495),
 ('ARABIC', 1250),
 ('YI', 1249),
 ('CUNEIFORM', 1234),
 ('MATHEMATICAL', 1152),
 ('SYMBOL', 1151),
 ('EGYPTIAN', 1071),
 ('COMPATIBILITY', 1014),
 ('DIGIT', 828),
 ('FORM', 812)]

## All characters used in Unicode character names

In [4]:
# Gather every distinct character that occurs in any code point's name.
chars = set()

for code_point in range(sys.maxunicode + 1):
    # The '' default stands in for code points that have no name.
    char_name = unicodedata.name(chr(code_point), '')
    if char_name:
        chars.update(char_name)

# Print the sorted characters as one '|'-delimited row, with a bar at each end.
row = '|'.join(sorted(chars))
print(f'|{row}|')

| |-|0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|


Conclusion: the only characters that appear in Unicode character names (as returned by `unicodedata.name`) are the uppercase letters A..Z, the digits 0..9, space, and hyphen.