In [10]:
import unicodedata
from collections import defaultdict

In [80]:
# A Unicode category with more members than this is split into subclusters
# (see is_too_large / get_characters_clusters).
MAX_CLUSTER_SIZE = 150
# Two characters whose code points differ by more than this are considered
# "far enough" apart to start a new subcluster (see are_far_enough).
MAX_IN_CLUSTER_DISTANCE = 20


def is_too_large(cluster):
    """Tell whether ``cluster`` has more than ``MAX_CLUSTER_SIZE`` members.

    Args:
        cluster: Any sized collection (only ``len`` is used).

    Returns:
        bool: True when the member count strictly exceeds the limit.
    """
    member_count = len(cluster)
    return MAX_CLUSTER_SIZE < member_count


def are_far_enough(char1, char2):
    """Tell whether two characters lie further apart than the allowed gap.

    Args:
        char1: A single-character string.
        char2: A single-character string.

    Returns:
        bool: True when the absolute difference of the two code points is
        strictly greater than ``MAX_IN_CLUSTER_DISTANCE``.
    """
    first_code_point = ord(char1)
    second_code_point = ord(char2)
    gap = abs(first_code_point - second_code_point)
    return MAX_IN_CLUSTER_DISTANCE < gap


def split_into_subcategories(cluster):
    sorted_cluster = sorted(cluster, key=ord)
    subcats = defaultdict(list)
    current = sorted_cluster[0]
    for i, char in enumerate(sorted_cluster):
        if i > 0 and are_far_enough(char, sorted_cluster[i - 1]):
            current = char
        subcats[current].append(char)
    return subcats


def get_characters_clusters(characters):
    """Group characters by Unicode category, splitting oversized groups.

    Args:
        characters: Iterable of single-character strings.

    Returns:
        dict: Keys are tuples. A category that ``is_too_large`` rejects is
        stored once per subcluster under ``(category, first_char)``, as
        produced by ``split_into_subcategories``; every other category is
        stored whole under ``(category,)``. Values are lists of characters.
    """
    # Stage 1: bucket every character under its Unicode general category.
    by_category = {}
    for character in characters:
        category_name = unicodedata.category(character)
        by_category.setdefault(category_name, []).append(character)

    # Stage 2: keep small buckets whole, break large ones into subclusters.
    result = {}
    for category_name, members in by_category.items():
        if not is_too_large(members):
            result[(category_name,)] = members
            continue
        subclusters = split_into_subcategories(members)
        result.update(
            ((category_name, first_char), run)
            for first_char, run in subclusters.items()
        )

    return result

In [81]:
# The character list holds non-ASCII text, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open('data/znaki_wikipedii.txt', encoding='utf-8') as f:
    # Whitespace-separated tokens; each token is handled as one character.
    chars = f.read().split()

clusters = get_characters_clusters(chars)
# Print each cluster as "<category>[/<first char>]" on one line, followed by
# its space-separated members and a blank line.
for cluster_type, members in sorted(clusters.items()):
    print(
        '/'.join(cluster_type),
        ' '.join(members),
        sep='\n',
        end='\n\n',
    )

Cc
                              

Cf
‎ ­ ‍ ﻿ ‏ ​ ‮ ‬ ‪ ‌ ‫ ‭ ⁯ ⁫ ۝ ⁬ ⁮ ⁠

Cn
￴

Co
                     

Ll/a
a b c d e f g h i j k l m n o p q r s t u v w x y z

Ll/µ
µ

Ll/ß
ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ ā ă ą ć ĉ ċ č ď đ ē ĕ ė ę ě ĝ ğ ġ ģ ĥ ħ ĩ ī ĭ į ı ĳ ĵ ķ ĸ ĺ ļ ľ ŀ ł ń ņ ň ŉ ŋ ō ŏ ő œ ŕ ŗ ř ś ŝ ş š ţ ť ŧ ũ ū ŭ ů ű ų ŵ ŷ ź ż ž ſ ƀ ƃ ƅ ƈ ƌ ƍ ƒ ƕ ƙ ƚ ƛ ƞ ơ ƣ ƥ ƨ ƪ ƫ ƭ ư ƴ ƶ ƹ ƺ ƽ ƾ ƿ ǆ ǉ ǌ ǎ ǐ ǒ ǔ ǖ ǘ ǚ ǜ ǝ ǟ ǡ ǣ ǥ ǧ ǩ ǫ ǭ ǯ ǰ ǳ ǵ ǹ ǻ ǽ ǿ ȁ ȃ ȅ ȇ ȉ ȋ ȍ ȏ ȑ ȓ ȕ ȗ ș ț ȝ ȟ ȡ ȣ ȥ ȧ ȩ ȫ ȭ ȯ ȱ ȳ ȴ ȵ ȶ ȷ ȸ ȹ ȼ ȿ ɀ ɂ ɇ ɉ ɋ ɍ ɏ ɐ ɑ ɒ ɓ ɔ ɕ ɖ ɗ ɘ ə ɚ ɛ ɜ ɝ ɞ ɟ ɠ ɡ ɢ ɣ ɤ ɥ ɦ ɧ ɨ ɩ ɪ ɫ ɬ ɭ ɮ ɯ ɰ ɱ ɲ ɳ ɴ ɵ ɶ ɷ ɸ ɹ ɺ ɻ ɼ ɽ ɾ ɿ ʀ ʁ ʂ ʃ ʄ ʅ ʆ ʇ ʈ ʉ ʊ ʋ ʌ ʍ ʎ ʏ ʐ ʑ ʒ ʓ ʕ ʖ ʗ ʘ ʙ ʚ ʛ ʜ ʝ ʞ ʟ ʠ ʡ ʢ ʣ ʤ ʥ ʦ ʧ ʨ ʩ ʪ ʫ ʬ ʭ ʮ ʯ

Ll/ͱ
ͱ ͳ ͻ

Ll/ΐ
ΐ

Ll/ά
ά έ ή ί ΰ α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ ς σ τ υ φ χ ψ ω ϊ ϋ ό ύ ώ ϐ ϑ ϕ ϖ ϗ ϙ ϛ ϝ ϟ ϡ ϩ ϰ ϱ ϲ ϵ ϸ ϻ ϼ

Ll/а
а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы