In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../src")

In [3]:
from CompoTree import ComponentTree, Radicals, TSVariants, CharLexicon, CTFounds
from gensim.models.keyedvectors import KeyedVectors
import igraph as ig

In [4]:
ctree = ComponentTree.load()
radicals = Radicals.load()
tsvars = TSVariants.load()
lexicon = CharLexicon.load()

In [5]:
trad_chars = [x for x in lexicon if not tsvars.is_simplified(x)]

In [32]:
char_pools = [x for x in trad_chars if 
              x != tsvars.convert(x)[0]]
# char_pools = [tsvars.convert(x)[0] for x in char_pools]


In [33]:
char_decomp = []
for x in set(char_pools):
    try:
        decomp_x = (x, ctree.query(x, max_depth=1, use_flag="shortest")[0])
        char_decomp.append(decomp_x)
    except:
        pass

char_decomp = [x for x in char_decomp if not isinstance(x[1], str) and (len(x[1].components())==2)]


In [34]:
compo_dict = {}
edge_dict = {}
for char_x, decomp_x in char_decomp:
    compos = decomp_x.components()
    compo_dict[compos[0]] = compo_dict.get(compos[0], len(compo_dict))
    compo_dict[compos[1]] = compo_dict.get(compos[1], len(compo_dict))
    
    # edges
    eid = (compo_dict[compos[0]], compo_dict[compos[1]])
    if eid in edge_dict:
        print("duplicated decompositions: ", char_x, decomp_x)
    edge_dict[eid] = edge_dict.get(eid, len(edge_dict))

duplicated decompositions:  暈 ⿱['日']['軍']


In [35]:
len(char_pools), len(char_decomp), len(compo_dict), len(edge_dict)

(1673, 1624, 1005, 1623)

In [38]:
# 字表
char_pools[:10]

['銅', '屆', '貶', '區', '諳', '毆', '圇', '變', '懨', '諫']

In [10]:
G = ig.Graph()
for compo_x, compo_id in compo_dict.items():
    G.add_vertex(compo_id, compo=compo_x)
for edge_x, edge_id in edge_dict.items():
    src, tgt = edge_x
    G.add_edge(src, tgt)

In [11]:
compo_rev = {v: k for k, v in compo_dict.items()}
[(compo_rev[x.source], compo_rev[x.target]) for x in G.es[:10]]

[('口', '无'),
 ('禾', '只'),
 ('扌', '参'),
 (<⿰:丬夕>, '酉'),
 ('巾', '贞'),
 ('讠', '隹'),
 ('马', '各'),
 ('二', '山'),
 ('冫', '中'),
 ('庚', '贝')]

In [12]:
G.summary()

'IGRAPH UN-- 812 1572 -- \n+ attr: compo (v), name (v)'

In [13]:
[len(x) for x in G.components()]

[782, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2]

In [14]:
from graphviz import Graph

In [30]:
vG = Graph(comment="Component Graph", format="svg", engine='sfdp')
vG.attr(K="3")
vG.attr("edge", color="#AAAAAA33")
vG.attr("node", shape='point')
# G.attr('node', fontname="HanaMinA")

for compo_x, compo_id in compo_dict.items():
    # vG.node(str(compo_id), str(compo_x))
    vG.node(str(compo_id), fillcolor='blue')
vG.edges([(str(src), str(tgt)) for src, tgt in edge_dict.keys()])

In [31]:
vG.render("Graph.simp.gv.svg", view=True)

'Graph.simp.gv.svg.svg'