In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before
# each cell runs, so edits to the project sources are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Put the sibling source directory on the import path exactly once
# (presumably where the project package lives -- confirm against the repo
# layout). The path is relative, so it resolves against the kernel's
# working directory.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [3]:
from morphert.affix_ckip import CkipAffixoids

In [8]:
# Load the CKIP affixoid resource from the data directory.
# NOTE(review): assumes CkipAffixoids takes a directory path -- confirm.
affix_dir = "../data/ckip_affix"
affix_data = CkipAffixoids(affix_dir)

In [9]:
# Group the loaded entries by their `affixoid` attribute: each key maps to
# the list of entries that share it, in the order they were iterated.
affix_map = {}
for entry in affix_data:
    key = entry.affixoid
    if key not in affix_map:
        affix_map[key] = []
    affix_map[key].append(entry)

In [10]:
# Affixoid keys ordered from most entries to fewest. The sort is stable
# (also with reverse=True), so ties keep the dictionary's insertion order.
affix_keys = sorted(affix_map, key=lambda key: len(affix_map[key]), reverse=True)

In [47]:
# Spot-check: meaning of the second-to-last entry of the affixoid with the
# third-largest number of entries.
third_key = affix_keys[2]
affix_map[third_key][-2].meaning

'view,opinion'

In [54]:
# Build the polysemous-affixoid dataset.
#
# An example word is kept only when it passes all of the following:
#   1. its frequency is greater than `min_freq`
#   2. it is exactly two characters long
#   3. its two characters are different
#   4. it was not already taken by an earlier sense of the same affixoid
# A sense is kept when at least `min_examples` examples survive (only the
# `max_examples` most frequent ones are stored). An affixoid is kept when at
# least `min_senses` of its senses survive.
def _select_examples(examples, seen, min_freq=1):
    """Return the examples that pass the word filters, most frequent first.

    Args:
        examples: iterable whose items carry a ``(word, frequency)`` pair at
            index 1; index 0 of each item is not used here.
        seen: set of words already claimed by earlier senses of the same
            affixoid; it is only read, never modified, by this helper.
        min_freq: a word is kept only if its frequency is strictly greater
            than this value.

    Returns:
        A new list of ``(word, frequency)`` pairs sorted by descending
        frequency; ties keep their original relative order.
    """
    kept = []
    for item in examples:
        ex = item[1]
        # The checks run in this order on purpose: the length test guards
        # the two-character comparison that follows it.
        if (ex[1] > min_freq
                and len(ex[0]) == 2
                and ex[0][0] != ex[0][1]
                and ex[0] not in seen):
            kept.append(ex)
    kept.sort(key=lambda ex: -ex[1])
    return kept


def build_poly_affixoids(affix_map, min_freq=1, min_examples=2,
                         max_examples=5, min_senses=2):
    """Select affixoids that have several usable senses.

    Args:
        affix_map: mapping from an affixoid to the list of its entries. Each
            entry must expose ``examples`` and may expose ``meaning``.
        min_freq: exclusive lower bound on an example word's frequency.
        min_examples: minimum number of surviving examples for a sense to be
            kept.
        max_examples: maximum number of examples stored per sense.
        min_senses: minimum number of surviving senses for an affixoid to be
            kept.

    Returns:
        A dict mapping each kept affixoid to a list of
        ``{"def": meaning, "ex": [(word, frequency), ...]}`` dicts, where
        ``def`` falls back to ``""`` when the entry has no ``meaning``.
    """
    selected = {}
    for word, senses in affix_map.items():
        sense_buf = []
        seen = set()  # words already claimed, reset for every affixoid
        for sense in senses:
            ex_list = _select_examples(sense.examples, seen, min_freq)
            # Every surviving word is marked as seen, including those beyond
            # `max_examples` and those of a sense that is dropped below.
            seen.update(ex[0] for ex in ex_list)
            if len(ex_list) >= min_examples:
                sense_buf.append({
                    "def": getattr(sense, "meaning", ""),
                    "ex": ex_list[:max_examples]
                })
        if len(sense_buf) >= min_senses:
            selected[word] = sense_buf
    return selected


poly_affixoids = build_poly_affixoids(affix_map)

In [57]:
# Spot-check the entry at insertion-order position 155 (an arbitrary pick),
# shown as a (key, senses) pair.
sample_key = list(poly_affixoids)[155]
(sample_key, poly_affixoids[sample_key])

('官',
 [{'def': 'government official,officeholder',
   'ex': [('官員', 1360), ('官能', 33), ('官位', 16), ('官股', 8), ('官等', 7)]},
  {'def': 'government official,officeholder',
   'ex': [('警官', 109), ('預官', 43), ('士官', 28), ('高官', 28), ('百官', 12)]}])

In [58]:
# Number of affixoids (dictionary keys) that survived the filtering.
len(poly_affixoids)

796

In [60]:
# Total number of senses (usage/meaning entries) across all kept affixoids.
sum(map(len, poly_affixoids.values()))

1765

In [62]:
# Total number of example words stored across every sense of every affixoid.
total_examples = 0
for senses in poly_affixoids.values():
    for sense in senses:
        total_examples += len(sense["ex"])
total_examples

7072

In [65]:
import json

# Persist the dataset as pretty-printed JSON. ensure_ascii=False keeps the
# Chinese characters readable instead of \uXXXX escapes, and the
# (word, frequency) tuples are written as JSON arrays.
dataset_path = "../data/affix_dataset.json"
with open(dataset_path, "w", encoding="UTF-8") as fout:
    json.dump(poly_affixoids, fout, ensure_ascii=False, indent=2)

In [67]:
# Spot-check the third entry (insertion-order position 2), shown as a
# (key, senses) pair.
sample_key = list(poly_affixoids)[2]
(sample_key, poly_affixoids[sample_key])

('人',
 [{'def': 'human,person',
   'ex': [('人生', 1987), ('人性', 634), ('人體', 522), ('人潮', 314), ('人氣', 170)]},
  {'def': 'person',
   'ex': [('華人', 540), ('法人', 443), ('漢人', 398), ('成人', 281), ('古人', 243)]}])