In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Make the project's "../src" directory importable. The path is relative to
# the notebook's working directory; the membership check keeps repeated runs
# of this cell from appending the same entry twice.
if "../src" not in sys.path:
    sys.path.append("../src")

In [3]:
from morphert.affix_ckip import CkipAffixoids

In [4]:
# Build the affixoid collection from "../data/ckip_affix" (loading logic lives
# in morphert.affix_ckip). Later cells iterate it, take its len(), and read the
# affixoid_type / affixoid / examples / position / meaning attributes of items.
affix_data = CkipAffixoids("../data/ckip_affix")

In [19]:
from itertools import groupby

# Report, for every run of consecutive entries sharing an affixoid_type, the
# first and last index of that run. groupby only merges *adjacent* items, so
# this shows how the entries are laid out in affix_data.
for affix_type, members in groupby(
    enumerate(affix_data), key=lambda pair: pair[1].affixoid_type
):
    positions = [index for index, _ in members]
    # enumerate yields increasing indices, so first/last are the min/max
    print(affix_type, positions[0], positions[-1])

prefix 0 1214
start-root 1215 2135
suffix 2136 3754
end-root 3755 4055


In [6]:
# total number of entries in affix_data
len(affix_data)

4056

In [29]:
# Index the entries by their affixoid string: affixoid -> list of entries,
# kept in the order they appear in affix_data.
affix_map = {}
for entry in affix_data:
    affixoid_key = entry.affixoid
    if affixoid_key not in affix_map:
        affix_map[affixoid_key] = []
    affix_map[affixoid_key].append(entry)

In [30]:
# number of distinct affixoid strings (keys of affix_map)
len(affix_map)

2471

In [31]:
# Affixoid strings ordered from the most entries to the fewest. The sort is
# stable, so affixoids with equally many entries keep their affix_map order.
affix_keys = sorted(
    affix_map,
    key=lambda affixoid: len(affix_map[affixoid]),
    reverse=True,
)

In [32]:
# Spot-check: affix_keys[2] is the affixoid with the third-most entries;
# [-2] picks its second-to-last entry. Both indices are hard-coded samples.
affix_map[affix_keys[2]][-2].meaning

'view,opinion'

In [41]:
# Filter the example words of every affixoid entry, then keep the affixoids
# that still have more than one usable entry (usage/meaning).
# An example word is kept when:
#   1. its frequency is greater than MIN_EXAMPLE_FREQ
#   2. it is exactly two characters long
#   3. its two characters differ
#   4. it was not already collected for an earlier entry of the same affixoid
MIN_EXAMPLE_FREQ = 1         # exclusive lower bound on an example's frequency
MAX_EXAMPLES_PER_USAGE = 5   # only the most frequent examples are stored


def _select_examples(aff_x, seen_words):
    """Return the usable examples of one affixoid entry, most frequent first.

    Args:
        aff_x: an affixoid entry. Element [1] of each item in
            ``aff_x.examples`` is used as the example; it is indexed as
            ``(word, freq)``.
        seen_words: set of words already collected for earlier entries of the
            same affixoid. It is only read here, never modified.

    Returns:
        A list of ``(word, freq)`` examples that pass rules 1-4, sorted by
        descending frequency (ties keep their original order).
    """
    candidates = (item[1] for item in aff_x.examples)
    kept = [
        ex for ex in candidates
        if ex[1] > MIN_EXAMPLE_FREQ      # rule 1
        and len(ex[0]) == 2              # rule 2
        and ex[0][0] != ex[0][1]         # rule 3 (relies on rule 2: length 2)
        and ex[0] not in seen_words      # rule 4
    ]
    kept.sort(key=lambda ex: -ex[1])
    return kept


# affixoid -> list of {"def": meaning, "ex": [(word, freq), ...]}
poly_affixoids = {}
# affixoids for which at least two kept entries share the same position
same_pos_poly = []
for word, affixoids in affix_map.items():
    affixoids_buf = []
    duplicated = set()
    position_list = []
    for aff_x in affixoids:
        ex_list = _select_examples(aff_x, duplicated)
        # Words are marked as seen even when this entry is dropped below, so
        # a later entry of the same affixoid can never reuse them.
        duplicated.update(ex[0] for ex in ex_list)
        # An entry needs at least two surviving examples to be kept.
        if len(ex_list) > 1:
            affixoids_buf.append({
                # entries without a `meaning` attribute get an empty definition
                "def": getattr(aff_x, "meaning", ""),
                "ex": ex_list[:MAX_EXAMPLES_PER_USAGE]
            })
            position_list.append(aff_x.position)

    # Only affixoids with two or more kept entries count as polysemous.
    if len(affixoids_buf) > 1:
        poly_affixoids[word] = affixoids_buf
    # A repeated position means two kept entries occupy the same slot.
    if len(position_list) != len(set(position_list)):
        same_pos_poly.append(word)

In [42]:
# number of affixoids whose kept entries include a repeated position
len(same_pos_poly)

189

In [43]:
# Spot-check the kept usages of one affixoid flagged in same_pos_poly
# (index 12 is a hard-coded sample position).
poly_affixoids[same_pos_poly[12]]

[{'def': 'square',
  'ex': [('方塊', 36), ('方形', 34), ('方格', 15), ('方舟', 8), ('方陣', 4)]},
 {'def': 'side,party',
  'ex': [('警方', 1575), ('校方', 862), ('軍方', 232), ('檢方', 212), ('美方', 156)]},
 {'def': 'prescription', 'ex': [('良方', 22), ('妙方', 16), ('調方', 5)]}]

In [48]:
# Spot-check one (affixoid, usages) pair by insertion order
# (index 318 is a hard-coded sample position).
list(poly_affixoids.items())[318]

('迷',
 [{'def': "enchant, lose one's bearings",
   'ex': [('迷宮', 169), ('迷思', 92), ('迷彩', 10), ('迷幻', 8), ('迷夢', 6)]},
  {'def': 'fan',
   'ex': [('樂迷', 81), ('賭迷', 17), ('彩迷', 16), ('馬迷', 6), ('張迷', 5)]}])

In [35]:
# number of affixoids that kept more than one usage (keys of poly_affixoids)
len(poly_affixoids)

796

In [36]:
# total number of usages/meanings kept across all entries of poly_affixoids
sum(map(len, poly_affixoids.values()))

1765

In [62]:
# total number of example words stored across every usage of every affixoid
sum(
    len(usage["ex"])
    for usages in poly_affixoids.values()
    for usage in usages
)

7072

In [49]:
# Average number of example words per usage. Computed from poly_affixoids
# instead of pasting the totals printed by earlier cells, so the figure
# cannot go stale when the data or the filters change.
n_usages = sum(len(usages) for usages in poly_affixoids.values())
n_examples = sum(
    len(usage["ex"])
    for usages in poly_affixoids.values()
    for usage in usages
)
# An empty dataset has no meaningful average; show NaN instead of raising.
n_examples / n_usages if n_usages else float("nan")

4.006798866855524

In [65]:
import json

# Persist the affixoid -> usages mapping as pretty-printed JSON;
# ensure_ascii=False keeps the Chinese characters readable in the file.
with open("../data/affix_dataset.json", mode="w", encoding="UTF-8") as dataset_file:
    json.dump(
        poly_affixoids,
        dataset_file,
        ensure_ascii=False,
        indent=2,
    )

In [44]:
import json

# Persist the list of affixoids whose kept usages share a position;
# ensure_ascii=False keeps the Chinese characters readable in the file.
with open("../data/affix_same_pos_poly.json", mode="w", encoding="UTF-8") as same_pos_file:
    json.dump(
        same_pos_poly,
        same_pos_file,
        ensure_ascii=False,
        indent=2,
    )

In [67]:
# Spot-check one (affixoid, usages) pair by insertion order
# (index 2 is a hard-coded sample position).
list(poly_affixoids.items())[2]

('人',
 [{'def': 'human,person',
   'ex': [('人生', 1987), ('人性', 634), ('人體', 522), ('人潮', 314), ('人氣', 170)]},
  {'def': 'person',
   'ex': [('華人', 540), ('法人', 443), ('漢人', 398), ('成人', 281), ('古人', 243)]}])