In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from mesh_import import mesh
import re
import json
import pickle
from mesh.bow import BowItem



In [3]:
# Obtain the WordNet handle from the project's `mesh.bow` helper
# (presumably WordNet 1.6, judging by the helper's name — TODO confirm).
# NOTE(review): `wn` is not referenced by any other cell visible here.
wn = mesh.bow.get_wordnet16()

In [4]:
# Load the BOW entries and wrap each JSON object in a BowItem.
# The translations are Chinese text, so decode the file explicitly as UTF-8
# instead of relying on the platform's locale-dependent default encoding.
with (mesh.get_data_dir()/"bow/bow.json").open("r", encoding="utf-8") as fin:
    bow = [BowItem(**item) for item in json.load(fin)]

In [5]:
import pickle
# Load the pickled word table from asbc/asbc5_words.pkl. Later cells use it
# as a mapping from word to a numeric frequency (they call .keys(), .items()
# and .values() on it, and test membership with `in`).
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only load this file if it comes from a trusted source.
with (mesh.get_data_dir()/"asbc/asbc5_words.pkl").open("rb") as fin:
    wfreq = pickle.load(fin)

In [6]:
# Peek at the translation lists of the first five BOW entries.
[x.translation for x in bow[:5]]

[['無情地', '苛刻地', '不近人情地'],
 ['臨盆的', '生產的', '分娩的'],
 ['打噴嚏'],
 ['垂死的', '臨終的', '將結束的'],
 ['原因', '由來']]

In [7]:
def try_stemming(synset, pos, word, vocab=None):
    """Try to reduce `word` to a stem that is attested in a vocabulary.

    A leading "將" and a trailing "地" or "的" are stripped from `word`. The
    stripped form is accepted only when it differs from `word`, is non-empty,
    and is contained in the vocabulary.

    Args:
        synset: Identifier passed through unchanged to the result.
        pos: Part-of-speech tag passed through unchanged to the result.
        word: The candidate word to stem.
        vocab: Optional container supporting `in`, used to validate the stem.
            When omitted, the notebook-level `wfreq` table is used, which
            keeps the original three-argument behaviour.

    Returns:
        `(True, synset, pos, stem)` when an attested stem was found, otherwise
        `(False, synset, pos, word)` with the word left unchanged.
    """
    stem = re.sub("^[將]", "", word)
    stem = re.sub("[地的]$", "", stem)
    # NOTE(review): affixes are stripped purely by surface form, so a lexical
    # word that merely ends in 的/地 or starts with 將 (e.g. 目的 -> 目) is also
    # reported as stemmed whenever the remainder is in the vocabulary —
    # confirm this is intended.
    if stem and stem != word:
        # Resolve the fallback lazily: the global is only touched when a
        # lookup is actually needed, exactly as before.
        lookup = wfreq if vocab is None else vocab
        if stem in lookup:
            return (True, synset, pos, stem)
    return (False, synset, pos, word)
            

In [8]:
# Sanity check: a word consisting only of a strippable suffix reduces to the
# empty string, so it must be reported as not stemmed and returned unchanged.
try_stemming("synset", "a", "的")

(False, 'synset', 'a', '的')

In [9]:
from itertools import chain, islice, starmap, cycle

In [10]:
# Lazy pipeline, nothing is computed until `result` is consumed.
# Step 1: flatten every BOW entry into one (synset, pos, word) triple per
# translation.
bow_iter = (
    (entry.synset, entry.pos, translation)
    for entry in bow
    for translation in entry.translation
)
# Step 2: attempt to stem each translation.
stem_tuples = (
    try_stemming(synset, pos, translation)
    for synset, pos, translation in bow_iter
)
# Step 3: turn each outcome into a record; a word counts as a real word when
# it was stemmed successfully or is itself present in the frequency table.
result = (
    {
        "word": word,
        "synset": synset,
        "pos": pos,
        "stemmed": is_stemmed,
        "is_word": is_stemmed or word in wfreq,
    }
    for is_stemmed, synset, pos, word in stem_tuples
)

In [11]:
from tqdm.autonotebook import tqdm
# Drain the lazy `result` pipeline into a list while showing a progress bar.
# NOTE(review): `result` is a one-shot iterator; re-running only this cell
# produces an empty list unless the cell that builds `result` is re-run first.
bow_data_list = list(tqdm(result))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [12]:
import pandas as pd
# One row per record dict; the columns come from the record keys
# (word, synset, pos, stemmed, is_word).
bow_data = pd.DataFrame.from_records(bow_data_list)

In [13]:
# (rows, columns): one row per (synset, translation) pair.
bow_data.shape

(149780, 5)

In [14]:
# Keep only the rows flagged as real words, i.e. attested in the frequency
# table either directly or after stemming.
bow_words = bow_data[bow_data["is_word"]]
bow_words.shape

(67367, 5)

In [15]:
# Distinct word types among the attested rows.
bow_words_set = set(bow_words["word"])
len(bow_words_set)

26905

In [23]:
from CwnGraph import CwnBase
# Instantiate the CwnGraph base object; later cells read its node table
# `cwn.V` and call `cwn.find_all_senses` on it.
cwn = CwnBase()

In [37]:
# Distinct lemma strings taken from every graph node typed as "lemma".
cwn_lemmas = {
    node["lemma"]
    for node in cwn.V.values()
    if node["node_type"] == "lemma"
}
# Keep only the lemmas for which the graph returns at least one sense.
cwn_lemma_with_sense = [
    candidate
    for candidate in tqdm(cwn_lemmas)
    if len(cwn.find_all_senses(candidate)) > 0
]

HBox(children=(IntProgress(value=0, max=26550), HTML(value='')))




In [39]:
# Number of CWN lemmas that have at least one sense.
len(cwn_lemma_with_sense)

9025

In [17]:
# Persist the attested-word subset as CSV under the project data directory
# (pandas also writes the DataFrame index as the first column by default).
bow_words.to_csv(mesh.get_data_dir() / "bow/bow_words.csv")

In [18]:
# Persist the full table (attested and unattested rows) as CSV under the
# project data directory (the DataFrame index is written too by default).
bow_data.to_csv(mesh.get_data_dir() / "bow/bow_data.csv")

In [19]:
# Share of (synset, translation) rows whose word was flagged as a real word.
conversion_rate = bow_words.shape[0] / bow_data.shape[0]

In [20]:
# Type coverage: share of distinct words in the frequency table that also
# occur among the BOW words.
type_coverage = (
    sum(word in bow_words_set for word in wfreq.keys())
    / len(wfreq)
)
# Token coverage: the same share, weighted by each word's frequency.
token_coverage = (
    sum(freq for word, freq in wfreq.items() if word in bow_words_set)
    / sum(wfreq.values())
)

In [41]:
# CWN lemmas (with at least one sense) that are also BOW words.
cwn_lemma_overlap = set(cwn_lemma_with_sense) & bow_words_set

In [45]:
# Share of distinct BOW words that are also CWN lemmas with at least one
# sense (the denominator is the BOW word set, not the CWN lemma set).
cwn_overlap_rate = len(cwn_lemma_overlap) / len(bow_words_set)

In [47]:
# Report the four summary metrics computed in the preceding cells.
print(f"conversion rate: {conversion_rate}")
print(f"type coverage: {type_coverage}")
print(f"token coverage: {token_coverage}")
print(f"cwn overlap rate: {cwn_overlap_rate}")

conversion rate: 0.44977300040058754
type coverage: 0.12378765850157351
token coverage: 0.6810064445688077
cwn overlap rate: 0.17714179520535217


In [48]:
# Number of distinct BOW words that are a single character long.
sum(len(word) == 1 for word in bow_words_set)

1472

In [49]:

# Number of attested rows (duplicates included) whose word is a single
# character long.
sum(len(word) == 1 for word in bow_words.word.tolist())

3324