In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Make modules under ../src importable from this notebook. The membership
# check keeps the entry from being appended again when the cell is re-run.
if "../src" not in sys.path:
    sys.path.append("../src")

In [65]:
from tqdm.auto import tqdm
from dotted_wsd import DottedWsdTagger

In [4]:
# Build the tagger once; later cells reuse this module-level instance.
tagger = DottedWsdTagger()

In [7]:
# Quick check on one example: the character wrapped in <...> is the target
# to disambiguate; keep only the first prediction returned by the tagger.
pred = tagger.wsd_tag("<打>電話")[0]

In [15]:
pred.instance

{'example_id': 1,
 'example_type': 'wsd',
 'target_word': '打',
 'probe': '<打>電話',
 'sense_id': '05229179',
 'target_pos': None,
 'cwn_pos': 'VB',
 'simplified_pos': None,
 'sense_def': '利用電話裝置與人通話。',
 'sense_refex': '我從北京坐長途車去太原，6個小時的車程，就有400餘個電話<打>到我的手機上。'}

In [18]:
import pandas as pd
# Load the morphosyntax table; the first CSV column becomes the DataFrame index.
data = pd.read_csv("../data/di-morphosyntax.csv", index_col=0)

In [32]:
import numpy as np
# Count rows per MorphoSyntax label once (missing labels are excluded by
# value_counts' default) and show the counts together with their total.
# Uses the Series method because the top-level pd.value_counts is deprecated.
morpho_counts = data["MorphoSyntax"].value_counts()
morpho_counts, morpho_counts.sum()

(VR    879
 AV    340
 VV    244
 VO    213
 Name: MorphoSyntax, dtype: int64,
 1676)

In [35]:
# Keep only the rows that carry a MorphoSyntax label, then report the
# resulting shape and preview the first rows.
has_label = data["MorphoSyntax"].notna()
mr_data = data[has_label]
print(mr_data.shape)
mr_data.head()

(1676, 8)


Unnamed: 0,token,source,ASBC,Apple (2006-2016),China (2015-2016),Dcard (2019-2020),PTT (2004-2019),MorphoSyntax
4073,丟來,Corpus,1.0,5.0,5.0,1.0,0.0,VR
4074,丟光,Corpus,2.0,16.0,6.0,1.0,0.0,VR
4077,丟出,Corpus,9.0,190.0,56.0,16.0,0.0,VR
4079,丟到,Corpus,53.0,265.0,75.0,48.0,0.0,VR
4087,丟命,"MOE, Corpus",1.0,5.0,2.0,0.0,0.0,VO


In [54]:
def disambiguate_constituents(word):
    """Run word-sense disambiguation on each character of ``word`` in turn.

    For every position the character is wrapped in angle brackets
    (e.g. ``"<打>電話"``) and the marked string is passed to the
    module-level ``tagger``; only the first returned prediction is used.

    Args:
        word: The string whose characters are tagged one at a time.

    Returns:
        A list with one dict per character, in order, each holding the keys
        ``"pos"``, ``"sense_id"`` and ``"sense_def"``. If tagging a character
        fails, all three values are the placeholder ``"--"``.
    """
    ret = []
    for i, char in enumerate(word):
        # Slicing past the end yields "", so no bounds check is needed
        # for the last character.
        intext = f"{word[:i]}<{char}>{word[i + 1:]}"
        try:
            instance = tagger.wsd_tag(intext)[0].instance
            pred_obj = {
                "pos": instance["cwn_pos"],
                "sense_id": instance["sense_id"],
                "sense_def": instance["sense_def"],
            }
        except Exception:
            # Best-effort: any tagging failure becomes a placeholder row.
            # Deliberately not a bare ``except`` so that KeyboardInterrupt /
            # SystemExit still propagate and a long run can be interrupted.
            pred_obj = {"pos": "--",
                "sense_id": "--",
                "sense_def": "--"}
        ret.append(pred_obj)

    return ret

In [68]:
# Build one flat record per two-character word: the word, its MorphoSyntax
# label, and the predicted POS / sense id / sense definition of each of its
# two characters (prefixed c1_ and c2_). Other word lengths are reported
# and skipped.
tagged_words = []
n_rows = mr_data.shape[0]
for _, row in tqdm(mr_data.iterrows(), total=n_rows):
    word = row.token
    if len(word) != 2:
        print("not bisyllabic: ", word)
        continue
    record = {"word": word, "morpho_syntax": row.MorphoSyntax}
    senses = disambiguate_constituents(word)
    for prefix, sense in (("c1", senses[0]), ("c2", senses[1])):
        record[f"{prefix}_pos"] = sense["pos"]
        record[f"{prefix}_sid"] = sense["sense_id"]
        record[f"{prefix}_def"] = sense["sense_def"]
    tagged_words.append(record)

  0%|          | 0/1676 [00:00<?, ?it/s]

In [71]:
# Persist the per-word disambiguation records as CSV without the row index.
disambig_df = pd.DataFrame.from_records(tagged_words)
disambig_df.to_csv("../data/disambig_mr.csv", index=False)