In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [52]:
from mesh_import import mesh
from tqdm.autonotebook import tqdm
import pandas as pd
import re
import json
import numpy as np

In [6]:
from nltk.corpus import WordNetCorpusReader
import nltk
# Build a WordNet reader over the dictionary files stored under the project
# data directory ("bow/wn16_dict"; the folder name suggests WordNet 1.6 --
# TODO confirm the version).
wn_dir = str(mesh.get_data_dir() / "bow/wn16_dict")
# NOTE(review): the second positional argument receives the resolved path
# from nltk.data.find(wn_dir); verify this matches the WordNetCorpusReader
# constructor of the installed NLTK version.
wn = WordNetCorpusReader(wn_dir, nltk.data.find(wn_dir))

In [7]:
import pickle
# Load the pickled per-character sense data. Later cells use it as a mapping
# from a single character to a record with "synset" and "vector" entries.
# Caution: pickle.load can execute arbitrary code, so only load this file
# from a trusted source.
with open(mesh.get_data_dir()/"bow/bow_monosyl_sense_vector.pkl", "rb") as fin:
    char_sense_vector = pickle.load(fin)

In [8]:
# Read the m2_cld table from the project data directory; the first CSV column
# is used as the DataFrame index.
m2_cld = pd.read_csv(mesh.get_data_dir()/ "cns/m2_cld.csv", index_col=0)

In [9]:
m2_cld.columns

Index(['id', 'word', 'lexicality', 'N', 'RT', 'RTSD', 'zRT', 'zRTSD', 'ERR',
       'length', 'wfreq', 'C1', 'C2', 'C1Frequency', 'C1FamilySize',
       'C1Friends', 'C1FamilyFrequency', 'C1RE', 'C1Strokes', 'C1Entropy',
       'C1nSense', 'C2Frequency', 'C2FamilySize', 'C2Friends',
       'C2FamilyFrequency', 'C2RE', 'C2Strokes', 'C2Entropy', 'C2nSense'],
      dtype='object')

In [10]:
from opencc import OpenCC
# Convert every entry of the `word` column with OpenCC's 's2t' configuration
# (Simplified -> Traditional Chinese). The result is a plain list aligned
# row-by-row with m2_cld.
cc = OpenCC('s2t')
word_tw = [cc.convert(x) for x in m2_cld.word]

In [140]:
from itertools import product, starmap
def find_min_max(iterable_1, iterable_2, fn):
    """Return the smallest and largest non-NaN score over all cross pairs.

    ``fn`` is called as ``fn(a, b)`` for every ``(a, b)`` in the Cartesian
    product of ``iterable_1`` and ``iterable_2``. Scores for which
    ``np.isnan`` is true are discarded.

    Returns:
        A ``(min_score, max_score)`` tuple, or ``(np.nan, np.nan)`` when no
        valid score remains (including when either iterable is empty).
    """
    valid_scores = []
    for left, right in product(iterable_1, iterable_2):
        score = fn(left, right)
        if not np.isnan(score):
            valid_scores.append(score)

    # Nothing usable: signal "no measurement" with a NaN pair.
    if not valid_scores:
        return np.nan, np.nan
    return min(valid_scores), max(valid_scores)

def path_similarity(syn_name_1, syn_name_2):
    """Path similarity between two synsets looked up by name via ``wn``.

    Only pairs whose part of speech is the same and is either verb ('v') or
    noun ('n') are scored; every other pair yields ``np.nan``. A falsy
    similarity result (e.g. ``None``) is also mapped to ``np.nan``.

    Args:
        syn_name_1: synset name passed to ``wn.synset``.
        syn_name_2: synset name passed to ``wn.synset``.

    Returns:
        The value of ``Synset.path_similarity(..., simulate_root=True)`` or
        ``np.nan``.
    """
    # The reader's private offset cache is reset on every call.
    # NOTE(review): the reason is not visible here -- presumably a workaround
    # for stale cached entries with this dictionary; confirm before removing.
    wn._synset_offset_cache.clear()
    first = wn.synset(syn_name_1)
    second = wn.synset(syn_name_2)

    pos = first.pos()
    if pos != second.pos() or pos not in ('v', 'n'):
        return np.nan

    similarity = first.path_similarity(second, simulate_root=True)
    return similarity if similarity else np.nan
    

def vec_distance(vec1, vec2):
    """Euclidean distance between two equal-length numeric sequences.

    Both arguments are converted with ``np.array`` before subtraction, so
    lists, tuples and arrays are accepted.
    """
    delta = np.array(vec1) - np.array(vec2)
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

In [141]:
def compute_path_measures(word):
    """Compute min/max sense-level measures between a word's two characters.

    Each character is looked up in ``char_sense_vector``; its record supplies
    a ``"synset"`` collection and a ``"vector"`` collection. All cross pairs
    are scored with ``path_similarity`` (synsets) and ``vec_distance``
    (vectors), and the extremes are reported.

    Args:
        word: a string of exactly two characters (enforced with ``assert``).

    Returns:
        ``None`` when either character has no (truthy) record in
        ``char_sense_vector``; otherwise a dict with keys ``word``,
        ``vec_dist_min``, ``vec_dist_max``, ``path_sim_min``, ``path_sim_max``.
        The min/max values are NaN when ``find_min_max`` finds no valid score.
    """
    assert len(word) == 2
    first_char, second_char = word
    first_entry = char_sense_vector.get(first_char)
    second_entry = char_sense_vector.get(second_char)
    if not first_entry or not second_entry:
        return None

    # Pull out all four collections up front, before any scoring happens.
    synsets = (first_entry["synset"], second_entry["synset"])
    vectors = (first_entry["vector"], second_entry["vector"])

    sim_min, sim_max = find_min_max(synsets[0], synsets[1], path_similarity)
    dist_min, dist_max = find_min_max(vectors[0], vectors[1], vec_distance)

    return {
        "word": word,
        "vec_dist_min": dist_min,
        "vec_dist_max": dist_max,
        "path_sim_min": sim_min,
        "path_sim_max": sim_max,
    }

In [142]:
compute_path_measures("錯疤")

{'word': '錯疤',
 'vec_dist_min': 2.6457513110645907,
 'vec_dist_max': 3.872983346207417,
 'path_sim_min': 0.07692307692307693,
 'path_sim_max': 0.07692307692307693}

In [143]:
# NOTE(review): looks like a leftover scratch check (min of a one-element
# list); nothing downstream uses the result, so this cell is a candidate for
# deletion.
min([1])

1

In [144]:
# Reset the reader's private synset-offset cache before the full run.
# path_similarity already clears this cache on every call, so this explicit
# reset appears redundant -- NOTE(review): confirm and consider removing.
wn._synset_offset_cache.clear()

In [145]:
# Score every converted word (with a progress bar). The list stays aligned
# with word_tw; an entry is None when either character is missing from
# char_sense_vector. compute_path_measures asserts each word has exactly two
# characters, so any other length aborts the run with AssertionError.
wn_measures = [compute_path_measures(x) for x in tqdm(word_tw)]

HBox(children=(IntProgress(value=0, max=20038), HTML(value='')))




In [146]:
len([1 for x in wn_measures if x])

6056

In [147]:
# Tabulate only the words for which measures were computed (drop the None
# entries); columns come from the dict keys returned by compute_path_measures.
wn_measure_df = pd.DataFrame.from_records([x for x in wn_measures if x])

In [148]:
# Attach the converted words to m2_cld (this adds a column to m2_cld in
# place), then join the measures on the converted word. No `how` is given, so
# pandas' default inner join applies and rows without measures are dropped.
# Both frames carry a `word` column, so the result holds `word_x` (from
# m2_cld) and `word_y` (from wn_measure_df).
# NOTE(review): if word_tw contains repeated values the join will multiply
# those rows -- verify the row count against len(wn_measure_df).
m2_cld["word_tw"] = word_tw
m2_cld_wn = m2_cld.merge(wn_measure_df, left_on="word_tw", right_on="word")

In [149]:
m2_cld_wn

Unnamed: 0,id,word_x,lexicality,N,RT,RTSD,zRT,zRTSD,ERR,length,...,C2RE,C2Strokes,C2Entropy,C2nSense,word_tw,word_y,vec_dist_min,vec_dist_max,path_sim_min,path_sim_max
0,200001,一切,1,40,610.620250,104.615792,-0.744272,0.294925,0.000000,2,...,5.9720,4.0,2.6764,18,一切,一切,2.236068,3.464102,,
1,210166,土切,2,40,974.976250,332.479450,0.622559,1.210804,2.439024,2,...,5.9720,4.0,2.6764,18,土切,土切,2.236068,3.316625,0.111111,0.111111
2,210778,手切,2,26,908.211538,357.555559,0.363349,1.037322,36.585366,2,...,5.9720,4.0,2.6764,18,手切,手切,2.236068,3.872983,,
3,211807,电切,2,34,919.441176,371.619622,0.113097,0.882876,15.000000,2,...,5.9720,4.0,2.6764,18,電切,電切,2.236068,3.000000,,
4,212625,灯切,2,40,831.862750,204.859515,0.002340,0.582847,2.439024,2,...,5.9720,4.0,2.6764,18,燈切,燈切,3.316625,3.464102,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6053,219788,错疤,2,39,1018.995897,463.112043,0.408685,1.212337,2.500000,2,...,1.2264,9.0,,3,錯疤,錯疤,2.645751,3.872983,0.076923,0.076923
6054,220044,鹰斜,2,40,853.194250,351.891181,-0.006309,0.798374,4.761905,2,...,1.0123,11.0,3.0328,4,鷹斜,鷹斜,3.000000,3.605551,,
6055,209467,调剂,1,41,765.834390,329.601164,-0.206131,1.132149,0.000000,2,...,1.0650,8.0,,4,調劑,調劑,3.316625,3.605551,,
6056,219793,锤瘤,2,41,1003.872195,332.280792,0.393018,0.840217,2.380952,2,...,0.7428,15.0,0.0000,0,錘瘤,錘瘤,2.449490,3.872983,0.083333,0.142857


In [150]:
# Persist the merged table under the project data directory. The DataFrame
# index is written as the first CSV column (pandas default).
m2_cld_wn.to_csv(mesh.get_data_dir()/"bow/m2_cld_wn.csv")