In [1]:
%load_ext autoreload
%autoreload 2
from mesh_import import mesh

  from tqdm.autonotebook import tqdm


In [2]:
import igraph as ig
from CwnGraph import CwnBase, CwnSense
from tqdm.autonotebook import tqdm
import numpy as np

In [3]:
# Directory holding the network-measure CSVs read below and the merged CSVs
# written at the end of the notebook (the "cns" folder under the mesh data dir).
cns_dir = mesh.get_data_dir() / "cns"

In [4]:
import pandas as pd

In [5]:
# Load the three per-sense measure tables (Ga, Gb, Gc). The first CSV column
# becomes the index; it is later joined against `cwn_id`.
Ga_measures, Gb_measures, Gc_measures = (
    pd.read_csv(cns_dir / csv_name, index_col=0)
    for csv_name in ("Ga_measures.csv", "Gb_measures.csv", "Gc_measures.csv")
)

In [6]:
# Display the Ga measures table as a quick sanity check on the load above.
Ga_measures

Unnamed: 0,closenessA,betweennessA,degreeA,transA,ncompoA
03000101,0.000035,0.0,0,,1
03000102,0.000035,0.0,0,,1
03000201,0.000035,0.0,0,,1
03000202,0.000035,0.0,0,,1
03000203,0.000035,0.0,0,,1
...,...,...,...,...,...
14609902,0.000035,0.0,0,,1
14610001,0.000035,0.0,0,,1
14610101,0.000035,0.0,0,,1
14610201,0.000035,0.0,0,,1


## MELD_SCH

In [7]:
from CwnGraph import CwnBase, CwnLemma
from opencc import OpenCC
# OpenCC converter using the 's2t' (Simplified -> Traditional Chinese) profile
cc = OpenCC('s2t')
# CwnGraph handle used below for lemma and sense lookups
cwn = CwnBase()

In [42]:
# Load the MELD-SCH spreadsheet and give every item a Traditional-Chinese
# spelling so it can be matched against CWN lemmas.
meld_raw = pd.read_excel(mesh.get_data_dir()/"ldt-data/Tsang-2018-MELD-SCH.xlsx")

# Convert each entry exactly once; the original converted the lexical items a
# second time just to build `tw_words`.
meld_sch = (
    meld_raw
    .assign(trad_word=[cc.convert(w) for w in meld_raw.word])
    .rename(columns={"word": "simp_word"})
)

# lexicality == 1 presumably marks real words (vs. nonwords) -- TODO confirm
# against the MELD-SCH codebook.
is_word = meld_sch.lexicality == 1
cn_words = meld_sch.loc[is_word, "simp_word"].tolist()
tw_words = meld_sch.loc[is_word, "trad_word"].tolist()

In [29]:
# Lemma strings of every CWN lemma node that has at least one sense
cwn_lemmas = [
    node_data.get("lemma")
    for node_id, node_data in cwn.V.items()
    if node_data['node_type'] == 'lemma' and CwnLemma(node_id, cwn).senses
]

# Words present both in CWN and among the converted MELD-SCH words
int_words = set(cwn_lemmas).intersection(tw_words)

## Find CWN senses of each lemma

In [10]:
# (word, sense id) pairs for every sense CWN returns for each shared word
data = [
    (word, sense.id)
    for word in tqdm(int_words)
    for sense in cwn.find_all_senses(word)
]

HBox(children=(FloatProgress(value=0.0, max=2931.0), HTML(value='')))




In [33]:
# Long-format table: one row per (word, CWN sense id) pair
sense_frame = pd.DataFrame.from_records(
    data,
    columns=["word", "cwn_id"],
)

In [34]:
# Attach the Ga, Gb and Gc measures to every sense by joining `cwn_id` against
# each table's index. `merge` defaults to an inner join, so a sense missing
# from any one of the tables is dropped.
sense_cns = sense_frame
for measures in (Ga_measures, Gb_measures, Gc_measures):
    sense_cns = sense_cns.merge(measures, left_on="cwn_id", right_index=True)



In [35]:
def agg_sense_cns(agg_func_name):
    """Collapse the per-sense measures in `sense_cns` to one row per word.

    Parameters
    ----------
    agg_func_name
        Aggregation passed to ``GroupBy.agg`` (this notebook uses
        ``"max"``, ``"mean"`` and ``"min"``).

    Returns
    -------
    pandas.DataFrame
        One row per word: a ``word`` column, the aggregated measure columns
        (``cwn_id`` is excluded), and ``nSense``, the number of sense rows
        that word has in `sense_cns`.
    """
    by_word = sense_cns.drop(columns="cwn_id").groupby("word")
    per_word = by_word.agg(agg_func_name)
    per_word["nSense"] = by_word.size()
    return per_word.reset_index()

In [36]:
# Word-level summaries of the sense measures: maximum, mean and minimum
sense_cns_max, sense_cns_avg, sense_cns_min = (
    agg_sense_cns(stat_name) for stat_name in ("max", "mean", "min")
)

## Merge with MELD_SCH data

In [40]:
def merge_meld_sch(sense_cns_frame):
    """Join word-level sense measures with the MELD-SCH columns.

    Only `meld_sch` rows with ``lexicality == 1`` take part. Rows are matched
    on ``sense_cns_frame.word`` == ``meld_sch.trad_word`` with pandas' default
    inner join, and the helper ``trad_word`` column is removed from the result.

    Parameters
    ----------
    sense_cns_frame : pandas.DataFrame
        Frame with a ``word`` column, e.g. the output of `agg_sense_cns`.

    Returns
    -------
    pandas.DataFrame
        The columns of `sense_cns_frame` followed by ``simp_word``, ``N``,
        ``RT``, ``RTSD``, ``ERR`` and ``length``.
    """
    meld_columns = ["simp_word", "trad_word", "N", "RT", "RTSD", "ERR", "length"]
    is_word = meld_sch.lexicality == 1
    meld_words = meld_sch.loc[is_word, meld_columns]
    merged = sense_cns_frame.merge(meld_words, left_on="word", right_on="trad_word")
    return merged.drop(columns="trad_word")

In [43]:
# Pair each word-level summary (max, min, mean) with the MELD-SCH columns
meld_sch_max, meld_sch_min, meld_sch_avg = map(
    merge_meld_sch, (sense_cns_max, sense_cns_min, sense_cns_avg)
)

In [44]:
# Write each merged table to its own CSV inside `cns_dir`
csv_targets = (
    (meld_sch_max, "cns_meld_sch_max.csv"),
    (meld_sch_min, "cns_meld_sch_min.csv"),
    (meld_sch_avg, "cns_meld_sch_avg.csv"),
)
for merged_frame, csv_name in csv_targets:
    merged_frame.to_csv(cns_dir / csv_name)

In [45]:
# Display the mean-aggregated table merged with the MELD-SCH columns.
meld_sch_avg

Unnamed: 0,word,closenessA,betweennessA,degreeA,transA,ncompoA,closenessB,betweennessB,degreeB,transB,...,degreeC,transC,ncompoC,nSense,simp_word,N,RT,RTSD,ERR,length
0,一共,0.000035,0.0,0.0,,1.0,0.000035,0.0,4.0,1.0,...,5.0,0.90,5165.000000,1,一共,40,741.714250,248.347155,4.761905,2
1,一切,0.000035,0.0,0.0,,1.0,0.000035,0.0,0.0,,...,0.0,,1.000000,1,一切,40,610.620250,104.615792,0.000000,2
2,一律,0.000035,0.0,0.0,,1.0,0.000035,0.0,1.0,,...,2.5,0.00,2583.500000,2,一律,39,736.614615,147.969630,7.142857,2
3,一心一意,0.000035,0.0,0.0,,1.0,0.000035,0.0,4.0,1.0,...,4.0,1.00,5.000000,1,一心一意,40,656.040000,152.714736,0.000000,4
4,一模一樣,0.000035,0.0,0.0,,1.0,0.000035,0.0,0.0,,...,0.0,,1.000000,1,一模一样,38,636.175789,154.982531,2.564103,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2929,齊聲,0.000035,0.0,0.0,,1.0,0.000035,0.0,0.0,,...,0.0,,1.000000,2,齐声,38,899.816316,413.937271,5.000000,2
2930,龍蝦,0.000037,0.0,1.0,,1931.0,0.000035,0.0,0.0,,...,1.0,,5165.000000,1,龙虾,40,656.071000,128.785154,2.439024,2
2931,龐,0.000035,0.0,0.0,,1.0,0.000035,0.0,0.0,,...,0.0,,1.000000,3,庞,42,807.039048,289.688082,0.000000,1
2932,龐大,0.000035,0.0,0.0,,1.0,0.000035,0.0,0.4,,...,0.4,,2.000000,5,庞大,41,643.367805,151.671149,0.000000,2


In [46]:
# List every column of the merged frame; the table preview above elides the
# middle columns with "...".
meld_sch_avg.columns

Index(['word', 'closenessA', 'betweennessA', 'degreeA', 'transA', 'ncompoA',
       'closenessB', 'betweennessB', 'degreeB', 'transB', 'ncompoB',
       'closenessC', 'betweennessC', 'degreeC', 'transC', 'ncompoC', 'nSense',
       'simp_word', 'N', 'RT', 'RTSD', 'ERR', 'length'],
      dtype='object')