In [1]:
# Notebook setup: enable IPython's autoreload extension and make ../src importable.
%load_ext autoreload
# Mode 2 reloads all modules before each cell executes, so edits to source files are picked up.
%autoreload 2
import sys
# The membership guard keeps re-runs of this cell from appending duplicate entries.
# Presumably ../src is where the local `vec4gloss` package lives -- TODO confirm.
if "../src" not in sys.path:
    sys.path.append("../src")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import vec4gloss
from vec4gloss import check_hashes  

## Data Dependencies

```
../data/auto_metrics.csv 7930a0
```

In [3]:
# Run the vec4gloss hash check on the metrics file listed under "Data Dependencies".
# Binding the result to `_` keeps the return value out of the cell output.
# NOTE(review): check_hashes is not defined in this notebook; the recorded output
# shows it printing the path and a short hash -- confirm what it does on a mismatch.
_ = check_hashes("../data/auto_metrics.csv")

..\data\auto_metrics.csv 7930a0


## Load resources

In [4]:
# Load the automatic-metric table (8553 rows x 8 columns in the recorded run).
# Later cells read the columns pos, cwnid, bleu, meteor and rougeL_fmeasure.
data = pd.read_csv("../data/auto_metrics.csv")

In [5]:
def se(x):
    """Return the standard error of the mean of ``x``.

    The value is the sample standard deviation (``ddof=1``) divided by the
    square root of the number of observations.

    Parameters
    ----------
    x : array-like
        One-dimensional collection of numeric observations (list, ndarray
        or pandas Series).

    Returns
    -------
    float
        ``np.std(x, ddof=1) / np.sqrt(len(x))``; NaN when ``x`` holds fewer
        than two observations.
    """
    return np.std(x, ddof=1) / np.sqrt(len(x))
# Row order for the part-of-speech summary table; built once instead of
# being re-split for every index element inside the sort key.
POS_ORDER = ["N", "V", "D", "O", "Nb"]

# Work on a copy of the loaded frame. `DataFrame.from_records` on an existing
# DataFrame is deprecated since pandas 2.1; for this read_csv result (default
# RangeIndex) a plain copy gives the same frame.
eval_results_df = data.copy()

# Per-POS sample count plus mean and standard error of each metric.
# The parenthesised chain is the cell's last expression, so it is displayed.
(
    eval_results_df
    .groupby("pos")
    .agg(
        n_sample=("cwnid", len),
        bleu_mean=("bleu", "mean"),
        meteor_mean=("meteor", "mean"),
        rouge_mean=("rougeL_fmeasure", "mean"),
        bleu_se=("bleu", se),
        meteor_se=("meteor", se),
        rouge_se=("rougeL_fmeasure", se),
    )
    # A POS tag missing from POS_ORDER raises ValueError, as in the original.
    .sort_index(key=lambda idx: [POS_ORDER.index(p) for p in idx])
)

Unnamed: 0_level_0,n_sample,bleu_mean,meteor_mean,rouge_mean,bleu_se,meteor_se,rouge_se
pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
N,2801,0.35104,0.586241,0.914749,0.007475,0.005587,0.004216
V,4376,0.434996,0.625141,0.874232,0.00613,0.004625,0.003787
D,432,0.408252,0.616829,0.815719,0.021168,0.01584,0.018018
O,530,0.410067,0.625508,0.757303,0.016867,0.013446,0.017195
Nb,414,0.632856,0.742545,0.888524,0.021533,0.016441,0.012069


In [6]:
# Mean BLEU and METEOR over all rows, without grouping by POS.
metric_cols = ["bleu", "meteor"]
data[metric_cols].mean()

bleu      0.414183
meteor    0.617687
dtype: float64

In [8]:
# Standard error of the overall BLEU and METEOR means, as a (bleu, meteor) tuple.
tuple(se(data[col]) for col in ("bleu", "meteor"))

(0.004426287632642106, 0.003322553312070204)

In [7]:
# (n_rows, n_columns) of the loaded metrics table.
data.shape

(8553, 8)

## On proper names

In [13]:
from CwnGraph import CwnImage

# Load the CWN image tagged "v.2022.06.21".
cwn = CwnImage.load("v.2022.06.21")

# cwnid of every row whose POS is "Nb" (the proper-name rows), in row order.
is_nb_row = data["pos"] == "Nb"
nb_cwnids = data.loc[is_nb_row, "cwnid"].tolist()

In [16]:
# List every sense recorded for the head word of the first Nb entry.
first_nb_sense = cwn.from_sense_id(nb_cwnids[0])
cwn.find_all_senses(first_nb_sense.head_word)

[<CwnSense[04087601](伊，Nh): 代指說話者和聽話者以外的個人。>,
 <CwnSense[05207001](伊，D): 表剛剛。>,
 <CwnSense[05207101](伊，I): 模擬不易聽懂的說話聲。>,
 <CwnSense[05207201](伊，Nc): 國名，位於亞洲西部，首都為巴格達。>,
 <CwnSense[05207202](伊，Nb): 治理伊拉克的政府。>,
 <CwnSense[05207301](伊，Nc): 國名，位於亞洲西南部，首都為德黑蘭。>,
 <CwnSense[05207302](伊，Nb): 治理伊朗的政府。>,
 <CwnSense[05207401](伊，Nb): 姓。>]

In [20]:
from collections import Counter

# Tally how many Nb senses share each definition string.
nb_defs = Counter()
for sense_id in nb_cwnids:
    nb_defs[cwn.from_sense_id(sense_id).definition] += 1

In [22]:
# Ten most frequent definitions among the Nb senses, as (definition, count) pairs.
nb_defs.most_common(10)

[('姓。', 179),
 ('外文名字。', 9),
 ('治理伊朗的政府。', 3),
 ('中國天文學的星宿，為北方玄武之一。', 3),
 ('中國近代政治家、軍事家，名中正，字介石，民國三十七年當選中華民國首任總統，民國三十八年帶領國民黨播遷到臺灣。', 3),
 ('中國天文學的星宿，為東方蒼龍之一。', 3),
 ('西洋十二星座之一，五月二十一日至六月二十日之間出生者的命宮。', 2),
 ('用於橋及其周遭區域的名稱。', 2),
 ('白話章回小說，相傳為元代施耐庵或明初羅貫中作。', 2),
 ('治理巴黎市的地方政府。', 2)]

In [None]:
# Share of Nb senses whose definition is just "姓。" (surname); 179 in the recorded run.
# The original cell stopped at `179 /`, which is a SyntaxError. The count and the
# total are read from the counter instead of being copied by hand.
# NOTE(review): the denominator (all Nb senses) is inferred -- confirm the intended ratio.
nb_defs["姓。"] / sum(nb_defs.values())