In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Make modules under ../src importable. The membership test keeps the entry
# from being appended again when this cell is re-run.
if "../src" not in sys.path:
    sys.path.append("../src")

In [2]:
from vec4gloss import check_hashes

## Data dependencies

```
    (data) => ..\data\rating_raw_data_220707.csv 7dde1c
    30.02 => ..\data\rating_materials.n10.csv bfe418
```

In [3]:
# Report the short hash of each input file so it can be compared by eye with
# the values listed under "Data dependencies".
input_csv_paths = [
    "../data/rating_raw_data_220707.csv",
    "../data/rating_materials.n10.csv",
]
_ = check_hashes(input_csv_paths)

..\data\rating_raw_data_220707.csv 7dde1c
..\data\rating_materials.n10.csv bfe418


## Loading resources

In [4]:
import pandas as pd

raw_csv_path = "../data/rating_raw_data_220707.csv"
items_csv_path = "../data/rating_materials.n10.csv"

# Raw rating responses, read with the default integer row index.
data = pd.read_csv(raw_csv_path)
# Rated item materials; the file's first column is used as the row index.
item_data = pd.read_csv(items_csv_path, index_col=0)

In [5]:
# Display (rows, columns) of the raw responses and of the item table.
data.shape, item_data.shape

((5, 423), (140, 7))

In [6]:
# One dict per item row, mapping column name -> value.
items = item_data.to_dict(orient='records')

In [7]:
# Response matrix as a NumPy array: every column from the fourth onward.
# NOTE(review): the first three columns are skipped -- presumably respondent
# metadata rather than ratings; confirm against the raw CSV.
respmat = data.iloc[:, 3:].values

In [8]:
# Peek at the first four response columns: the chosen option, the semantic
# rating and the syntactic rating of the first item, followed by the chosen
# option of the next item (the pattern repeats every three columns).
data.iloc[:,3:7]

Unnamed: 0,為下列何者的釋義,釋義詮釋度 Semantic explainability,釋義語法自然度 Syntactic naturality,為下列何者的釋義.1
0,D.以,3,5,D.異質性
1,C.及時,1,4,D.異質性
2,C.及時,1,5,D.異質性
3,D.以,1,5,D.異質性
4,C.及時,2,5,D.異質性


In [9]:
import numpy as np

# Each item occupies COLS_PER_ITEM consecutive columns of `respmat`:
#   offset 0 -> chosen option, formatted "<letter>.<word>" (e.g. "D.以")
#   offset 1 -> semantic explainability rating
#   offset 2 -> syntactic naturality rating
COLS_PER_ITEM = 3

# Fail loudly if the response matrix and the item table are out of step;
# otherwise ratings would be attributed to the wrong item without any error.
if respmat.shape[1] != COLS_PER_ITEM * len(items):
    raise ValueError(
        "respmat has {} columns but {} items x {} columns were expected".format(
            respmat.shape[1], len(items), COLS_PER_ITEM
        )
    )

resp_data = []
for item_i, item_x in enumerate(items):
    first_col = item_i * COLS_PER_ITEM

    # Strip the "<letter>." prefix. Split on the first period only, so that an
    # option word which itself contains a period is kept intact.
    choices = [x.split(".", 1)[1] for x in respmat[:, first_col].tolist()]
    sem_ratings = respmat[:, first_col + 1].tolist()
    syn_ratings = respmat[:, first_col + 2].tolist()

    # Item fields first, then the summaries, so the resulting column order
    # (and therefore the exported CSV) stays the same.
    resp = {**item_x}
    # Proportion of responses whose chosen word equals the item's target.
    resp["correct"] = np.mean([x == item_x["target"] for x in choices])
    resp["sem_mean"] = np.mean(sem_ratings)
    resp["syn_mean"] = np.mean(syn_ratings)
    # Sample standard deviation (ddof=1) across responses.
    resp["sem_sd"] = np.std(sem_ratings, ddof=1)
    resp["syn_sd"] = np.std(syn_ratings, ddof=1)
    resp_data.append(resp)

In [10]:
# Turn the list of per-item summary dicts into a DataFrame, one row per item.
resp_df = pd.DataFrame.from_records(resp_data)

In [11]:
def se(x):
    """Return the standard error of the mean of ``x``.

    Computed as the sample standard deviation (``ddof=1``) divided by the
    square root of the number of observations in ``x``.
    """
    n_obs = len(x)
    sample_sd = np.std(x, ddof=1)
    return sample_sd / np.sqrt(n_obs)
# Mean and standard error of accuracy and of both ratings, with one row per
# value of the `from` column.
summary_columns = ["correct", "sem_mean", "syn_mean"]
resp_df.pivot_table(
    values=summary_columns,
    index=["from"],
    aggfunc=["mean", se],
)

Unnamed: 0_level_0,mean,mean,mean,se,se,se
Unnamed: 0_level_1,correct,sem_mean,syn_mean,correct,sem_mean,syn_mean
from,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ASBC,0.855,2.53,4.51,0.04113,0.215377,0.124849
CWN,0.95,4.47,4.82,0.024602,0.148873,0.095035
vec4gloss,0.88,3.5075,4.58,0.027467,0.164045,0.092886


In [12]:
# Same summary restricted to the ASBC items, with one row per `pos` value.
asbc_rows = resp_df[resp_df["from"] == "ASBC"]
asbc_rows.pivot_table(
    values=["correct", "sem_mean", "syn_mean"],
    index=["pos"],
    aggfunc=["mean", se],
)

Unnamed: 0_level_0,mean,mean,mean,se,se,se
Unnamed: 0_level_1,correct,sem_mean,syn_mean,correct,sem_mean,syn_mean
pos,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
D,0.84,2.76,4.74,0.071802,0.430813,0.160693
N,0.86,1.92,4.32,0.079162,0.396877,0.342799
O,0.86,2.7,4.5,0.103494,0.445471,0.204939
V,0.86,2.74,4.48,0.084591,0.455143,0.273577


In [13]:
# Same summary restricted to the vec4gloss items, with one row per `pos` value.
vec4gloss_rows = resp_df[resp_df["from"].eq("vec4gloss")]
vec4gloss_rows.pivot_table(
    values=["correct", "sem_mean", "syn_mean"],
    index=["pos"],
    aggfunc=["mean", se],
)

Unnamed: 0_level_0,mean,mean,mean,se,se,se
Unnamed: 0_level_1,correct,sem_mean,syn_mean,correct,sem_mean,syn_mean
pos,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
D,0.84,3.75,4.69,0.064236,0.305347,0.179751
N,0.94,3.18,4.14,0.035836,0.351208,0.249252
O,0.85,3.47,4.7,0.059604,0.323558,0.161245
V,0.89,3.63,4.79,0.057078,0.341598,0.095669


## Output Hashes

```
..\data\rating_response_stat.csv a04c24
```

In [14]:
# Export the per-item summary table without the row index, then report the
# file's short hash so it can be compared with the value recorded under
# "Output Hashes".
data_path = "../data/rating_response_stat.csv"
resp_df.to_csv(data_path, index=False)
_ = check_hashes([data_path])

..\data\rating_response_stat.csv a04c24


## Eye-balling

In [15]:
# Items whose mean syntactic-naturality rating is below 4.
resp_df[resp_df["syn_mean"] < 4]

Unnamed: 0,target,ans,pos,from,item_id,definition,options,correct,sem_mean,syn_mean,sem_sd,syn_sd
1,異質性,D,N,vec4gloss,N-15,形容具有多個性質或性質的。,A.之內 B.素 C.反應 D.異質性,1.0,3.8,1.8,0.447214,0.447214
6,紙,D,N,vec4gloss,N-11,以紙張為主要材質製成的紙張。,A.干 B.洲 C.對 D.紙,1.0,3.4,2.0,1.516575,0.707107
25,愛,D,O,vec4gloss,O-16,形容愛惜並愛惜特定對象的。,A.喲 B.而 C.據 D.愛,1.0,3.2,1.8,1.095445,0.83666
47,開發中,B,O,ASBC,O-59,特定事件在一段時間內發生不久的特定階段。,A.深層 B.開發中 C.亞太 D.於焉,0.6,2.8,3.6,0.83666,1.140175
52,合併症,B,N,ASBC,N-58,疾病症狀由一個醫療體內產生病症的疾病。,A.樂府 B.合併症 C.套餐 D.情意,1.0,3.2,1.8,0.83666,1.30384
64,需要,B,N,vec4gloss,N-07,特定對象所需要的所有權。,A.料 B.需要 C.撢 D.卡,1.0,1.4,3.8,0.894427,1.30384
70,水到渠成,C,V,ASBC,V-55,比喻前述事件發展到成熟的情況或發展。,A.劃定 B.承繼 C.水到渠成 D.良善,1.0,4.8,3.6,0.447214,1.140175
71,一窩蜂,D,D,ASBC,D-50,比喻嗜好特定對象的怪獸。,A.同聲 B.何妨 C.方才 D.一窩蜂,1.0,1.0,3.4,0.0,1.81659
79,封閉性,B,N,ASBC,N-53,形容比喻不侷限於狹小範圍，不與外界溝通。,A.演進 B.封閉性 C.大一 D.牌樓,1.0,2.4,3.0,0.547723,1.581139
87,弦,C,N,vec4gloss,N-23,繫在兩條線上的線。,A.語言 B.廣場 C.弦 D.流,1.0,2.4,3.6,0.894427,1.341641


In [16]:
# Items whose mean semantic rating is below 3, most accurately identified first.
low_semantic = resp_df[resp_df["sem_mean"] < 3]
low_semantic.sort_values("correct", ascending=False)

Unnamed: 0,target,ans,pos,from,item_id,definition,options,correct,sem_mean,syn_mean,sem_sd,syn_sd
87,弦,C,N,vec4gloss,N-23,繫在兩條線上的線。,A.語言 B.廣場 C.弦 D.流,1.0,2.4,3.6,0.894427,1.341641
96,昆,B,N,vec4gloss,N-05,昆蟲的文化。,A.文 B.昆 C.水 D.文字,1.0,1.2,2.4,0.447214,1.67332
68,小鬼,A,N,ASBC,N-57,指人能力不足。,A.小鬼 B.基隆市 C.絕症 D.肚皮,1.0,1.2,5.0,0.447214,0.0
71,一窩蜂,D,D,ASBC,D-50,比喻嗜好特定對象的怪獸。,A.同聲 B.何妨 C.方才 D.一窩蜂,1.0,1.0,3.4,0.0,1.81659
79,封閉性,B,N,ASBC,N-53,形容比喻不侷限於狹小範圍，不與外界溝通。,A.演進 B.封閉性 C.大一 D.牌樓,1.0,2.4,3.0,0.547723,1.581139
80,應否,C,D,ASBC,D-56,表對事件的疑問，詢問原因。,A.咯咯咯 B.活活 C.應否 D.不配,1.0,2.8,4.8,1.788854,0.447214
4,比較,D,D,vec4gloss,D-21,表程度加深。,A.到時候 B.反過來 C.剛好 D.比較,1.0,1.8,5.0,1.30384,0.0
88,嬌,C,V,vec4gloss,V-13,形容有花植物的花朵長出並舒展。,A.沉痛 B.佈局 C.嬌 D.比照,1.0,1.4,4.6,0.894427,0.894427
90,中上,A,O,ASBC,O-54,等級不在兩端的部份。,A.中上 B.沿著 C.寬頻 D.像是,1.0,2.4,3.6,0.547723,1.341641
98,旅店,B,N,ASBC,N-52,住家房屋內用餐的場所。,A.買盤 B.旅店 C.經常性 D.老天,1.0,1.0,5.0,0.0,0.0


In [17]:
# Items rated low on syntax (mean < 4) but comparatively high on semantics (mean > 3).
low_syntactic = resp_df["syn_mean"] < 4
high_semantic = resp_df["sem_mean"] > 3
resp_df[low_syntactic & high_semantic]

Unnamed: 0,target,ans,pos,from,item_id,definition,options,correct,sem_mean,syn_mean,sem_sd,syn_sd
1,異質性,D,N,vec4gloss,N-15,形容具有多個性質或性質的。,A.之內 B.素 C.反應 D.異質性,1.0,3.8,1.8,0.447214,0.447214
6,紙,D,N,vec4gloss,N-11,以紙張為主要材質製成的紙張。,A.干 B.洲 C.對 D.紙,1.0,3.4,2.0,1.516575,0.707107
25,愛,D,O,vec4gloss,O-16,形容愛惜並愛惜特定對象的。,A.喲 B.而 C.據 D.愛,1.0,3.2,1.8,1.095445,0.83666
52,合併症,B,N,ASBC,N-58,疾病症狀由一個醫療體內產生病症的疾病。,A.樂府 B.合併症 C.套餐 D.情意,1.0,3.2,1.8,0.83666,1.30384
70,水到渠成,C,V,ASBC,V-55,比喻前述事件發展到成熟的情況或發展。,A.劃定 B.承繼 C.水到渠成 D.良善,1.0,4.8,3.6,0.447214,1.140175
100,問題,D,O,CWN,O-04,形容特定對象有需要被解決的困難，通常會製造麻煩的。,A.淨 B.唉唷 C.從 D.問題,1.0,4.6,3.4,0.547723,1.140175


In [18]:
# Items rated high on syntax (mean > 4) but low on semantics (mean < 3).
high_syntactic = resp_df["syn_mean"] > 4
low_semantic_mask = resp_df["sem_mean"] < 3
resp_df[high_syntactic & low_semantic_mask]

Unnamed: 0,target,ans,pos,from,item_id,definition,options,correct,sem_mean,syn_mean,sem_sd,syn_sd
0,及時,C,O,vec4gloss,O-18,形容在事件發生的時間點之後。,A.哦荷荷 B.呀 C.及時 D.以,0.6,1.6,4.8,0.894427,0.447214
4,比較,D,D,vec4gloss,D-21,表程度加深。,A.到時候 B.反過來 C.剛好 D.比較,1.0,1.8,5.0,1.30384,0.0
8,主控,A,V,ASBC,V-57,對特定事件做出如何進行的決定，並暗示與說話者的認知不同。,A.主控 B.下海 C.探頭 D.攀上,1.0,2.2,4.2,1.095445,1.095445
11,可就,D,D,ASBC,D-55,表說話者主觀評價程度高，帶有誇張語氣或情感。,A.難保 B.切勿 C.鼎力 D.可就,0.6,1.2,5.0,0.447214,0.0
12,中介,B,O,vec4gloss,O-22,與特定對象相關的責任或事件。,A.同時 B.中介 C.乃至 D.遍,0.2,1.8,4.8,1.30384,0.447214
22,急遽,A,D,ASBC,D-59,形容事件急迫的。,A.急遽 B.自應 C.轟隆 D.會不會,1.0,2.8,4.4,1.788854,0.894427
29,月,D,O,vec4gloss,O-07,地球環繞太陽一周所需的時間。,A.嗟 B.單 C.噯 D.月,1.0,1.8,5.0,1.788854,0.0
30,年頭,C,N,vec4gloss,N-24,過去。,A.友 B.合約 C.年頭 D.圈,0.8,1.2,4.4,0.447214,0.894427
31,好不容易,A,D,vec4gloss,D-10,表說話者主觀評價程度高，帶有誇張語氣或情感。,A.好不容易 B.汲汲 C.悄悄 D.八成,0.8,2.2,5.0,0.83666,0.0
34,燈,C,N,vec4gloss,N-13,比喻具有特定功能的器具。,A.甲 B.毛 C.燈 D.圍,0.6,1.2,4.8,0.447214,0.447214
