In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
sys.path.append("../")

In [4]:
import pandas as pd
from GWA2019 import word_pos_utils as word_pos

In [5]:
# Build the MOE object from the dictionary JSON once; every query_word_pos call
# in this notebook receives this same instance.
moe_inst = word_pos.MOE("../resources/dict-revised.json")

# query_word_pos test

In [6]:
# Spot check: 心跳 ("heartbeat") -- the recorded output is the two-tag list ['N', 'V'].
word_pos.query_word_pos("心跳", moe_inst)

['N', 'V']

In [7]:
# Spot check: 企圖 ("to attempt") -- the recorded output is ['V', 'N'].
word_pos.query_word_pos("企圖", moe_inst)

['V', 'N']

In [8]:
# Spot check: 衝撞 ("to collide") -- the recorded output is ['V', 'V'].
word_pos.query_word_pos("衝撞", moe_inst)

['V', 'V']

In [9]:
# Spot check: 風景 ("scenery") -- the recorded output is ['N', 'N'].
word_pos.query_word_pos("風景", moe_inst)

['N', 'N']

# Apply the heuristics to the word-pair data

In [10]:
# Read the checked word-pair table and leave out its "statement" column.
hyper_pairs = pd.read_csv("../data/merged_checked.csv").drop(columns="statement")

In [11]:
def query_word_pos_wrapper(x):
    """Return the tags from ``word_pos.query_word_pos`` for ``x`` as one string.

    The lookup uses the notebook-level ``moe_inst``. If any returned tag is
    ``None`` the placeholder ``"--"`` is returned; otherwise the tags are
    concatenated in order (e.g. ``['N', 'V']`` becomes ``"NV"``).
    """
    tags = word_pos.query_word_pos(x, moe_inst)
    for tag in tags:
        if tag is None:
            # A single missing tag makes the whole structure unknown.
            return "--"
    return "".join(tags)
# Tag every lemma; rows for which any returned tag is None get the "--" placeholder.
hyper_pairs["struct"] = hyper_pairs.lemma.apply(query_word_pos_wrapper)

In [12]:
# Reorder columns by position: column 4 first, then columns 0-3.
# NOTE(review): presumably column 4 is the "struct" column just added, which
# assumes hyper_pairs had exactly four columns before it -- confirm.
word_struct = hyper_pairs.iloc[:,[4,0,1,2,3]]

In [13]:
# Spot check: 一些 ("some") -- the second tag comes back as None in the recorded
# output, which is the case query_word_pos_wrapper maps to "--".
word_pos.query_word_pos("一些", moe_inst)

['N', None]

In [14]:
# Save the automatically tagged table.
word_struct.to_csv("../data/annot_word_struct/word_struct.csv")
# Save a second copy meant to be annotated. The CWN section of this notebook
# later writes to this same annot_word_struct.csv path.
word_struct.to_csv("../data/annot_word_struct/annot_word_struct.csv")

In [15]:
# Count the rows for each (struct, response) combination ...
stat_word_struct = word_struct[["struct", "response"]].groupby(["struct", "response"]).size()
# ... then spread the counts into a table with one row per struct and one
# column per response value. No `values` is given, so the columns keep an outer
# level for the count column.
stat_word_struct = stat_word_struct.reset_index().pivot(index="struct", columns="response")

In [16]:
# Remove the outer column level so the columns are just the response values.
# Not re-runnable on its own: a second run finds no outer level left to drop.
stat_word_struct.columns = stat_word_struct.columns.droplevel(0)

In [17]:
# Row total over every column currently in the table. Re-running this cell once
# "sum" (or "prop") exists would fold those columns into the total as well.
stat_word_struct["sum"] = stat_word_struct.sum(1)

In [18]:
# Share of each struct's rows found in the second column (selected by position;
# it is the response == 1 column in the recorded output).
stat_word_struct["prop"] = stat_word_struct.iloc[:,1] / stat_word_struct["sum"]
stat_word_struct

response,0,1,sum,prop
struct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
--,204,118,322,0.36646
NN,427,311,738,0.421409
NV,153,94,247,0.380567
VN,203,100,303,0.330033
VV,123,82,205,0.4


In [19]:
from tabulate import tabulate

In [20]:
# Plain-text rendering of the table. floatfmt has five entries: zero decimals
# for the first four columns, two decimals for the last one ("prop").
print(tabulate(stat_word_struct, headers=stat_word_struct.columns, floatfmt=[".0f"] * 4 + [".2f"]))

      0    1    sum    prop
--  ---  ---  -----  ------
--  204  118    322    0.37
NN  427  311    738    0.42
NV  153   94    247    0.38
VN  203  100    303    0.33
VV  123   82    205    0.40


# Use CWN to determine POS structure

In [29]:
from CwnGraph import CwnBase
from itertools import chain
# Single CwnBase instance; query_word_pos_cwn reads it as a notebook-level global.
cwn = CwnBase()

In [56]:
# Independent copy of the rows whose structure is still the "--" placeholder.
unk_structs = word_struct[word_struct["struct"] == "--"].copy()

In [51]:

def query_word_pos_cwn(word):
    """Derive an N/V structure string for ``word`` from CWN sense counts.

    A string of at most one character is looked up with ``cwn.find_lemma``;
    the senses of all returned lemmas are pooled, and the result is ``"N"``
    when more senses have a POS starting with "N" than with "V", ``"V"`` in
    the opposite case, and ``"-"`` on a tie (including no senses at all).

    A longer string is tagged element by element. The tags are concatenated,
    unless any element came back as ``"-"``, in which case the result is the
    placeholder ``"--"``.

    NOTE(review): if ``cwn.find_lemma`` treats its argument as a pattern
    rather than an exact string, the counts include the senses of every
    matching lemma -- confirm that this is the intended matching.
    """
    if len(word) <= 1:
        noun_count = 0
        verb_count = 0
        for lemma in cwn.find_lemma(word):
            for sense in lemma.senses:
                if sense.pos.startswith("N"):
                    noun_count += 1
                if sense.pos.startswith("V"):
                    verb_count += 1
        if noun_count == verb_count:
            return "-"
        return "N" if noun_count > verb_count else "V"

    tags = [query_word_pos_cwn(part) for part in word]
    if "-" in tags:
        return "--"
    return "".join(tags)

In [52]:
# Re-tag only the lemmas still marked "--", this time with the CWN-based heuristic.
cwn_struct = unk_structs.lemma.apply(query_word_pos_cwn)

In [53]:
# How the previously unknown lemmas are distributed over the CWN-derived structures.
cwn_struct.value_counts()

VV    146
NN     85
VN     48
NV     29
--     14
Name: lemma, dtype: int64

In [57]:
# Keep the CWN result next to its lemma so it can be merged back by lemma.
unk_structs["cwn_struct"] = cwn_struct

In [73]:
# Bring the CWN-derived structure back onto the full table, keyed by lemma.
# The lookup is first collapsed to one row per lemma: a lemma occurring in
# several "--" rows would otherwise make the left merge multiply those rows.
# cwn_struct is computed from the lemma alone, so the duplicates are identical.
cwn_lookup = unk_structs[["lemma", "cwn_struct"]].drop_duplicates("lemma")
word_struct_m = word_struct.merge(cwn_lookup, on="lemma", how="left", validate="many_to_one")
# Build the mask from the merged frame itself rather than from word_struct, so
# the assignment does not depend on both frames sharing an identical index.
unk_mask = word_struct_m["struct"] == "--"
word_struct_m.loc[unk_mask, "struct"] = word_struct_m.loc[unk_mask, "cwn_struct"]
word_struct_m = word_struct_m.drop(columns="cwn_struct")

In [98]:
# Overwrites the annot_word_struct.csv copy written earlier in this notebook,
# now with the CWN-derived structures filled in.
word_struct_m.to_csv("../data/annot_word_struct/annot_word_struct.csv")

In [92]:
# Response counts per struct after the CWN pass: one row per struct label,
# one column per response value.
word_struct_stat = (
    word_struct_m
    .groupby(["struct", "response"])
    .size()
    .unstack("response")
)
# Row total, then the share held by the second response column (by position).
word_struct_stat["sum"] = word_struct_stat.sum(axis=1)
word_struct_stat["prop"] = word_struct_stat.iloc[:, 1] / word_struct_stat["sum"]
word_struct_stat

response,0,1,sum,prop
struct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
--,9,5,14,0.357143
NN,486,337,823,0.409478
NV,174,102,276,0.369565
VN,236,115,351,0.327635
VV,205,146,351,0.415954


# After manual resolution

In [99]:
# Reload the table from annot_word_struct_manual.csv. This rebinds `word_struct`,
# so from here on the name refers to the file's contents, not the earlier frame.
word_struct = pd.read_csv("../data/annot_word_struct/annot_word_struct_manual.csv")
# Same tabulation as before: counts per (struct, response), spread so that
# each response value becomes a column.
word_struct_stat = (
    word_struct
    .groupby(["struct", "response"])
    .size()
    .unstack("response")
)
# Row total, then the share held by the second response column (by position).
word_struct_stat["sum"] = word_struct_stat.sum(axis=1)
word_struct_stat["prop"] = word_struct_stat.iloc[:, 1] / word_struct_stat["sum"]
word_struct_stat

response,0,1,sum,prop
struct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NN,487,337,824,0.408981
NV,174,102,276,0.369565
VN,237,116,353,0.328612
VV,212,150,362,0.414365
