In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's source files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Put the sibling source directory on the import path exactly once
# (presumably where the `morphen` package imported below lives).
if sys.path.count("../src") == 0:
    sys.path += ["../src"]

In [3]:
import json
from tqdm.auto import tqdm
from itertools import islice
from datetime import datetime
from collections import Counter
import textwrap
from morphen.delta_classifier import (
    get_delta_vec,
    feature_extraction,
    estimate_lda,
    estimate_tree)

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
# NOTE(review): nothing in the visible cells appends to or reads this list;
# presumably it is meant to record files written by the notebook -- confirm or remove.
output_paths = []

## Data dependencies

```
../data/verb_morpho.csv 15c7c1
../data/10.02-word-split.json 7ac81e
../data/delta_tenc_d200_biwords.bin 04e3f4
```

In [4]:
from hashlib import sha1
from pathlib import Path

# Expected SHA-1 prefixes (first 6 hex digits) of the input files, as listed
# in the "Data dependencies" section of this notebook.
expected_digests = {
    "../data/verb_morpho.csv": "15c7c1",
    "../data/10.02-word-split.json": "7ac81e",
    "../data/delta_tenc_d200_biwords.bin": "04e3f4",
}
paths = list(expected_digests)

mismatched = []
for path_x in paths:
    h = sha1()
    # Hash in 1 MiB chunks so no file has to be held in memory in one piece.
    with Path(path_x).open("rb") as data_file:
        for chunk in iter(lambda: data_file.read(1 << 20), b""):
            h.update(chunk)
    digest = h.hexdigest()[:6]
    print(path_x, digest)
    if digest != expected_digests[path_x]:
        mismatched.append((path_x, expected_digests[path_x], digest))

# Fail loudly (after printing every digest) when an input file differs from the
# documented version; otherwise later results silently stop being reproducible.
assert not mismatched, (
    "data dependency digest mismatch (path, expected, actual): "
    f"{mismatched}"
)

../data/verb_morpho.csv 15c7c1
../data/10.02-word-split.json 7ac81e
../data/delta_tenc_d200_biwords.bin 04e3f4


## Load dependencies

In [5]:
# Parsed JSON content of the word-split file.
with open("../data/10.02-word-split.json", encoding="UTF-8") as fin:
    word_split = json.load(fin)

# Word vectors stored in binary word2vec format.
kv = KeyedVectors.load_word2vec_format(
    "../data/delta_tenc_d200_biwords.bin",
    binary=True,
)

# Verb table; the first CSV column becomes the row index.
mr_data = pd.read_csv("../data/verb_morpho.csv", index_col=0)

## Make dataset

In [6]:
# Peek at the first row to see which columns the table carries.
mr_data.head(1)

Unnamed: 0,token,token_simp,source,ASBC,Apple (2006-2016),China (2015-2016),Dcard (2019-2020),PTT (2004-2019),MorphoSyntax
4073,丟來,丢来,Corpus,1.0,5.0,5.0,1.0,0.0,VR


In [7]:
# Tokens for which either the `token` or the `token_simp` form is a key in `kv`.
word_list = [
    token
    for token, token_simp in zip(mr_data["token"], mr_data["token_simp"])
    if token in kv or token_simp in kv
]

In [8]:
# Number of tokens whose `token` or `token_simp` form is a key in `kv`.
len(word_list)

701

In [9]:
# Boolean row mask: True where the `token` or the `token_simp` form is a key in `kv`.
sel_mask = pd.Series(
    [token in kv or token_simp in kv
     for token, token_simp in zip(mr_data["token"], mr_data["token_simp"])],
    index=mr_data.index,
)

# Classification table: the selected rows, renumbered from 0, reduced to the
# two token spellings and the MorphoSyntax label.
clf_data = (
    mr_data.loc[sel_mask]
    .reset_index(drop=True)
    .filter(items=["token", "token_simp", "MorphoSyntax"], axis=1)
)

# Lookup key per row: `token` when `kv` contains it, otherwise `token_simp`.
normalize_token = pd.Series(
    [token if token in kv else token_simp
     for token, token_simp in zip(clf_data["token"], clf_data["token_simp"])],
    index=clf_data.index,
)
clf_data = clf_data.assign(token_key=normalize_token)
clf_data.head()

Unnamed: 0,token,token_simp,MorphoSyntax,token_key
0,丟光,丢光,VR,丢光
1,丟出,丢出,VR,丢出
2,丟命,丢命,VO,丢命
3,丟盡,丢尽,VR,丢尽
4,丟給,丢给,VR,丢给


In [10]:
# Table shapes before (mr_data) and after (clf_data) restricting to rows with a key in `kv`.
mr_data.shape, clf_data.shape

((1676, 9), (701, 4))

In [11]:
feat_types = ["wv",
              "c1", "c2", "c1+c2",
              "mu1", "mu2", "mu1+mu2",
              "d1", "d2", "d1+d2",
              "dw"]
clf_entries = clf_data.to_dict(orient="records")

## linear discriminant analysis
# One estimate_lda call per feature type, all with the same kf_seed; the
# records each call returns are flattened into a single list.
metric_data = [
    fold_metrics
    for feat_type_x in feat_types
    for fold_metrics in estimate_lda(feat_type_x, clf_entries, kv, kf_seed=12345)
]

## decision tree
# Called once on the entries themselves; its records join the same list.
tree_metrics = estimate_tree(clf_entries)
metric_data += tree_metrics

# One row per collected metric record.
metrics_df = pd.DataFrame(metric_data)

In [13]:
def se_func(x):
    """Return the standard error of the mean: sample SD (ddof=1) over sqrt(len(x))."""
    return np.std(x, ddof=1) / np.sqrt(len(x))


# Summarise the per-fold rows for every (classifier, feature type) pair:
# mean split sizes, plus mean and standard error of each accuracy column.
metrics_summary = (
    metrics_df
    .drop(columns="fold_idx")
    .groupby(["clf", "feat_type"])
    .agg(
        n_train=pd.NamedAgg(column="n_train", aggfunc="mean"),
        n_test=pd.NamedAgg(column="n_test", aggfunc="mean"),
        train_acc_M=pd.NamedAgg(column="train_acc", aggfunc="mean"),
        train_acc_se=pd.NamedAgg(column="train_acc", aggfunc=se_func),
        test_acc_M=pd.NamedAgg(column="test_acc", aggfunc="mean"),
        test_acc_se=pd.NamedAgg(column="test_acc", aggfunc=se_func),
        dummy_acc_M=pd.NamedAgg(column="dummy_acc", aggfunc="mean"),
        dummy_acc_se=pd.NamedAgg(column="dummy_acc", aggfunc=se_func),
    )
)
metrics_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,n_train,n_test,train_acc_M,train_acc_se,test_acc_M,test_acc_se,dummy_acc_M,dummy_acc_se
clf,feat_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
lda,c1,560.8,140.2,0.788872,0.006044,0.590648,0.02745,0.577822,0.024094
lda,c1+c2,560.8,140.2,0.995006,0.001041,0.56075,0.02841,0.577822,0.024094
lda,c2,560.8,140.2,0.915122,0.003064,0.66766,0.022931,0.577822,0.024094
lda,d1,555.2,138.8,0.904903,0.005831,0.585017,0.006602,0.583651,0.020301
lda,d1+d2,553.6,138.4,0.999277,0.000443,0.592357,0.021492,0.585132,0.026023
lda,d2,559.2,139.8,0.919884,0.002301,0.639507,0.010306,0.579476,0.023754
lda,dw,560.8,140.2,0.899791,0.005574,0.563465,0.011949,0.577822,0.024094
lda,mu1,559.2,139.8,0.787917,0.004715,0.593751,0.018987,0.579476,0.023754
lda,mu1+mu2,553.6,138.4,0.996387,0.000808,0.576468,0.033035,0.585132,0.026023
lda,mu2,555.2,138.8,0.915709,0.003948,0.652758,0.030328,0.583651,0.020301


In [14]:
# Keep only the rows whose mean test accuracy is above 0.6.
metrics_summary[metrics_summary["test_acc_M"].gt(0.6)]

Unnamed: 0_level_0,Unnamed: 1_level_0,n_train,n_test,train_acc_M,train_acc_se,test_acc_M,test_acc_se,dummy_acc_M,dummy_acc_se
clf,feat_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
lda,c2,560.8,140.2,0.915122,0.003064,0.66766,0.022931,0.577822,0.024094
lda,d2,559.2,139.8,0.919884,0.002301,0.639507,0.010306,0.579476,0.023754
lda,mu2,555.2,138.8,0.915709,0.003948,0.652758,0.030328,0.583651,0.020301
lda,wv,560.8,140.2,0.940087,0.002287,0.667639,0.019008,0.577822,0.024094
tree,char,560.8,140.2,1.0,0.0,0.646211,0.012947,0.577822,0.024094
