In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Put ../src on the import path (presumably where the `morphen` package
# imported below lives -- confirm). The membership test keeps repeated runs
# of this cell from appending the same entry more than once.
if "../src" not in sys.path:
    sys.path.append("../src")

In [3]:
import json
from tqdm.auto import tqdm
from itertools import islice
from datetime import datetime
from collections import Counter
import textwrap

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

from morphen.delta_classifier import (
    get_delta_vec,
    feature_extraction,
    estimate_lda,
    estimate_tree)
output_paths = []

## Data dependencies

```
../data/verb_morpho.csv 15c7c1
../data/10.02-word-split.json 7ac81e
../data/delta_tenc_d200_biwords.bin 04e3f4
```

In [4]:
from hashlib import sha1
from pathlib import Path

# Print the first six hex digits of each input file's SHA-1 so they can be
# compared by eye with the values listed under "Data dependencies".
paths = [
    "../data/verb_morpho.csv",
    "../data/10.02-word-split.json",
    "../data/delta_tenc_d200_biwords.bin",
]
for dep_path in paths:
    digest = sha1(Path(dep_path).read_bytes()).hexdigest()
    print(dep_path, digest[:6])

../data/verb_morpho.csv 15c7c1
../data/10.02-word-split.json 7ac81e
../data/delta_tenc_d200_biwords.bin 04e3f4


## Load dependencies

In [5]:
# Word split: a JSON object whose "train" / "test" entries are used further
# down to partition the verbs.
with open("../data/10.02-word-split.json", "r", encoding="UTF-8") as split_file:
    word_split = json.load(split_file)

# Word vectors stored in binary word2vec format.
kv = KeyedVectors.load_word2vec_format(
    "../data/delta_tenc_d200_biwords.bin",
    binary=True,
)

# Verb table; the first CSV column becomes the row index.
mr_data = pd.read_csv("../data/verb_morpho.csv", index_col=0)

## Make dataset

In [6]:
# Peek at the first record to show the available columns.
mr_data.head(1)

Unnamed: 0,token,token_simp,source,ASBC,Apple (2006-2016),China (2015-2016),Dcard (2019-2020),PTT (2004-2019),MorphoSyntax
4073,丟來,丢来,Corpus,1.0,5.0,5.0,1.0,0.0,VR


In [7]:
# `token_key` is a copy of `token`, added for compatibility with 20.11 and
# morphen/delta_classifier:feature_extraction.
mr_data = mr_data.assign(token_key=mr_data["token"])

In [8]:
# (rows, columns) of the verb table, including the `token_key` column added above.
mr_data.shape

(1676, 10)

## Load Morphen

In [9]:
from dataclasses import dataclass
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizerFast

In [10]:
class MrDataset(Dataset):
    """Map-style dataset of ``(token, MorphoSyntax)`` pairs.

    The pairs are read from the ``token`` and ``MorphoSyntax`` columns of
    ``mrdata`` once, in row order, when the dataset is constructed.
    """

    def __init__(self, mrdata):
        """Materialise one ``(token, MorphoSyntax)`` tuple per row of ``mrdata``."""
        pairs = []
        for row in mrdata.itertuples(index=False):
            pairs.append((row.token, row.MorphoSyntax))
        self.data = pairs

    def __len__(self):
        """Number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(token, MorphoSyntax)`` tuple at position ``idx``."""
        return self.data[idx]
    

In [11]:
# Partition the verbs according to the predefined word split.
is_train_word = mr_data["token"].isin(word_split["train"])
is_test_word = mr_data["token"].isin(word_split["test"])
mr_data_train = mr_data.loc[is_train_word, :]
mr_data_test = mr_data.loc[is_test_word, :]

train_ds = MrDataset(mr_data_train)
test_ds = MrDataset(mr_data_test)

# The label encoder is fitted on the training labels only.
mr_vocab = LabelEncoder()
train_classes = mr_vocab.fit_transform([label for _, label in train_ds])

In [12]:
# The tokenizer is loaded by the name "bert-base-chinese"; the sequence
# classifier is loaded from a local model directory.
morphen_dir = "../data/models/morphen-alpha-220802-0840/"
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
model = BertForSequenceClassification.from_pretrained(morphen_dir)

In [13]:
@dataclass
class MrCollator:
    """Collate ``(token, label)`` pairs into a model-ready batch.

    Attributes:
        tokenizer: callable that turns a list of strings into tensor inputs.
        mr_vocab: fitted label encoder; its ``transform`` maps label strings
            to integer class ids.
    """
    tokenizer: "BertTokenizerFast"
    mr_vocab: "LabelEncoder"

    def __call__(self, batch):
        """Tokenize the words in ``batch`` and attach their encoded labels.

        Args:
            batch: sequence of ``(token, label)`` tuples.

        Returns:
            The tokenizer output with an extra ``"labels"`` entry holding a
            ``torch.long`` tensor of class ids, one per item.
        """
        words = [word for word, _ in batch]
        labels = [label for _, label in batch]
        # padding=True pads every sequence to the longest one in the batch.
        # Without it, return_tensors="pt" fails as soon as two words in a
        # batch tokenize to different lengths.
        toks = self.tokenizer(words, padding=True, return_tensors="pt")
        mrs = self.mr_vocab.transform(labels)
        toks["labels"] = torch.tensor(mrs, dtype=torch.long)
        return toks
# Collate function handed to the DataLoader: combines the tokenizer with the
# label encoder fitted on the training labels.
collator = MrCollator(tokenizer, mr_vocab)

In [14]:
model.eval()  # evaluation mode: this notebook only runs inference
device = "cpu"
model.to(device)  # keep the model on the same device the batches are moved to

def infer_morphen(ds, print_results=False):
    """Run the loaded classifier over ``ds`` and score its predictions.

    Labels and predictions are gathered in a single pass over the data, so
    every batch is tokenized exactly once.

    Args:
        ds: dataset of ``(token, label)`` pairs that the module-level
            ``collator`` can batch (e.g. an ``MrDataset``).
        print_results: when true, also print the plain-text classification
            report.

    Returns:
        dict: the ``classification_report(..., output_dict=True)`` result for
        the gold labels versus the argmax predictions, with classes named
        after ``mr_vocab.classes_``.
    """
    loader = DataLoader(ds, shuffle=False, batch_size=32, collate_fn=collator)
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_x in tqdm(loader):
            batch_x = batch_x.to(device)
            all_labels.extend(batch_x["labels"].tolist())
            out = model(**batch_x)
            preds = out.logits.argmax(-1)
            all_preds.extend(preds.cpu().tolist())

    results = classification_report(
                all_labels, all_preds,
                target_names=mr_vocab.classes_,
                output_dict=True)
    if print_results:
        print(classification_report(
                all_labels, all_preds,
                target_names=mr_vocab.classes_,
                output_dict=False))
    return results

# Score both splits; only the test-split report is printed.
train_results = infer_morphen(train_ds)
test_results = infer_morphen(test_ds, print_results=True)


  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

              precision    recall  f1-score   support

          AV       0.65      0.77      0.71        64
          VO       0.82      0.86      0.84        43
          VR       0.91      0.89      0.90       178
          VV       0.67      0.55      0.60        51

    accuracy                           0.81       336
   macro avg       0.76      0.77      0.76       336
weighted avg       0.81      0.81      0.81       336



In [15]:
# One metrics record for the BERT model; it is appended to the metric table
# next to the LDA / tree records.
n_test_support = test_results["macro avg"]["support"]
morphen_metrics = {
    "clf": "bert",
    "feat_type": "char",
    "fold_idx": 1,
    "n_train": len(train_ds),
    "n_test": len(test_ds),
    "train_acc": train_results["accuracy"],
    "test_acc": test_results["accuracy"],
    # Baseline: share of the "VR" class among the test items.
    "dummy_acc": test_results["VR"]["support"] / n_test_support,
}

## Metric Table

In [16]:
feat_types = ["c1", "c2", "c1+c2",
              "mu1", "mu2", "mu1+mu2"]
mr_entries = mr_data.to_dict(orient='records')

# Linear discriminant analysis: one estimate_lda run per feature type, with
# the returned metric records flattened into a single list.
lda_metrics = [
    record
    for feat_type in feat_types
    for record in estimate_lda(feat_type, mr_entries, kv, kf_seed=12345)
]

# Decision tree.
tree_metrics = estimate_tree(mr_entries)

# LDA records first, then the tree records, then the single BERT record.
metric_data = [*lda_metrics, *tree_metrics, morphen_metrics]
metrics_df = pd.DataFrame(metric_data)

In [20]:
def se_func(x):
    """Standard error of the mean: sample SD (ddof=1) divided by sqrt(n)."""
    return np.std(x, ddof=1) / np.sqrt(len(x))


# Named aggregations, in output-column order: mean sizes first, then a
# mean (`_M`) and standard error (`_se`) column for every accuracy metric.
summary_spec = {
    "n_train": ("n_train", "mean"),
    "n_test": ("n_test", "mean"),
}
for acc_col in ("train_acc", "test_acc", "dummy_acc"):
    summary_spec[f"{acc_col}_M"] = (acc_col, "mean")
    summary_spec[f"{acc_col}_se"] = (acc_col, se_func)

metrics_summary = (
    metrics_df
    .drop(columns="fold_idx")
    .groupby(["clf", "feat_type"])
    .agg(**summary_spec)
)
metrics_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,n_train,n_test,train_acc_M,train_acc_se,test_acc_M,test_acc_se,dummy_acc_M,dummy_acc_se
clf,feat_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bert,char,1340.0,336.0,0.972388,,0.809524,,0.529762,
lda,c1,1269.6,317.4,0.689036,0.001386,0.562683,0.009153,0.524231,0.010292
lda,c1+c2,1235.2,308.8,0.94171,0.003407,0.750635,0.011053,0.531102,0.00865
lda,c2,1304.0,326.0,0.81411,0.003186,0.68589,0.004908,0.531902,0.007102
lda,mu1,963.2,240.8,0.694351,0.001988,0.568935,0.00595,0.496653,0.014202
lda,mu1+mu2,788.8,197.2,0.981238,0.001227,0.673476,0.013048,0.535569,0.01809
lda,mu2,1114.4,278.6,0.839555,0.001351,0.697759,0.0138,0.560664,0.00848
tree,char,1340.8,335.2,1.0,0.0,0.66944,0.008613,0.524446,0.011003


In [21]:
# Keep only the configurations whose mean test accuracy exceeds 0.6.
metrics_summary[metrics_summary["test_acc_M"] > 0.6]

Unnamed: 0_level_0,Unnamed: 1_level_0,n_train,n_test,train_acc_M,train_acc_se,test_acc_M,test_acc_se,dummy_acc_M,dummy_acc_se
clf,feat_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bert,char,1340.0,336.0,0.972388,,0.809524,,0.529762,
lda,c1+c2,1235.2,308.8,0.94171,0.003407,0.750635,0.011053,0.531102,0.00865
lda,c2,1304.0,326.0,0.81411,0.003186,0.68589,0.004908,0.531902,0.007102
lda,mu1+mu2,788.8,197.2,0.981238,0.001227,0.673476,0.013048,0.535569,0.01809
lda,mu2,1114.4,278.6,0.839555,0.001351,0.697759,0.0138,0.560664,0.00848
tree,char,1340.8,335.2,1.0,0.0,0.66944,0.008613,0.524446,0.011003


## Evaluation restricted to test words found in the word-vector vocabulary (`kv`)

In [18]:
def _row_in_kv(row):
    """True when the row's `token` or its `token_simp` is a key of `kv`."""
    return (row.token in kv) or (row.token_simp in kv)


# Restrict the test split to rows that have a vector, then re-score the model.
sel_mask = mr_data_test.apply(_row_in_kv, axis=1)
mr_data_inkv_test = mr_data_test.loc[sel_mask, :]
test_inkv_ds = MrDataset(mr_data_inkv_test)
_ = infer_morphen(test_inkv_ds, print_results=True)

  0%|          | 0/5 [00:00<?, ?it/s]

              precision    recall  f1-score   support

          AV       0.57      0.57      0.57        14
          VO       0.94      0.88      0.91        17
          VR       0.92      0.96      0.94        81
          VV       0.81      0.68      0.74        19

    accuracy                           0.87       131
   macro avg       0.81      0.78      0.79       131
weighted avg       0.87      0.87      0.87       131



In [19]:
# NOTE(review): `sel_mask_train` is computed but not referenced again in this
# cell -- feature extraction below receives every row of `mr_data_train`.
sel_mask_train = mr_data_train.apply(
    lambda row: (row.token in kv) or (row.token_simp in kv),
    axis=1,
)

train_entries = mr_data_train.to_dict(orient='records')
inkv_test_entries = mr_data_inkv_test.to_dict(orient='records')
Xvecs, ylabels = feature_extraction("c1+c2", train_entries, kv)
Xtest, ytest = feature_extraction("c1+c2", inkv_test_entries, kv)

# Fit on the training features, then score on the in-kv test subset.
lda = LinearDiscriminantAnalysis().fit(Xvecs, ylabels)
ypred = lda.predict(Xtest)
inkv_acc = accuracy_score(ytest, ypred)
print("Acc of LDA (c1+c2) trained on full data, but test on words in KV only: {:.2f}".format(inkv_acc))
# The comparison figure below is a hard-coded string, not computed in this cell.
print("Compared with LDA (c1+c2) with trained on words in KV only: .56 ± .02")

Acc of LDA (c1+c2) trained on full data, but test on words in KV only: 0.79
Compared with LDA (c1+c2) with trained on words in KV only: .56 ± .02
