In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys
# Put the notebook's parent directory on the import path so the `MWE2019`
# package imported below can be resolved (the path is relative to the
# notebook's working directory).
sys.path.append("../")

In [2]:
import pickle
from functools import partial, reduce
from itertools import chain, tee, islice
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from flair.embeddings import BertEmbeddings, Sentence

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
import local_config
from MWE2019.corpus import CorpusFactory
from MWE2019.corpus_index import CorpusIndex
from MWE2019.cwn_node_vec import CwnNodeVec
from MWE2019.utils import get_cache_path, install_data_cache

In [4]:
# BERT embedder for Chinese; it is handed to get_char_sense_vectors as
# `bert_inst` further down.
bert = BertEmbeddings(
    'bert-base-chinese',
    layers='-1,-2,-3,-4',
    pooling_operation='mean',
)

In [5]:
# Node-vector model named 'structeq'; its `stoi` mapping supplies the
# vocabulary used below.
# NOTE(review): walk_length / num_walks / p / q look like node2vec
# hyper-parameters -- confirm against MWE2019.cwn_node_vec.
nv_eq = CwnNodeVec(
    name='structeq',
    dimensions=100,
    walk_length=10,
    num_walks=20,
    p=0.5,
    q=2,
)

load CwnNodeVec from cache:  ../MWE2019/../data/cache_cwn_node_vec/cwn_node_vec_structeq.pkl


In [6]:
# Number of vocabulary keys that are exactly one character long.
sum(1 for token in nv_eq.stoi.keys() if len(token) == 1)

3831

In [7]:
# Keep only the single-character keys of the node-vector vocabulary.
cwn_chars = list(filter(lambda token: len(token) == 1, nv_eq.stoi.keys()))

## Build character sense vectors

In [8]:
# Extend the import path before importing from GWA2019; the order matters,
# the import below relies on this entry.
# NOTE(review): presumably "../../GWA2019" is a checkout that itself contains
# a `GWA2019` package -- confirm the directory layout.
sys.path.append("../../GWA2019")
from GWA2019 import cwn_sense_utils as utils
# Load the precomputed sense-embedding cache; it is later passed to
# get_char_sense_vectors as `sense_cache`.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file -- only load pickles from a trusted source.
with open("../resources/sense_embed.pkl", "rb") as fin:
    sense_embed = pickle.load(fin)

In [9]:
# Show one key of the cache to see what its keys look like.
next(iter(sense_embed.keys()))

'03000401'

In [10]:
from CwnGraph import CwnBase

In [11]:
# CwnGraph handle; it is passed to get_char_sense_vectors as `cwn_inst`
# when the character sense vectors are built.
cwn = CwnBase()

In [12]:
from GWA2019.cwn_sense_utils import find_examples, compute_sense_embedding

In [13]:
import torch
# Report whether PyTorch can see a CUDA device; the value is shown as the
# cell output.
torch.cuda.is_available()

True

In [14]:
# NOTE(review): this dict is not referenced by any other visible cell (the
# results are collected in `char_senses`) -- TODO confirm it is unused and
# drop the cell.
charac_sense_vectors = {}

In [15]:
def get_sense_vector_from_cache(sense_ids, sense_cache):
    """Look up cached sense embeddings and average each one along axis 0.

    For every id in ``sense_ids`` the entry ``sense_cache[id][id]`` is read,
    its ``embeddings`` attribute is reduced with ``.mean(0)``, and the result
    is stored under that id.

    Args:
        sense_ids: iterable of sense ids to fetch.
        sense_cache: nested mapping indexed as ``sense_cache[id][id]``; each
            leaf must expose an ``embeddings`` attribute with ``.mean(0)``.

    Returns:
        dict mapping each sense id to its averaged embedding. An id that is
        missing from ``sense_cache`` propagates the lookup error.
    """
    return {
        sid: sense_cache[sid][sid].embeddings.mean(0)
        for sid in sense_ids
    }

def get_char_sense_vectors(charac, cwn_inst, bert_inst, sense_cache):
    """Return a ``{sense_id: vector}`` map for the senses of ``charac``.

    Args:
        charac: character to look up. It is interpolated, unescaped, into
            the pattern ``^{charac}$`` given to ``cwn_inst.find_lemma``.
        cwn_inst: object exposing ``find_lemma(pattern)``; every returned
            lemma must have a ``senses`` iterable whose items carry ``id``.
        bert_inst: forwarded unchanged to ``compute_sense_embedding`` when
            the cache cannot serve the request.
        sense_cache: container supporting ``in`` and the lookups performed
            by ``get_sense_vector_from_cache``.

    Returns:
        dict mapping sense id to its vector. Vectors come from the cache when
        every sense id of the character is cached, otherwise from
        ``compute_sense_embedding``.

    Raises:
        Nothing is caught here: errors from ``find_lemma``, the cache lookup
        or ``compute_sense_embedding`` propagate to the caller.
    """
    # Use the arguments, not notebook globals: the lookup must go through the
    # injected `cwn_inst` and the requested `charac`, otherwise the function
    # silently depends on whatever `cwn` / `ch` happen to be in the kernel.
    lemmas = cwn_inst.find_lemma(f'^{charac}$')
    sense_ids = [sense.id for lemma in lemmas for sense in lemma.senses]

    # all() over an empty sequence is True, so a character without senses
    # takes the cache branch and yields an empty map.
    if all(sid in sense_cache for sid in sense_ids):
        return get_sense_vector_from_cache(sense_ids, sense_cache)

    # At least one sense is not cached: recompute for the whole character and
    # use the ids reported by the computation.
    computed_ids, computed_vecs = compute_sense_embedding(charac, cwn_inst, bert_inst)
    return dict(zip(computed_ids, computed_vecs))


In [21]:
# Build the sense-vector map for every single-character entry. A character
# whose lookup raises is skipped: its error message is printed and no entry
# is stored for it.
char_senses = {}
for ch in tqdm(cwn_chars):
    try:
        sense_vectors = get_char_sense_vectors(ch, cwn, bert, sense_embed)
    except Exception as err:
        # NOTE(review): the message does not say which character failed.
        print(err)
    else:
        char_senses[ch] = sense_vectors

HBox(children=(IntProgress(value=0, max=3831), HTML(value='')))

No valid embeddings from examples
expected a non-empty list of Tensors
No valid embeddings from examples
expected a non-empty list of Tensors
No valid embeddings from examples
>，」身旁有人說。
>，」身旁有人說。
No valid embeddings from examples
expected a non-empty list of Tensors
expected a non-empty list of Tensors
expected a non-empty list of Tensors
expected a non-empty list of Tensors
No valid embeddings from examples
expected a non-empty list of Tensors
expected a non-empty list of Tensors
expected a non-empty list of Tensors
No valid embeddings from examples
expected a non-empty list of Tensors
No valid embeddings from examples
expected a non-empty list of Tensors
No valid embeddings from examples
expected a non-empty list of Tensors
No valid embeddings from examples
expected a non-empty list of Tensors
No valid embeddings from examples
No valid embeddings from examples
expected a non-empty list of Tensors
No valid embeddings from examples
expected a non-empty list of Tensors
expected a non-em

In [28]:
# NOTE(review): presumably prepares the 'sense_vectors' cache location that
# get_cache_path resolves in the next cell -- confirm in MWE2019.utils.
install_data_cache('sense_vectors')

In [29]:
# Persist the character -> {sense_id: vector} map built above so it can be
# reloaded without recomputing. Characters whose lookup raised are absent
# from `char_senses` and therefore from the pickle.
cache_path = get_cache_path('sense_vectors', 'char_senses.pkl')
with open(cache_path, "wb") as fout:
    pickle.dump(char_senses, fout)