In [5]:
# Notebook setup.
# autoreload mode 2 re-imports modules before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import sys
# Put the parent directory on the import path
# (presumably where the MWE2019 package and local_config live -- TODO confirm).
sys.path.append("../")

In [6]:
import pickle
from functools import partial, reduce
from itertools import chain, tee, islice
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from flair.embeddings import BertEmbeddings, Sentence

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [7]:
import local_config
from MWE2019.materials import NGram4List, MoeIdioms
from MWE2019.corpus import CorpusFactory
from MWE2019.corpus_index import CorpusIndex
from MWE2019.cwn_node_vec import CwnNodeVec
from MWE2019.utils import get_cache_path, install_data_cache

In [8]:
# Shared contextual embedder; build_qie_vector reads this module-level `bert`.
# layers='-1,-2,-3,-4' selects the last four BERT layers. Per the flair docs,
# pooling_operation='mean' averages word-piece embeddings into one token
# embedding -- NOTE(review): confirm against the installed flair version.
bert = BertEmbeddings('bert-base-chinese', layers='-1,-2,-3,-4', pooling_operation='mean')

## Build QIE vectors

In [10]:
# Example of the structure expected for `qie_contexts` (expression -> list of context sentences):
# qie_contexts = {"一丘之貉" : ["他們兩個不過就是一丘之貉，兩個人都一樣", "兩人臭味相投，一丘之貉"]}

In [None]:
# Load the cached mapping that the build loop below iterates over as
# (expression, list-of-context-sentences) pairs.
# NOTE(security): pickle.load can execute arbitrary code; only load cache
# files produced by this project, never files from an untrusted source.
qie_context_path = get_cache_path("qie_list", "qie_context.pkl")
with open(qie_context_path, "rb") as fin:
    qie_contexts = pickle.load(fin)

In [13]:
def build_qie_vector(ng, ctx_list):
    """Average the contextual embedding of ``ng`` over its context sentences.

    In every context string, ``ng`` is padded with spaces so that it is
    separated from the surrounding text. The padded string is wrapped in a
    flair ``Sentence`` and embedded with the module-level ``bert`` embedder.
    The embedding of the token whose text is exactly ``ng`` is collected from
    each context, and the collected vectors are averaged.

    Args:
        ng: The target expression (a string) to build a vector for.
        ctx_list: Iterable of context strings expected to contain ``ng``.

    Returns:
        numpy array holding the element-wise mean of the collected
        target-token embeddings.

    Raises:
        ValueError: If no context yields a token equal to ``ng`` (this
            includes an empty ``ctx_list``).
    """
    embeds = []
    for ctx in ctx_list:
        sent_x = Sentence(ctx.replace(ng, f" {ng} "))

        # Find the target by its text, not by a fixed position. A fixed
        # index of 1 picks the wrong token when the context starts with
        # `ng` (the target is then token 0) or when the text before it
        # already contains whitespace.
        target = next((tok for tok in sent_x.tokens if tok.text == ng), None)
        if target is None:
            # This context does not expose `ng` as a token; it cannot
            # contribute a meaningful vector, so skip it.
            continue

        bert.embed(sent_x)
        # detach()/cpu() make the conversion work for tensors that live on a
        # GPU or require grad; both are no-ops for plain CPU tensors.
        embeds.append(target.embedding.detach().cpu().numpy())

    if not embeds:
        raise ValueError(f"no usable context containing {ng!r}")

    return np.vstack(embeds).mean(0)

# Build one averaged vector per expression. This is deliberately best-effort:
# an expression whose vector cannot be built is reported and skipped, so one
# bad entry does not abort the whole run.
qie_vectors = {}
for ng, ctx_list in qie_contexts.items():
    try:
        embed_vec = build_qie_vector(ng, ctx_list)
        qie_vectors[ng] = embed_vec
    except Exception as ex:
        # Name the failing expression and the error type; a bare message
        # cannot be traced back to the entry that caused it.
        print(f"[skip] {ng}: {type(ex).__name__}: {ex}")

In [14]:
# Persist the expression -> vector mapping to the "sense_vectors" cache.
# install_data_cache presumably prepares that cache location before it is
# written to -- TODO confirm in MWE2019.utils.
install_data_cache("sense_vectors")
qie_vectors_path = get_cache_path("sense_vectors", "qie_vectors.pkl")
with open(qie_vectors_path, "wb") as fout:
    pickle.dump(qie_vectors, fout)