In [1]:
import sys
sys.path.append("../src")
import os
import re
import json
import gzip
import tarfile
from itertools import islice
from collections import Counter
import numpy as np
from tqdm.auto import tqdm
from opencc import OpenCC
from stanza.server import CoreNLPClient
from stanford_utils import *

In [2]:
os.environ["CORENLP_HOME"] = os.path.expanduser("~/etc/stanford-corenlp-4.4.0")

In [3]:
fin = gzip.open("/mnt/md0/seantyh/parsec/tencent-ailab-embedding-zh-d200-v0.2.0.tar.gz", "r")
tar = tarfile.open(fileobj=fin)
_ = tar.next()
txt_tzinfo = tar.next()

In [4]:
txt_tzinfo.name

'tencent-ailab-embedding-zh-d200-v0.2.0/tencent-ailab-embedding-zh-d200-v0.2.0.txt'

In [5]:
txt_fobj = tar.extractfile(txt_tzinfo)
nvocab, hdim = [int(x) for x in txt_fobj.readline().decode().split(" ")]
nvocab, hdim

(12287936, 200)

In [6]:
## write out vocabulary
chpat = re.compile("^[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]+$")
pbar = tqdm(toupdatel=nvocab)
monosylls = []
bisylls = []
quads = []
embs = []
fout = open("../data/tencent_vocabs.txt", "w")

while True:
    pbar.update(1)
    try:
        ln = txt_fobj.readline()
        if not ln:
            break
        toks = ln.decode().strip().split()  
        # hvec = np.array([float(x) for x in toks[1:]])
        word = toks[0]
        if not chpat.match(word):
            continue
        fout.write(word+"\n")
        if len(word)==1:
            monosylls.append(word)
        elif len(word)==2:
            bisylls.append(word)
        elif len(word)==4:
            quads.append(word)        
    except Exception as ex:
        print(ex)
fout.close()

  0%|          | 0/12287936 [00:00<?, ?it/s]

In [7]:
len(quads), len(bisylls), len(monosylls)

(2815445, 1675801, 21283)

In [8]:
print(" ".join(quads[:10]))
print(" ".join(quads[-10:]))

在线观看 中文字幕 在线视频 免费观看 在线播放 影音先锋 有限公司 高清无码 免费视频 久久综合
大和物語 武元唯衣 松田里奈 生活橱窗 都卜林格 柏树街站 阿宾斯克 天童如净 鲁亨盖里 教宗御座


## Parse quads

In [9]:
qs_iter = iter(quads)
batch_size = 20
batch_iter = iter(lambda: list(islice(qs_iter, batch_size)), [])
t2s = OpenCC('t2s.json')

np_compounds = []
with CoreNLPClient(properties="chinese",
        annotators=['tokenize','ssplit','pos','parse'],
        timeout=30000,        
        memory='6G', be_quiet=True) as client:    
    for qs_list in tqdm(batch_iter):
        qs_sen = "。".join(x.strip() for x in qs_list)
        ann = client.annotate(t2s.convert(qs_sen))
        for qs_x, sent_x in zip(qs_list, ann.sentence):
            np_nodes = get_nodes(sent_x.parseTree, is_two_bisyll_np)            
            if np_nodes:
                npcs = flatten_compound(np_nodes[0])
                np_compounds.append(npcs)        

2022-07-05 06:21:54 INFO: Using CoreNLP default properties for: chinese.  Make sure to have chinese models jar (available for download here: https://stanfordnlp.github.io/CoreNLP/) in CLASSPATH
2022-07-05 06:21:54 INFO: Starting server with command: java -Xmx6G -cp /home/seantyh/etc/stanford-corenlp-4.4.0/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties chinese -annotators tokenize,ssplit,pos,parse -preload -outputFormat serialized


0it [00:00, ?it/s]

In [15]:
np_compounds[:10]

[[('中文', '字幕'), ('NN', 'NN')],
 [('在线', '视频'), ('JJ', 'NN')],
 [('影音', '先锋'), ('NN', 'NN')],
 [('有限', '公司'), ('JJ', 'NN')],
 [('这个', '时候'), ('DT', 'NN')],
 [('奇米', '影视'), ('NR', 'NN')],
 [('当前', '位置'), ('NT', 'NN')],
 [('国产', '精品'), ('JJ', 'NN')],
 [('在线', '影院'), ('JJ', 'NN')],
 [('视频', '在线'), ('NN', 'NN')]]

In [16]:
compound_path = "../data/tencent_vocab_nps.txt"
with open(compound_path, "w", encoding="UTF-8") as fout:
    for np_x in np_compounds:
        nn_x = np_x[0]
        fout.write("{},{},{}\n".format(''.join(nn_x), nn_x[0], nn_x[1]))

In [18]:
nn_path = "../data/tencent_vocab_nn_compounds.txt"
with open(compound_path, "w", encoding="UTF-8") as fout:
    for np_x in np_compounds:        
        nn_x = np_x[0]
        pos_x = np_x[1]
        if any(x!="NN" for x in pos_x) or \
           any(len(w)!=2 for w in nn_x):
            continue
        fout.write("{},{},{}\n".format(''.join(nn_x), nn_x[0], nn_x[1]))

In [20]:
import hashlib
from pathlib import Path
for path_x in (compound_path, nn_path):
    h = hashlib.sha1()
    h.update(Path(path_x).read_bytes())
    print(path_x, h.hexdigest()[:6])

../data/tencent_vocab_nps.txt f4ac41
../data/tencent_vocab_nn_compounds.txt 67402c
