In [6]:
import sys
sys.path.append("../src")
import os
import re
import json
import zipfile
from itertools import islice
from collections import Counter
from tqdm.auto import tqdm
import pandas as pd
from stanza.server import CoreNLPClient
from stanford_utils import *

In [7]:
# Publish the location of the local Stanford CoreNLP 4.4.0 install through the
# CORENLP_HOME environment variable (stanza's CoreNLPClient is expected to read
# it when it launches the server).
corenlp_home = os.path.expanduser("~/etc/stanford-corenlp-4.4.0")
os.environ["CORENLP_HOME"] = corenlp_home

In [36]:
# Sanity check of the archive layout: print the stored name of its third entry.
with open("../data/wiki_zh_2019.zip", "rb") as fin:
    zipf = zipfile.ZipFile(fin)
    member_names = zipf.namelist()  # same order as zipf.infolist()
    print(member_names[2])

wiki_zh/AC/wiki_73


In [8]:
def iterate_wiki_file(wiki_zip_path):
    """Yield the parsed JSON records of every file stored in a wiki zip archive.

    Archive members are visited in filename order and directory entries are
    skipped. Each member is read in full, decoded as UTF-8 and treated as
    JSON Lines: blank lines are ignored and every other line is parsed with
    ``json.loads``. A line that fails to parse is reported with ``print`` and
    skipped, so a single malformed record does not abort the whole run.

    Args:
        wiki_zip_path: Path of the zip archive to read.

    Yields:
        list: One list per archive member, holding the objects parsed from
        that member's lines (possibly empty).
    """
    # The context manager closes the archive even when the consumer stops
    # iterating early (the generator is then closed and the `with` unwinds).
    with zipfile.ZipFile(wiki_zip_path) as zipf:
        # `key` must be passed by keyword: sorted() accepts no positional key.
        infolist = sorted(zipf.infolist(), key=lambda x: x.filename)
        for info_x in infolist:
            if info_x.is_dir():
                continue
            print(info_x)
            with zipf.open(info_x) as fwiki:
                text = fwiki.read().decode("utf-8")
            data = []
            # Split on "\n" only: str.splitlines() would also break on other
            # Unicode line separators that may appear inside a JSON string.
            for ln in text.split("\n"):
                ln = ln.strip()
                if not ln:
                    continue
                try:
                    data.append(json.loads(ln))
                except ValueError as ex:  # json.JSONDecodeError is a ValueError
                    print(ex)
            yield data

In [9]:
import logging

# Raise the root logger's threshold so that only WARNING and above get through
# (presumably to keep library INFO chatter out of the cell output).
root_logger = logging.getLogger()
root_logger.setLevel("WARNING")

In [None]:
# Tally compound keys over the wiki dump. Each key concatenates the first two
# elements returned by flatten_compound(); the export cell later unpacks a key
# as (w1, w2, p1, p2).
np_freqs = Counter()
corenlp_annotators = ['tokenize', 'ssplit', 'pos', 'parse', 'depparse']
with CoreNLPClient(
    properties="chinese",
    annotators=corenlp_annotators,
    timeout=30000,
    memory='6G',
) as client:
    wiki_batches = iterate_wiki_file("../data/wiki_zh_2019.zip")
    for data in tqdm(wiki_batches):
        for entry_x in data:
            if "text" in entry_x:
                ann = client.annotate(entry_x["text"])
                for sent_x in ann.sentence:
                    # Collect the parse-tree nodes accepted by is_np_compound.
                    npc_nodes = get_nodes(sent_x.parseTree, is_np_compound)
                    if npc_nodes:
                        np_compounds = list(map(flatten_compound, npc_nodes))
                        compound_keys = [(*pair[0], *pair[1]) for pair in np_compounds]
                        np_freqs.update(compound_keys)
                # NOTE(review): this stops after the first entry that has a
                # "text" field, so only one entry per archive file is annotated.
                # Confirm whether that is deliberate sampling or leftover debugging.
                break
            

## Post-processing wiki2019 compounds

In [29]:
def _csv_quote(value):
    """Return `value` as a double-quoted CSV field, doubling any embedded quotes."""
    return '"' + str(value).replace('"', '""') + '"'


# Dump the compound counts, most frequent first, one CSV row per compound.
# Text fields are always quoted and the numeric ones never are; the streaming
# filter cell further down matches on exactly this layout (',22,' and
# '"NN","NN"'), so the format is kept byte-identical for quote-free values.
with open("../data/wiki2019_compounds.csv", "w", encoding="UTF-8") as fout:
    fout.write("idx,np,nptype,w1,w2,p1,p2,freq\n")
    for np_i, (np_item, np_freq) in enumerate(np_freqs.most_common()):
        w1, w2, p1, p2 = np_item
        # nptype encodes the two component lengths, e.g. "22" for 2 + 2 characters.
        nptype = f"{len(w1)}{len(w2)}"
        fields = [
            str(np_i + 1),          # 1-based rank
            _csv_quote(w1 + w2),
            nptype,
            _csv_quote(w1),
            _csv_quote(w2),
            _csv_quote(p1),
            _csv_quote(p2),
            str(np_freq),
        ]
        fout.write(",".join(fields) + "\n")

In [30]:
# Reload the exported table and keep only compounds whose nptype is 22
# (both parts two characters long, read back as an integer) and whose
# two POS tags are both "NN".
npc = pd.read_csv("../data/wiki2019_compounds.csv", index_col=0)
is_2plus2 = npc["nptype"] == 22
is_nn_nn = (npc["p1"] == "NN") & (npc["p2"] == "NN")
compounds_nn2 = npc[is_2plus2 & is_nn_nn].reset_index(drop=True).rename_axis("idx")
compounds_nn2

Unnamed: 0_level_0,np,nptype,w1,w2,p1,p2,freq
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,航空公司,22,航空,公司,NN,NN,12
1,国内航线,22,国内,航线,NN,NN,5
2,自治联盟,22,自治,联盟,NN,NN,4
3,菁英会员,22,菁英,会员,NN,NN,4
4,月光公主,22,月光,公主,NN,NN,3
...,...,...,...,...,...,...,...
206,主力MS,22,主力,MS,NN,NN,1
207,战场对抗,22,战场,对抗,NN,NN,1
208,主力机体,22,主力,机体,NN,NN,1
209,吉翁战争,22,吉翁,战争,NN,NN,1


In [28]:
# Save the NN+NN subset built in the previous cell. The original line referenced
# `asbc_compounds_nn2`, which is never defined in this notebook, and targeted the
# ASBC output file. Note that the streaming filter cell below rewrites this same file.
compounds_nn2.to_csv("../data/wiki2019_compounds_nn2.csv")

In [1]:
# Stream-filter the full compounds CSV without loading it into memory: copy the
# header, then keep only the rows with nptype 22 whose two POS tags are both "NN".
# Both files are managed by `with`, so they are closed even if a read or write fails.
with open("../data/wiki2019_compounds.csv", "r", encoding="UTF-8") as fin:
    with open("../data/wiki2019_compounds_nn2.csv", "w", encoding="UTF-8") as fout:
        fout.write(fin.readline())  # header row
        for ln in fin:
            # Plain substring tests; they rely on the exact quoting used by the export cell.
            if (",22," in ln) and ('"NN","NN"' in ln):
                fout.write(ln)