In [None]:
# Groups of ConceptNet relation names that are merged into one relation.
# Each entry is a "/"-separated list; the first name is the canonical relation
# that every other name in the group maps to. A leading "*" marks a relation
# whose head and tail are swapped when it is merged into the canonical one.
relation_groups = [
    "atlocation/locatednear",
    "capableof",
    "causes/causesdesire/*motivatedbygoal",
    "createdby",
    "desires",
    "antonym/distinctfrom",
    "hascontext",
    "hasproperty",
    "hassubevent/hasfirstsubevent/haslastsubevent/hasprerequisite/entails/mannerof",
    "isa/instanceof/definedas",
    "madeof",
    "notcapableof",
    "notdesires",
    "partof/*hasa",
    "relatedto/similarto/synonym",
    "usedfor",
    "receivesaction",
]

In [None]:
pip install zhconv

In [None]:
import zhconv
import json

In [None]:
def load_merge_relation():
    """
    Builds the lookup table that merges raw relation names into canonical ones.

    Every entry of the module-level ``relation_groups`` list is split on "/";
    the first name of the group is the canonical relation. A member written
    with a leading "*" is stored without the "*" and maps to "*" + canonical,
    which marks the relation as reversed.
    :return: Dict mapping each relation name to its (possibly "*"-prefixed)
             canonical relation name.
    """
    merged = {}
    for group in relation_groups:
        members = group.strip().split("/")
        canonical = members[0]
        for member in members:
            is_reversed = member.startswith("*")
            key = member[1:] if is_reversed else member
            merged[key] = "*" + canonical if is_reversed else canonical
    return merged

In [None]:
def del_pos(s):
    """
    Strips a trailing part-of-speech tag from an entity string.

    The recognised tags are the two-character suffixes "/n", "/a", "/v" and "/r".
    :param s: Entity string.
    :return: The string without its final two characters when it ends in one of
             the tags above, otherwise the string unchanged.
    """
    pos_suffixes = ("/n", "/a", "/v", "/r")
    return s[:-2] if s.endswith(pos_suffixes) else s

# Show the merged relation mapping as a quick sanity check.
print(load_merge_relation())

In [None]:
def extract_chinese(concept_path, output_csv_path, output_vocab_path):
    """
    Extracts Chinese-to-Chinese edges from a tab-separated ConceptNet dump.

    A line is kept when its head and tail URIs both start with "/c/zh/" and its
    relation is a key of ``load_merge_relation()``. Relations are merged to
    their canonical name; for "*"-prefixed (reversed) relations head and tail
    are swapped. Concept names are converted with ``zhconv`` to 'zh-hans' and
    lowercased.
    :param concept_path: Path of the ConceptNet file to read (UTF-8).
    :param output_csv_path: Path that receives one "rel<TAB>head<TAB>tail<TAB>weight"
                            line per kept edge (UTF-8).
    :param output_vocab_path: Path that receives one concept per line, in order
                              of first appearance (UTF-8).
    :return: Tuple ``(cpnet_vocab, concept_relation)``: the list of unique
             concepts and the list of tab-joined edge lines.
    """
    print('extracting Chinese concepts and relations from ConceptNet...')
    relation_mapping = load_merge_relation()
    cpnet_vocab = []
    concept_relation = []
    concepts_seen = set()
    # Read the file named by the parameter rather than a notebook-level global.
    with open(concept_path, 'r', encoding="utf8") as fin:
        for line in fin:
            toks = line.strip().split('\t')

            # Blank or truncated lines have no head/tail/metadata columns; skip them.
            if len(toks) < 5:
                continue
            if not (toks[2].startswith('/c/zh/') and toks[3].startswith('/c/zh/')):
                continue

            # The relation name is the last URI segment, lowercased for uniformity.
            rel = toks[1].split("/")[-1].lower()
            if rel not in relation_mapping:
                continue

            # Preprocessing of the concepts:
            #   - remove the part-of-speech encoding,
            #   - keep only the last URI segment (drops the "/c/zh/" prefix),
            #   - convert to 'zh-hans' and lowercase for uniformity.
            head = zhconv.convert(del_pos(toks[2]).split("/")[-1], 'zh-hans').lower()
            tail = zhconv.convert(del_pos(toks[3]).split("/")[-1], 'zh-hans').lower()

            rel = relation_mapping[rel]
            if rel.startswith("*"):
                # Reversed relation: swap the endpoints and drop the marker.
                head, tail, rel = tail, head, rel[1:]

            data = json.loads(toks[4])

            concept_relation.append('\t'.join([rel, head, tail, str(data["weight"])]))
            for w in (head, tail):
                if w not in concepts_seen:
                    concepts_seen.add(w)
                    cpnet_vocab.append(w)

    # Explicit UTF-8: the vocabulary holds Chinese text and is read back as UTF-8.
    with open(output_vocab_path, 'w', encoding="utf8") as f:
        for word in cpnet_vocab:
            f.write(word + '\n')
    with open(output_csv_path, 'w', encoding="utf8") as fout:
        for rela in concept_relation:
            fout.write(rela + '\n')

    print(f'extracted ConceptNet csv file saved to {output_csv_path}')
    print(f'extracted concept vocabulary saved to {output_vocab_path}')
    print()
    return cpnet_vocab, concept_relation

In [None]:
# Input and output locations shared by the extraction and pattern-building steps.
conceptnet_path = '../input/raw-chineseconceptnet/chineseconceptnet.csv'
output_csv_path = 'conceptnet.zh.csv'
output_vocab_path = 'concept.txt'
output_path = 'matcher_patterns.zh.json'

# NOTE(review): pandas is not used in any visible cell — confirm before removing.
import pandas as pd

# NOTE(review): the lines below write two placeholder words to 'somepath.txt'.
# This looks like leftover scratch code unrelated to the extraction — confirm and remove.
things_to_write = ['why', 'this']

with open('somepath.txt', 'w+') as f:
    for th in things_to_write:
        f.write(th + '\n')

# Run the extraction: writes the edge file and the vocabulary file, and keeps
# both result lists in memory.
cpnet_vocab, concept_relation = extract_chinese(conceptnet_path, output_csv_path, output_vocab_path)

In [None]:
pip install zhconv

In [None]:
import zhconv
# Smoke test: print a sample word after conversion to the 'zh-hans' locale.
print(zhconv.convert('男仔', 'zh-hans'))

In [None]:
import nltk
import tqdm

In [None]:
def create_pattern(nlp, doc, debug=False):
    """
    Builds a lemma-based matcher pattern for one concept.

    A concept is rejected when it has no tokens, has five or more tokens, or
    starts or ends with a pronoun from the fixed list below.
    :param nlp: Not used by this function; kept so existing callers keep working.
    :param doc: Tokenized concept. Must support ``len()``, indexing and
                iteration, and expose ``.text``; tokens expose ``.text`` and
                ``.lemma_``.
    :param debug: When true, return a ``(accepted, doc.text)`` tuple instead of
                  the pattern.
    :return: With ``debug`` false: a list of ``{"LEMMA": ...}`` dicts, one per
             token, or ``None`` for a rejected concept. With ``debug`` true:
             ``(True, doc.text)`` or ``(False, doc.text)``.
    """
    pronoun_list = {"我", "你", "它", "它的", "你的", "他", "她", "他的", "她的", "他们", "他们的", "我们的", "我们"}
    # The emptiness check comes first so doc[0] / doc[-1] are never evaluated
    # on a concept that produced no tokens (e.g. a blank vocabulary line).
    rejected = (
        len(doc) == 0
        or len(doc) >= 5
        or doc[0].text in pronoun_list
        or doc[-1].text in pronoun_list
    )
    if rejected:
        # Ignore this concept as a pattern.
        return (False, doc.text) if debug else None
    if debug:
        return True, doc.text
    # A doc is a concept: one LEMMA constraint per token.
    return [{"LEMMA": token.lemma_} for token in doc]

In [None]:
def load_cpnet_vocab(cpnet_vocab_path):
    """
    Reads the concept vocabulary file, one concept per line.

    Each line is stripped of surrounding whitespace and every underscore is
    replaced by a space.
    :param cpnet_vocab_path: Path of the UTF-8 vocabulary file.
    :return: List of concept strings in file order.
    """
    vocab = []
    with open(cpnet_vocab_path, "r", encoding="utf8") as vocab_file:
        for raw_line in vocab_file:
            vocab.append(raw_line.strip().replace("_", " "))
    return vocab

In [None]:
pip install spacy

In [None]:
import spacy

In [None]:
pip list

In [None]:
pip install '../input/raw-chineseconceptnet/zh_core_web_sm-3.1.0/dist/zh_core_web_sm-3.1.0.tar'

In [None]:
def create_matcher_patterns(cpnet_vocab_path, output_path, debug=False):
    """
    Builds matcher patterns for every concept in the vocabulary and saves them as JSON.

    The vocabulary is loaded with ``load_cpnet_vocab``, run through the
    'zh_core_web_sm' spaCy pipeline, and each resulting doc is passed to
    ``create_pattern``. Accepted concepts are stored under their text with
    spaces replaced by underscores.
    :param cpnet_vocab_path: Path of the vocabulary file to read.
    :param output_path: Path of the JSON file that receives the
                        ``{concept: pattern}`` mapping.
    :param debug: When true, also write the rejected concepts, one per line,
                  to 'filtered_concept.txt'.
    :return: None.
    """
    cpnet_vocab = load_cpnet_vocab(cpnet_vocab_path)
    nlp = spacy.load('zh_core_web_sm', disable=['parser', 'ner', 'textcat'])

    all_patterns = {}
    filtered_concepts = []
    for doc in nlp.pipe(cpnet_vocab):
        # Always request the real pattern (never the debug tuple) so that
        # rejected concepts are skipped and only patterns are stored.
        pattern = create_pattern(nlp, doc)
        if pattern is None:
            filtered_concepts.append(doc.text)
            continue
        all_patterns[doc.text.replace(" ", "_")] = pattern

    print("Created " + str(len(all_patterns)) + " patterns.")
    with open(output_path, "w", encoding="utf8") as fout:
        json.dump(all_patterns, fout)
    if debug:
        # Explicit UTF-8 because the concepts are Chinese text.
        with open("filtered_concept.txt", "w", encoding="utf8") as f:
            for concept in filtered_concepts:
                f.write(concept + '\n')

# Run the pattern-building step on the vocabulary file produced by the extraction step.
create_matcher_patterns(output_vocab_path, output_path)