# 导入包及预处理语料

In [2]:
import json
import sys
import os
import random
from collections import Counter

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from sklearn.cluster import KMeans, DBSCAN
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pylab as plt
import seaborn as sns
# 修改plt默认的样式，包括背景、网格线等
sns.set(style="white", palette="muted", color_codes=True)
# 支持中文标题、标签 for mac
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] 

In [3]:
# 更改k-means源码后立即升效
%load_ext autoreload
%autoreload 2

In [73]:
# NLP预处理
import hanlp
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
tagger = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)
recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)

In [4]:
with open('./data/CLP2010/parsed.json','r') as f:
    corpus_labeled_parsed = json.load(f)
with open('./data/CLP2010/flat.json','r') as f:
    corpus_labeled_flat = json.load(f)
with open('./data/CLP2010/meta.json','r') as f:
    corpus_labeled_meta = json.load(f)
with open('./data/CLP2010/corpus.json','r') as f:
    corpus_labeled = json.load(f)

# 语料及预处理

## help function

In [3]:
def check_path(path, *args):
    """检查目录是否存在，不存在就创建"""
    path = path % args
    parent_path = os.path.split(path)[0]
    if not os.path.exists(parent_path):
        os.makedirs(parent_path)
    return path

In [39]:
def get_files(root, bool_func=False):
    """获得某个文件夹下的子文件，可以传递一个bool_func函数，判断子文件是否保留"""
    if not bool_func:
        bool_func = lambda x: True
    files = []
    for file in os.listdir(root):
        if bool_func(file):
            files.append(os.path.join(root,file)) 
    return files

## 分词&依存句法

In [5]:
def process_long_sentence(sentence, target):
    """长句取包含目标词的短句，也就是标点之前包含目标词的部分"""
    target_start = sentence.find(target)
    left_start = right_end = 0
    for i in range(target_start-1,-1,-1):
        if sentence[i] in [',','.','，','。',';','；']:
            left_start = i
            break
    for i in range(target_start,len(sentence),1):
        if sentence[i] in [',','.','，','。',';','；'] or i==len(sentence)-1:
            right_end = i
            break
    return sentence[left_start+1:right_end+1] if left_start!=0 else sentence[left_start:right_end+1]

In [89]:
def parse(sentence, target=False):
    if len(sentence)>120:
        sentence = process_long_sentence(sentence,target)
    parse_result = {}
    is_target_split = False
    tokens = tokenizer(sentence)
    if target and target not in tokens:
        tokens_new = []
        for index, tok in enumerate(tokens):
            if target in tok:
                start = tok.find(target)
                tokens_new+=([tok[:start],target] if start>0 else [target,tok[start+len(target):]])
            else:
                tokens_new.append(tok)
        tokens = tokens_new
        is_target_split = True
    pos = tagger(tokens)
    dp = syntactic_parser(list(zip(tokens,pos)))
    ne = recognizer(list(sentence))
    parse_result['tokens'] = tokens
    parse_result['pos'] = pos
    parse_result['ne'] = ne
    parse_result['dp'] = [{'index':d['id']-1,'token':d['form'], 'pos':d['cpos'],'relate': d['deprel'], 'head': d['head']-1} for d in dp]
    parse_result['sentence'] = sentence
    parse_result['is_target_split'] = is_target_split
    if target:
        parse_result['target'] = target
        try:
            parse_result['target_index'] = tokens.index(target)
        except:
            # 由于分词原因，目标词没有被切分出来，而是和其他字词成为了一个复合词，例如“标兵区”，而没切成“标兵”、“区”
            parse_result['target_index'] = -1 
    return parse_result 

## 预处理

In [93]:
from bs4 import BeautifulSoup
with open ('./data/CLP2010/AnswerData/暗淡.xml') as f:
    soup = BeautifulSoup(f)

In [129]:
corpus_labeled = {}
for file in get_files('./data/CLP2010/AnswerData/'):
    with open(file) as f:
        soup = BeautifulSoup(f)
    word = soup.lexelt['item']
    res = {}
    for sense in soup.select('sense'):
        res[sense['s']] = {'full_text':[],'bingo_sentence':[]}
        for instance in sense.select('instance'):
            full_text = instance.string.strip()
            res[sense['s']]['full_text'].append(full_text)
            for sentence in hanlp.utils.rules.split_sentence(full_text):
                if word in sentence:
                    res[sense['s']]['bingo_sentence'].append(sentence)
                    break
    corpus_labeled[word]=res

corpus_labeled_meta = {}
for target in corpus_labeled:
    num_sense = len(corpus_labeled[target].keys())
    senses = []
    num_sentense_of_each_sense = {}
    for sense in corpus_labeled[target].keys():
        num_sentense_of_each_sense[sense] = len(corpus_labeled[target][sense]['full_text'])
        senses.append(sense) 
    corpus_labeled_meta[target] = {'num_sense':num_sense,'senses':senses,'num_sentense_of_each_sense':num_sentense_of_each_sense}

In [183]:
corpus_labeled_parsed = {}
for target in corpus_labeled:
    corpus_labeled_parsed[target] = []
    for sense in corpus_labeled[target]:
        for sentence in corpus_labeled[target][sense]['bingo_sentence']:
            parse_result = parse(sentence,target)
            parse_result['sense'] = sense
            corpus_labeled_parsed[target].append(parse_result)

In [4]:
#加id，以及flat
for target, sentences in corpus_labeled_parsed.items():
    for index, s in enumerate(sentences):
        s['id'] = target+'_'+s['sense']+'_'+str(index)

corpus_labeled_flat = {}
for target, sentences in corpus_labeled_parsed.items():
    for index, s in enumerate(sentences):
        corpus_labeled_flat[s['id']] = s

## 持久化

In [5]:
with open('./data/CLP2010/parsed.json','w') as f:
    json.dump(corpus_labeled_parsed,f,ensure_ascii=False)
with open('./data/CLP2010/meta.json','w') as f:
    json.dump(corpus_labeled_meta,f,ensure_ascii=False)
with open('./data/CLP2010/corpus.json','w') as f:
    json.dump(corpus_labeled,f,ensure_ascii=False)
with open('./data/CLP2010/flat.json','w') as f:
    json.dump(corpus_labeled_flat,f,ensure_ascii=False)

In [182]:
for target in corpus_labeled:
    for sense in corpus_labeled[target]:
        for sentence in corpus_labeled[target][sense]['bingo_sentence']:
            if len(sentence)>120:
                s = process_long_sentence(sentence,target)
                if len(s)==0:
                    print(sentence,target)

## 结果及格式
**`corpus_labeled_parsed`**: a dict,语料预处理后的结果。将多义词作为key，value是一个列表，每一个item是一个dict，对应一个例句解析后的结果。格式如下：
```
{
    '标兵': [
        { # 例句1
            sense: 'S0',
            is_target_split: False,
            sentence: '加快发展,向标兵看齐,目前已经变成大家自觉的行为。',
            target: '标兵',
            target_index: 4,
            tokens: ['发展',...],
            pos: ['VV',...],
            ne: [{},...],
            dp: [
                {'index': 0, 'token': '加快', 'pos': 'VV', 'relate': 'conj', 'head': 9},
                ...
            ]
        },
        {...}, # 例句2
        ...
    ],
    '东北': [
        
    ]
}
```

**`corpus_labeled`**：语料，结构：
```
{
    '标兵': {
        'S0': {
            'full_text': ['xxx,'xx',...],
            'bingo_sentence': ['xxx,'xx',...]
        },
        'S1': {
            'full_text': ['xxx,'xx',...],
            'bingo_sentence': ['xxx,'xx',...]
        },
        ...
    }
}
```

**`corpus_labeled_meta`**: 语料元信息
```
{
    '标兵':{'num_sense':2,'senses':['S0','S1'],'num_sentense_of_each_sense':{'S0':18,'S1':32}},
    ...
}
```

# 特征词提取

In [6]:
def filter_punct(item): 
    return item['relate']=='punct'

In [140]:
def get_linked_words(sentence_parsed,filter_func=False):
    """从解析后的句子中提取上下文依存词汇，sentence_parsed格式见corpus_labeled_parsed
    return：[{'token': xxx,'relate': xxx, 'is_head': False},...]
    """
    s = sentence_parsed
    linked = []
    target_index = s['target_index']
    for item in s['dp']:
        if item['head'] == target_index:
            # is_head标明这个关联词在这个依存关系中是不是head
            # 当前循环是添加所有以target为head的词，所以添加的这些词的is_head都是false
            if filter_func and filter_func(item):
                continue
            linked.append({**item, 'is_head': False, 'dist': abs(target_index-item['index']), 'type':'dp'})
    target_head = s['dp'][target_index]['head']
    if target_head>=0:
        linked.append({**s['dp'][target_head], 'is_head': True, 'dist': abs(target_index-s['dp'][target_head]['index']), 'type':'dp'})
    return linked

def get_window_words(sentence_parsed,window=5,filter_func=False):
    """从解析后的句子中提取上下文window内的词汇，sentence_parsed格式见corpus_labeled_parsed
    return：[{'token': xxx,'relate': xxx, 'is_head': False},...]
    """
    s = sentence_parsed
    window_words = []
    target_index = s['target_index']
    for index in range(target_index-window,target_index+window):
        if index>-1 and index!=target_index and index<len(s['tokens']):
            item = s['dp'][index]
            if filter_func and filter_func(item):
                continue
            window_words.append({**item, 'is_head': False, 'dist': abs(target_index-item['index']), 'type':'window'})
    return window_words

def get_mixed_words(sentence_parsed,window=5,filter_func=False):
    linked_words = get_linked_words(sentence_parsed,filter_func=filter_func)
    linked_indexes = [item['index'] for item in linked_words]
    window_words = get_window_words(sentence_parsed,window=window,filter_func=filter_func)
    mixed_words = linked_words.copy()
    for item in window_words:
        if item['index'] not in linked_indexes:
            mixed_words.append(item)
    return mixed_words

# 权重

## dp and pos weight

In [141]:
dp_pos_dict  = {
    'nsubj': 2,
     'dobj': 2,
     'pobj': 2,
     'xsubj': 2,
     'nsubjpass': 2,
     'nn': 2,
     'conj': 2,
     'amod': 2,
     'rcmod': 2,
     'cc': 2,
     'dvpmod': 2,
    'VA': 1.2, 'VV': 1.2, 'NR': 1.2, 'NT': 1.2, 'NN': 1.2, 'AD': 1.2
               }
def get_dp_weight(item):
    dp_pos = item['type']
    dp_type = item['relate']
    pos_type = item['pos']
    if dp_pos=='dp':
        return dp_pos_dict.get(dp_type,1.6)
    else:
        return dp_pos_dict.get(pos_type,1)

## tfidf

In [142]:
def get_tfidf_weight(contexts, bind_dp=False):
    corpus = [' '.join([item['token'] for item in context]) for context in contexts]
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    X = vectorizer.fit_transform(corpus).toarray()
    terms = vectorizer.get_feature_names()
    weights = []
    for i,context in enumerate(contexts):
        weight = []
        for item in context:
            try:
                w = X[i, terms.index(item['token'])]
                weight.append(w*get_dp_weight(item) if bind_dp else w)
            except:
                weight.append(0)
        weights.append(weight)
    return weights

## tf

In [147]:
def get_tf_weight(contexts, bind_dp=False):
    counter = Counter()
    total_tf = 0
    for c in contexts:
        counter.update([item['token'] for item in c])
        total_tf+=len(c)
    weights = []
    for i,context in enumerate(contexts):
        weight = []
        for item in context:
            w = counter[item['token']]/total_tf
            weight.append((w*get_dp_weight(item) if bind_dp else w))
        weights.append(weight)
    return weights

## PMI

In [148]:
def get_tf(window=5):
    counter = Counter()
    for sid,s in corpus_labeled_flat.items():
        context_items = get_mixed_words(s,window=window,filter_func=filter_punct)
        context_words = [word['token'] for word in context_items]
        words_bind_target = [f'{word}-{s["target"]}' for word in context_words]
        counter.update(context_words)
        counter.update(words_bind_target)
        counter.update([s["target"]])
    return counter

tf_dict = {}

def get_pmi_weight(contexts,target,window=5, bind_dp=False):
    if window in tf_dict:
        tf_base = tf_dict[window]
    else:
        tf_base = get_tf(window=window)
        tf_dict[window] = tf_base

    tf_total = sum(tf_base.values())
    
    weights = []
    for i,context in enumerate(contexts):
        weight = []
        for item in context:
            token = item['token']
            bind_tf = tf_base[f'{token}-{target}']
            w = np.log2(bind_tf*tf_total/tf_base[token]/tf_base[target])
            weight.append((w*get_dp_weight(item) if bind_dp else w))
        weights.append(weight)
    return weights

# 计算上下文向量

In [12]:
# 训练好的embedding
with open('/Users/zeason/usr/data/WSI/news300d') as f:
    wv = KeyedVectors.load_word2vec_format(f, binary=False)

In [149]:
def get_context_vec(words,weights):
    """计算上下文词向量，words格式[{'token': xxx,'relate': xxx, 'is_head': False}, ...]
    """
    bingo = []
    missed = []
    if(len(words)==0):
        return np.zeros(300)
    context = np.zeros(300)
    for word, weight in zip(words,weights):
        if word['token'] in wv:
            bingo.append(word['token'])
            context += wv.get_vector(word['token'])*weight
        else:
            missed.append(word['token'])
    return context/len(words), bingo, missed

# 得到上下文向量化表示
def get_vecs(target,context_type='dp',window=5,filter_func=filter_punct, weight_type='norm', bind_dp=False):
    """得到一个target的所有上下文的向量化表示,以及周边信息"""
    vecs = []
    bingo = []
    missed =[]
    labels = []
    sentence_ids = []
    contexts = []
    weights = []
    # 上下文词汇
    for s in corpus_labeled_parsed[target]:
        if s['target_index']>-1:
            context_words = []
            if context_type=='dp':
                context_words = get_linked_words(s,filter_func=filter_func)
            if context_type=='window':
                context_words = get_window_words(s,window=window,filter_func=filter_func)
            if context_type=='mixed':
                context_words = get_mixed_words(s,window=window,filter_func=filter_func)
            contexts.append(context_words)
            labels.append(s['sense'])
            sentence_ids.append(s['id'])
    # 词项权重        
    if weight_type=='norm':
        for context in contexts:
            weights.append([1]*len(context))
    elif weight_type=='tfidf':
        weights=get_tfidf_weight(contexts, bind_dp=bind_dp)
    elif weight_type=='tf':
        weights=get_tf_weight(contexts, bind_dp=bind_dp)
    elif weight_type=='pmi':
        weights=get_pmi_weight(contexts,target=target, window=window, bind_dp=bind_dp)
     
    # 计算综合向量
    for context, weight in zip(contexts, weights):
        vec, _bingo, _missed = get_context_vec(context, weight)
        vecs.append(vec)
        bingo.append(_bingo)
        missed.append(_missed)
    info = {'bingo':bingo,'missed':missed,'ids':sentence_ids, 'contexts': contexts, 'weights':weights}
    return vecs, labels, info

# 聚类

## 聚类算法

### kmeans and dbscan

In [14]:
# 聚类得到预测标签,并保存每个簇的句子
def km_cluster(X, k=2,**kwargs):
    km = KMeans(n_clusters=k, max_iter=400, n_init=15, init='random',**kwargs)
    km.fit(X)
    return km.labels_, km.inertia_

def dbscan(X, eps=0.1, minPts=3):
    db =  DBSCAN(eps=eps, min_samples=minPts, algorithm='auto', n_jobs=-1)
    db.fit(data)
    return db.labels_, None 

### kmeans

## 二次聚类 re-weight

In [288]:
def re_calc_vecs(contexts,weights,pred):
    word_freq_dict = {}
    word_reweight_dict = {}
    new_vecs = []
    ens = []
    for context, label in zip(contexts, pred):
        for item in context:
            if item['token'] in word_freq_dict:
                if label in word_freq_dict[item['token']]:
                    word_freq_dict[item['token']][label]+=1
                else:
                    word_freq_dict[item['token']][label]=1
            else:
                word_freq_dict[item['token']]={label:1,'total':0} 
            word_freq_dict[item['token']]['total'] +=1
    for token, freq in word_freq_dict.items():
        entropy = 0
        for c,f in freq.items():
            if c != 'total':
                p = f/freq['total']
                entropy -= p*np.log2(p)
        ens.append(entropy)
#         word_reweight_dict[token] = 1/entropy
        if entropy<=0.1:
            word_reweight_dict[token] = 4
        elif entropy<=0.5:
            word_reweight_dict[token] = 3
        elif entropy<=0.8:
            word_reweight_dict[token] = 2
        else:
            word_reweight_dict[token] = 1
    for context, weight in zip(contexts, weights):
        new_weight = []
        for item, item_weight in zip(context, weight):
            new_item_weight = item_weight*word_reweight_dict[item['token']]
            new_weight.append(new_item_weight)

        new_vec = get_context_vec(context, new_weight)[0]
        new_vecs.append(new_vec)
    
    return new_vecs, ens  

## 确定k

In [85]:
def bounding_box(data):
    dim = data.shape[1]
    boxes = []
    for i in range(dim):
        data_min = np.amin(data[:, i])
        data_max = np.amax(data[:, i])
        boxes.append((data_min, data_max))
    return boxes

def best_k_gap(data, max_K, B, cluster_algorithm,thre=0.05, **kwargs):
    num_points, dim = data.shape
    K_range = np.arange(1, max_K, dtype=int)
    num_K = len(K_range)
    boxes = bounding_box(data)
    data_generate = np.zeros((num_points, dim))
    log_Wks = np.zeros(num_K)
    gaps = np.zeros(num_K)
    sks = np.zeros(num_K)
    for ind_K, K in enumerate(K_range):
        _, inertia = cluster_algorithm(data,K,**kwargs)
        log_Wks[ind_K] = np.log(inertia)
        # generate B reference data sets
        log_Wkbs = np.zeros(B)
        for b in range(B):
            for i in range(num_points):
                for j in range(dim):
                    data_generate[i][j] = \
                        np.random.uniform(boxes[j][0], boxes[j][1])
            _, inertia = cluster_algorithm(data_generate,K,**kwargs)
            log_Wkbs[b] = \
                np.log(inertia)
        gaps[ind_K] = np.mean(log_Wkbs) - log_Wks[ind_K]
        sks[ind_K] = np.std(log_Wkbs) * np.sqrt(1 + 1.0 / B)
    
    best_k = 1
    for i in range(len(gaps)):
        if gaps[i+1]/gaps[i] - 1 < thre:
            best_k = i+1
            break
    return best_k

def best_k_elbow(data, max_k, cluster_algorithm, thre=0.1, **kwargs):
    best_k = 1
    for i in range(1, max_k):
        _, inertia = cluster_algorithm(data,i,**kwargs)
        if i>1 and inertia_pre/inertia-1<thre:
            best_k = i
            break
        inertia_pre =  inertia 
    return best_k

# 评估

In [151]:
# 实现CLP2010的F-score计算
def eval_f_score(pred, true):
    """实现CLP2010的F1 Score计算，pred and true: list,不要求用相同的标签去标记类别，标签的作用只是将样本划分而已"""
    n_pred = len(set(pred))
    n_true = len(set(true))
    f1_crs = []
    n_crs = []
    for cr in set(true):
        f1_cr_si = []
        for si in set(pred):
            n_si=n_cr=n_cr_si=0
            for index,label in enumerate(pred):
                if label==si:
                    n_si += 1
                if true[index]==cr:
                    n_cr += 1
                if label==si and true[index]==cr:
                    n_cr_si += 1
            p = n_cr_si/n_si
            r = n_cr_si/n_cr
            f1 = 2*p*r/(p+r) if p+r>0 else 0
            f1_cr_si.append(f1)
        f1_crs.append(max(f1_cr_si))
        n_crs.append(n_cr)
    n_total = sum(n_crs)
    f1_score = 0
    for f, n in zip(f1_crs,n_crs):
        f1_score += (f*n/n_total)
    return f1_score

In [18]:
def print_res(pred, info):
    res = {p: [] for p in set(pred)}
    for p, sid, bingo, missed in zip(pred,info['ids'],info['bingo'],info['missed']):
        res[p].append('%s %s\t%s' %(sid.ljust(10), process_long_sentence(corpus_labeled_flat[sid]['sentence'],corpus_labeled_flat[sid]['target']), (','.join(bingo) + '|' + ','.join(missed)).ljust(30)))
    for p in res:
        print(str(p).center(10,'*'))
        for row in res[p]:
            print(row)
        print()

In [298]:
def test(model, X, labels, bonus={'trigger':False},pred_ready=False, **kwargs):
    """
        evaluate cluster algorithm or features just on ONE target
        
        Args:
            model: cluster algorithm
            X: n*m matrix of features, n: rows， m: # features
            labels: list of n labels
            bonus: 彩蛋。。。就是把一个函数放进来运行，目前设计来打印结果。
    """
    pred,_ = model(np.array(X), **kwargs) if not pred_ready else pred_ready
    # f1 score
    f = eval_f_score(pred, labels)
    # v score
    hs = metrics.homogeneity_score(labels, pred)
    cs = metrics.completeness_score(labels, pred)
    vs = metrics.v_measure_score(labels, pred)
    
    if bonus['trigger']:
        print('F-score: %0.2f .' % f)
        print('v_measure: %0.2f, homogeneity: %0.2f, completeness: %0.2f .' % (vs, hs, cs))
        bonus['func'](pred,**bonus['params'])
        
    return {'F-score': f,'Homogeneity': hs,'Completeness': cs,'V-measure': vs}, pred

In [153]:
def test_all(model, X_all, labels_all, spec, display=True, **kwargs):
    """
        evaluate cluster algorithm or features just on ALL target
        
        Args:
            model: cluster algorithm
            X_all: dict, target as key, value is n*m matrix of features, n: rows， m: # features
            labels_all: dict, target as key, value is a list of n labels
            spec: dict，target as key，value是给每个target的参数，放到test里
    """
    scores = {}
    for target in X_all:
        scores[target], pred = test(model, X_all[target], labels_all[target], **spec[target],**kwargs)
        
    res =  pd.DataFrame(scores).T.applymap(lambda x: round(x,3))
    res = res[['F-score','V-measure','Homogeneity','Completeness']]
    res.sort_values('F-score',inplace=True)
    if display:
        display(res.describe([.25,.5,.75,.9,.95]))
        display(res)
    return res

# 实验

## 公共方法

In [296]:
def car(target,model=km_cluster,**kwargs):
    """对单个目标词进行测试"""
    vecs, labels, info = get_vecs(target,**kwargs)
    bonus = {'func':print_res,'params':{'info': info},'trigger':True}
    spec = {'k':corpus_labeled_meta[target]['num_sense'], 'bonus':bonus} 
    test(model, vecs, labels, **spec)

In [303]:
def car_reweight(target,model=km_cluster,**kwargs):
    """对单个目标词进行词义归纳，并使用二次聚类调整权重"""
    vecs, labels, info = get_vecs(target,**kwargs)
    bonus = {'func':print_res,'params':{'info': info},'trigger':True}
    spec = {'k':corpus_labeled_meta[target]['num_sense'], 'bonus':bonus} 
    pre_score, pre_pred = test(model, vecs, labels, **spec)
    
    # re-weight and re-run
    new_vecs, ens = re_calc_vecs(info['contexts'],info['weights'],pre_pred)
    score, pred = test(model, vecs, labels, **spec)

In [165]:
def bus(model=kmeans,index='all', k='gold_standard', **kwargs):
    """运行词义归纳系统，获得评测结果"""
    vecs_all = {}
    labels_all = {}
    spec = {}
    for target in corpus_labeled_meta:
        vecs, labels, info = get_vecs(target,**kwargs)
        vecs_all[target] = vecs
        labels_all[target] = labels
        bonus = {'func':print_res,'params':{'info': info},'trigger':False}
        if k=='gold_standard':
            spec[target] = {'k':corpus_labeled_meta[target]['num_sense'], 'bonus':bonus}
        else:
            spec[target] = {'k':k[target], 'bonus':bonus}
    res_mean = []
    for i in range(5):
        res = test_all(model,vecs_all,labels_all,spec,display=False)
        res_mean.append(res.mean())
    final_res = pd.DataFrame(res_mean).mean().to_frame().T
    final_res.index = [index]
    return final_res.applymap(lambda x: round(x*100,2))

## 上下文特征向量验证

### 特征词提取方式

In [179]:
scores_ext = []
for window in (5,7,8,10,12):
    res = bus(model=km_cluster,index=f'窗口上下文（window={i}）',context_type='window', window=window, weight_type='norm')
    scores_ext.append(res)
pd.concat(scores_ext)

Unnamed: 0,F-score,V-measure,Homogeneity,Completeness
窗口上下文（window=5）,72.46,38.36,37.11,40.31
窗口上下文（window=7）,74.22,42.33,41.26,44.03
窗口上下文（window=8）,75.33,42.92,42.15,44.16
窗口上下文（window=10）,75.24,42.75,42.08,43.93
窗口上下文（window=12）,73.87,41.3,40.34,42.72


In [191]:
scores_dp = bus(model=km_cluster,index=f'依存句法上下文',context_type='dp', window=8, weight_type='norm')
scores_dp

Unnamed: 0,F-score,V-measure,Homogeneity,Completeness
依存句法上下文,71.61,29.24,26.25,35.01


In [252]:
scores_mixed = []
for window in (5,7,8,10,12):
    res = bus(model=km_cluster,index=f'混合上下文（window={window}）',context_type='mixed', window=window, weight_type='norm')
    scores_mixed.append(res)
pd.concat(scores_mixed)

Unnamed: 0,F-score,V-measure,Homogeneity,Completeness
混合上下文（window=5）,74.53,42.29,40.91,43.76
混合上下文（window=7）,75.32,42.86,41.65,44.15
混合上下文（window=8）,76.15,43.58,42.59,44.62
混合上下文（window=10）,75.98,43.81,42.81,44.85
混合上下文（window=12）,75.71,43.04,42.1,44.02


### 特征词权重

In [263]:
scores_weight = []
for weight_type in [('tf','句法特征*词频'),('pmi','句法特征*pmi')]:
    res = bus(model=km_cluster,index=f'{weight_type[1]}',context_type='mixed', window=8, weight_type=weight_type[0], bind_dp=True)
    scores_weight.append(res)
df_weight = pd.concat(scores_weight)
df_weight

Unnamed: 0,F-score,V-measure,Homogeneity,Completeness
句法特征*词频,68.2,20.6,19.89,21.36
句法特征*pmi,78.48,47.79,45.88,49.87


## 改进 k-means 验证

### 初始中心和噪声对比

In [268]:
# 运行前需要修改sklearn源码
df_cz = bus(model=km_cluster,index='改进的k-means',context_type='mixed', window=8, weight_type='pmi', bind_dp=True)
df_cz

Unnamed: 0,F-score,V-measure,Homogeneity,Completeness
改进的k-means,79.42,47.42,46.55,48.33


### dbscan

In [310]:
df_db = bus(model=dbscan, index='dbscan',context_type='mixed', window=8, weight_type='pmi', bind_dp=True)

Unnamed: 0,F-score,V-measure,Homogeneity,Completeness
dbscan,67.07,34.42,33.4,35.51


### 确定k值

In [86]:
k_gap = []
for target in corpus_labeled_meta:
    vecs, labels, info = get_vecs('东北',context_type='window', window=8, weight_type='pmi',filter_func=filter_punct)
    k = best_k_gap(np.array(vecs), 10, 50, km_cluster)
    k_gap[target] = k
    print(f'{target}: {corpus_labeled_meta[target]["num_sense"]}, gap: {k}')

标兵: 2, gap: 3
扼杀: 2, gap: 3
东西: 3, gap: 3
补贴: 2, gap: 2
调动: 3, gap: 3
出口: 3, gap: 3
戳穿: 2, gap: 3
澄清: 2, gap: 4
材料: 3, gap: 3
春秋: 3, gap: 3
大人: 2, gap: 3
断交: 2, gap: 4
打: 21, gap: 2
反射: 2, gap: 3
程序: 2, gap: 2
吃饭: 2, gap: 3
动力: 2, gap: 3
东北: 2, gap: 3
初二: 2, gap: 3
发动: 3, gap: 3
断气: 2, gap: 2
冲洗: 2, gap: 4
打开: 3, gap: 4
保安: 2, gap: 3
单纯: 2, gap: 3
把握: 4, gap: 3
翻身: 2, gap: 3
打断: 2, gap: 3
导师: 2, gap: 3
采购: 2, gap: 3
报销: 2, gap: 3
保管: 3, gap: 3
打气: 2, gap: 2
草包: 2, gap: 3
大军: 2, gap: 3
打破: 2, gap: 3
冲撞: 2, gap: 3
便宜: 3, gap: 3
参加: 2, gap: 3
充电: 2, gap: 3
病毒: 2, gap: 3
背离: 2, gap: 3
发展: 2, gap: 2
东方: 2, gap: 3
杜鹃: 2, gap: 4
哺育: 2, gap: 3
比重: 2, gap: 3
大气: 2, gap: 3
暗淡: 2, gap: 3
大陆: 2, gap: 3


In [282]:
df_gap = bus(model=km_cluster,k=k_gap, index='Gap Statistic',context_type='mixed', window=8, weight_type='pmi', bind_dp=True)

Unnamed: 0,F-score,V-measure,Homogeneity,Completeness
Gap Statistic,78.79,47.04,46.32,48.33


### 二次聚类

**instance**

In [316]:
pre_scores = {}
scores = {}
for target in corpus_labeled_meta:
    vecs, labels, info = get_vecs(target, context_type='mixed', window=15, weight_type='pmi')
    bonus = {'func':print_res,'params':{'info': info},'trigger':False}
    spec = {'k':corpus_labeled_meta[target]['num_sense'], 'bonus':bonus} 
    
    pre_pack = []
    pack = []
    for i in range(10):
        pre_score, pre_pred = test(km_cluster, vecs, labels, **spec)

        # re-weight and re-run
        new_vecs, ens = re_calc_vecs(info['contexts'],info['weights'],pre_pred)
        score, pred = test(km_cluster, new_vecs, labels, **spec)
        
        pre_pack.append(pre_score)
        pack.append(score)
    pre_scores[target] = pd.DataFrame(pre_pack).mean()
    scores[target] = pd.DataFrame(pack).mean()
pre_res =  pd.DataFrame(pre_scores).T.applymap(lambda x: round(x,3))
pre_res = pre_res[['F-score','V-measure','Homogeneity','Completeness']]
res =  pd.DataFrame(scores).T.applymap(lambda x: round(x,3))
res = res[['F-score','V-measure','Homogeneity','Completeness']]
print(pre_res.mean())
print(res.mean())


F-score         0.79422
V-measure       0.45608
Homogeneity     0.44442
Completeness    0.47860
dtype: float64
F-score         0.80110
V-measure       0.47670
Homogeneity     0.46670
Completeness    0.49702
dtype: float64



In [304]:
car_reweight('程序',model=km_cluster, context_type='window', window=8, weight_type='pmi')

F-score: 0.80 .
v_measure: 0.29, homogeneity: 0.29, completeness: 0.29 .
****0*****
程序_S0_3    该出让符合法定出让程序。	以,元,挂牌,该,出让,符合,法定,出让|1314万    
程序_S0_4    进一步规范了此类案件的审理程序。	由此,进一步,规范,了,此,类,案件,的,审理|      
程序_S0_5    占有优势的股东有时可能强行启动临时会议程序。	占有,优势,的,股东,有时,可能,强行,启动,临时,会议| 
程序_S0_6    引咎辞职官员是可以依据宪法和法律所规定的程序重新担任公职的，	官员,是,可以,依据,宪法,和,法律,所,规定,的,重新,担任,公职,的,其,基本,依据,就|
程序_S0_7    公司股东会会议的召集应遵守并执行相应的程序。	股东会,会议,的,召集,应,遵守,并,执行,相应,的|   
程序_S0_8    按法定程序办事，	温家宝,指出,按,法定,办事,是,依法,行政,的,重要,内容|
程序_S0_9    其中强调要“加强行政程序的制度建设”，	其中,强调,要,加强,行政,的,制度,建设,这,一,点,引|
程序_S0_11   它们共同构成了作为税收征纳程序内容的征纳行为，	纳税,行为,它们,共同,构成,了,作为,税收,征纳,内容,的,征纳,行为,保证,了,税收,这|
程序_S0_12   组织审核评议等程序。	展示,公众,投票,评选,组织,审核,评议,等|       
程序_S0_13   基金管理人在履行适当程序后，	投资,的,其他,品种,基金,管理人,在,履行,适当,后,可以,将,其,纳入,投资,范围|
程序_S0_14   最后只有一块土地进入竞价程序，	元,最后,只,有,一,块,土地,进入,竞价,11,块,以,底价,成交|
程序_S0_18   第二步启动38个总行部门副总经理职务公开竞聘程序，	启动,38,个,总行,部门,副,总经理,职务,公开,竞聘,部门,副总,的,竞聘,工作,将,于,4|
程序_S0_19   我们对情况表实施了包括核对、询问、抽查会计记录等我们认为必要的工作程序。	抽查,会计,记录,等,我们,认为,必要,的,工作|     
程序_S0_20   眉山市政务服务中心规划和建设局窗口进一步简

## 大规模词义归纳

# stat

In [294]:
# dp依存词在宽为5的window外的占比达到20%
from collections import Counter
counter = Counter()
for s in corpus_labeled_flat:
    for token in get_linked_words(corpus_labeled_flat[s]):
        counter.update([token['dist']])
sum_under5=sum_over5=0
for k, v in counter.items():
    if k<8:
        sum_under5+=v
    else:
        sum_over5+=v
sum_under5,sum_over5,sum_over5/(sum_under5+sum_over5)

(6179, 1000, 0.1392951664577239)

In [84]:
sense_nums = []
c = Counter()
for target in corpus_labeled_meta:
    sense_nums.append(corpus_labeled_meta[target]['num_sense'])
c.update(sense_nums)
c

Counter({2: 39, 3: 9, 21: 1, 4: 1})