### 使用经过人工分词后的北京大学《人民日报》标注语料库，实现搭配自动发现程序
要求每种方法列出 前10个搭配的 **词对** 及其**得分**

In [330]:
import os,sys
import chardet  
import numpy as np
from copy import deepcopy

def paths(path):
    path_collection=[]
    for dirpath,dirnames,filenames in os.walk(path):
        for file in filenames:
            fullpath=os.path.join(dirpath,file)
            path_collection.append(fullpath)
    return path_collection

def get_tokens(path):
    words = [] #文档中出现的所有词
    for i,file in enumerate(paths(path)):
    # for i,file in enumerate(["corpus/01060108.txt"]):
        with open(file) as somefile:
            for line in somefile:
                linestr = line.strip().decode("gbk")
                for w in linestr.split('  '):
                    tmp = w.split('/')[0]
                    if len(tmp)>1 and tmp[0] == '[':
                        tmp = tmp[1:]
                    words.append(tmp)
    return words


separator = [u'。',u'？',u'！',u'：',u'；',u'?',u'!',u':',u';',u'，',u',',u'——',u'、',u'（',u'）']
def split_sentence(path_list):
    sentences = [] #文档中的所有句子
    for i,file in enumerate(path_list):
        with open(file) as somefile:
            for line in somefile:
                linestr = line.strip().decode("gbk")
                line_list = linestr.split('  ')
                sentence = []
                for w in line_list:
                    tmp = w.split('/')[0]
                    if len(tmp)>1 and tmp[0] == '[':
                        tmp = tmp[1:]
                    if tmp in separator:
                        if len(sentence) > 1:
                            sentences.append(sentence)
                        sentence = []
                    else:
                        sentence.append(tmp)
                if line_list[-1] not in separator and len(sentence) > 1:
                    sentences.append(sentence)
    return sentences

def formatF(number):
    return "%.6f" % np.log10(number)

In [331]:
corpus_path = "corpus/"

In [332]:
corpus_files = paths(corpus_path)
sentences = split_sentence(corpus_files)
tokens = get_tokens(corpus_path)

total_files = len(corpus_files)
total_sentences = len(sentences)
total_words = len(tokens)
print "文档数：",total_files
print "句子数：",total_sentences
print "词数：",total_words

文档数： 3148
句子数： 139300
词数： 1120721


In [316]:
def generateBigram(sentence):
    word_list = sentence.split(' ')
    bigram_list = []
    pre_word = word_list[0]
    for w in word_list[1:]:
        bigram_list.append(pre_word + " " + w)
        pre_word = w 
    return bigram_list

class NGram(object):
    def __init__(self, n):
        # n is the order of n-gram language model
        self.n = n
        self.unigram = {}
        self.bigram = {}

    # scan a sentence, extract the ngram and update their
    # frequence.
    #
    # @param    sentence    list{str}
    # @return   none
    def scan(self, sentences):
        # file your code here
        for sentence in sentences:
            self.ngram(sentence)
            
    # caluclate the ngram of the words
    #
    # @param    words       list{str}
    # @return   none
    def ngram(self, words):
        # unigram
        if self.n == 1:
            for word in words:
                if word not in self.unigram:
                    self.unigram[word] = 1
                else:
                    self.unigram[word] = self.unigram[word] + 1

        # bigram
        if self.n == 2:
            pre_word = words[0]
            for w in words[1:]:
                stri = pre_word + " " + w
                if stri not in self.bigram:
                    self.bigram[stri] = 1
                else:
                    self.bigram[stri] = self.bigram[stri] + 1
                pre_word = w 


### 1. 频率方法 

In [290]:
# uni = NGram(1)
bi1 = NGram(2)
# uni.scan(sentences)
bi1.scan(sentences)

In [265]:
sorted_bi = sorted(bi1.bigram.items(),key=lambda x:x[1],reverse=True)

In [267]:
for b in sorted_bi[:10]:
    print b[0],b[1]

” 的 970
的 一 844
的 “ 843
新 的 734
这 一 645
电 （ 574
这 是 569
的 发展 537
（ 记者 530
一 种 529


In [28]:
from pyltp import Segmentor  
from pyltp import Postagger  
from pyltp import NamedEntityRecognizer

LTP_DATA_DIR = '../ltp_data'  # ltp模型目录的路径
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # 分词模型路径，模型名称为`cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')

In [17]:
segmentor = Segmentor()  
segmentor.load(cws_model_path)  # 加载模型
postagger = Postagger()  
postagger.load(pos_model_path)  # 加载模型
recognizer = NamedEntityRecognizer()  
recognizer.load(ner_model_path)  #加载模型  

In [262]:
segmentor.release()  #释放模型  
# postagger.release()  #释放模型  
recognizer.release()  #释放模型  

In [291]:
raw_rule = ['a n','n n'] # 是否考虑加上nd，nh，ni，nl，ns，nt，nz
n_set = ['nz','n']
filter_rule = raw_rule

In [292]:
def filter_bigram(bigram_dict):
    bigram_dict_cp = deepcopy(bigram_dict)
    for k,v in bigram_dict_cp.items():
        bi_list = k.encode('utf-8').split(' ')
        postags = list(postagger.postag(bi_list))
        if (postags[0] == 'a' or postags[0] in n_set) and postags[1] in n_set:
            pass
        else:
            bigram_dict_cp.pop(k)
    return bigram_dict_cp

In [293]:
filtered_bigram = filter_bigram(bi1.bigram)

In [295]:
sorted_filtered_bigram = sorted(filtered_bigram.items(),key=lambda x:x[1],reverse=True) 

In [296]:
for i in sorted_filtered_bigram[:10]:
    print i[0],i[1]

领导 干部 252
讯 记者 225
电 记者 219
金融 危机 180
人民 群众 139
社会主义 市场经济 120
人民 检察院 120
两岸 关系 115
多 人 113
金融 机构 95


### 2. 均值-方差方法

In [297]:
window_size = 3

In [317]:
class Mean_Variance(object):
    def __init__(self):
        self.win_bigram = {}
        self.mean_deviation = {}
    
    def generate_win_bigram(self, sentences, window_size=3):
        for word_list in sentences:
            bigram_list = []
            for i,w in enumerate(word_list):
                if i < window_size:
                    first_word = [word_list[j] for j in range(i)] # in the window
                    distance = [j for j in range(1,i+1)]
                else:
                    first_word = [word_list[j] for j in range(i-window_size,i)] # in the window
                    distance = [j for j in range(window_size,0,-1)]
                second_word = w
                for k,f_w in enumerate(first_word):
                    b = f_w + " " + w
                    if not self.win_bigram.has_key(b):
                        self.win_bigram[b] = {}
                        self.win_bigram[b]['dist'] = [distance[k]]
                    else:
                        self.win_bigram[b]['dist'].append(distance[k])
                        
    def calc_mean_deviation(self):
        for k,v in self.win_bigram.items():
            dist_list = v['dist']
            if len(dist_list) < 10:
                self.win_bigram.pop(k)
            else:
                mean = 1.0*sum(dist_list)/len(dist_list)
                variance = 1.0*sum([(dist-mean)**2 for dist in dist_list])/(len(dist_list)-1)
                deviation = 1.0*variance**0.5
                self.win_bigram[k]['mean'] = mean
                self.win_bigram[k]['deviation'] = deviation
                
    def rank(self):
        deviation = {}
        for k,v in self.win_bigram.items():
            if v['mean'] > 1:
                deviation[k] = v['deviation']
        return sorted(deviation.items(),key=lambda x:x[1])

In [318]:
mv = Mean_Variance()
mv.generate_win_bigram(sentences)
mv.calc_mean_deviation()

In [319]:
result = mv.rank()

In [324]:
bi2 = mv.win_bigram
for i in result[:10]:
    print i[0],i[1],bi2[i[0]]['mean'],len(bi2[i[0]]['dist']) # w1w2 s d count

口 航道 0.0 2.0 12
记者 林昌 0.0 2.0 11
气象 预报 0.0 2.0 33
联合国 小组 0.0 3.0 11
记者 西平 0.0 2.0 13
之 》 0.0 2.0 30
已 家 0.0 3.0 20
泽民 说 0.0 2.0 25
新华社 ８日 0.0 3.0 26
２４日 （ 0.0 2.0 17


### 3. 假设检验方法

In [360]:
uni3 = NGram(1)
uni3.scan(sentences)
bi3 = NGram(2)
bi3.scan(sentences)

In [350]:
class t_Test():
    def __init__(self,unigram,bigram):
        self.unigram = unigram
        self.bigram = bigram
    
    def calc_word_prob(self,word):
        count = self.unigram[word] if self.unigram.has_key(word) else 0
        return 1.0*count/len(self.bigram)

    def calc_mu(self,bigram):
        bi_list = bigram.split(' ')
        first_prob = self.calc_word_prob(bi_list[0])
        second_prob = self.calc_word_prob(bi_list[1])
        return first_prob*second_prob

    def calc_mean_x(self,bigram):
        count = self.bigram[bigram] if self.bigram.has_key(bigram) else 0
        return 1.0*count/len(self.bigram)

    def calc_t(self,bigram):
        mean_x = self.calc_mean_x(bigram)
        mu = self.calc_mu(bigram)
        N = len(self.bigram)
        variance = mean_x
        return (mean_x-mu)/(variance/N)**0.5

    def t_test(self,bigram):
        if self.calc_t(bigram) > 2.576:# significant level = 0.005
            return True
        return False # fail to reject the null hypothesis
    
    def rank(self):
        rank = {}
        for bigram,v in self.bigram.items():
            rank[bigram] = self.calc_t(bigram)
        return sorted(rank.items(),key=lambda x:x[1],reverse=True)

In [352]:
t_test = t_Test(uni3.unigram,bi3.bigram)
result = t_test.rank()

In [419]:
for i in result[:10]:
    bi_list = i[0].split(' ')
    t = round(i[1], 5)
    print i[0],t,uni3.unigram[bi_list[0]],uni3.unigram[bi_list[1]],bi3.bigram[i[0]] # w1w2 t c1 c2 c12

这 一 23.06023 3196 7253 645
一 种 22.31461 7253 849 529
两 国 22.09021 1952 806 496
本报 讯 21.41862 1467 700 464
江 泽民 21.0858 602 451 446
北京 １月 20.89613 1364 1781 449
一 年 20.77873 7253 2516 521
这 是 20.48581 3196 9819 569
据 新华社 20.1484 1008 1175 412
年 来 19.63223 2516 1618 406


In [391]:
stopword = []
with open('./stopword.txt') as f:
    for w in f:
        stopword.append(w.strip().decode('utf-8'))

In [413]:
stopword_bi = {}
for k,v in bi3.bigram.items():
    bi_list = k.split(' ')
    if (bi_list[0] not in stopword) and (bi_list[1] not in stopword):
        stopword_bi[k] = v

In [417]:
t_test2 = t_Test(uni3.unigram,stopword_bi)
result2 = t_test2.rank()

In [418]:
for i in result2[:10]:
    bi_list = i[0].split(' ')
    t = round(i[1], 5)
    print i[0],t,uni3.unigram[bi_list[0]],uni3.unigram[bi_list[1]],bi3.bigram[i[0]] # w1w2 t c1 c2 c12

两 国 21.84635 1952 806 496
本报 讯 21.25405 1467 700 464
江 泽民 21.04142 602 451 446
北京 １月 20.50037 1364 1781 449
附 图片 17.07914 294 369 293
新华社 北京 16.34178 1175 1364 286
新华社 记者 15.54849 1175 2129 271
本报 记者 15.51512 1467 2129 277
领导 干部 15.47787 1131 926 252
改革 开放 14.7841 1280 355 224


### 4. 点对互信息方法

In [424]:
class Mutual_Information():
    def __init__(self,unigram,bigram):
        self.unigram = unigram
        self.bigram = bigram
        
    def calc_bigram_prob(self,bigram):
        count = self.bigram[bigram] if self.bigram.has_key(bigram) else 0
        return 1.0*count/len(self.bigram)
    
    def calc_unigram_prob(self,unigram):
        count = self.unigram[unigram] if self.unigram.has_key(unigram) else 0
        return 1.0*count/len(self.unigram)
    
    def calc_I(self,bigram):
        bi_list = bigram.split(' ')
        first_w = self.calc_unigram_prob(bi_list[0])
        second_w = self.calc_unigram_prob(bi_list[1])
        bi_prob = self.calc_bigram_prob(bigram)
        return np.log2(bi_prob/(first_w*second_w))
        
    def rank(self):
        rank = {}
        for bigram,frequency in self.bigram.items():
            if frequency > 2:
                rank[bigram] = self.calc_I(bigram)
#             rank[bigram] = self.bigram[bigram]*self.calc_I(bigram)
        return sorted(rank.items(),key=lambda x:x[1],reverse=True)

In [425]:
uni4 = NGram(1)
uni4.scan(sentences)
bi4 = NGram(2)
bi4.scan(sentences)

In [426]:
mi = Mutual_Information(uni4.unigram,bi4.bigram)
result = mi.rank()

In [430]:
for res in result[:10]:
    bi_list = res[0].split(' ')
    I = round(res[1],6)
    print res[0],I,uni4.unigram[bi_list[0]],uni4.unigram[bi_list[1]],bi4.bigram[res[0]]#w1w2 I c1 c2 c12

丹参 滴丸 11.280967 3 3 3
胡图族 叛乱者 11.280967 3 3 3
波分 复用 11.280967 3 3 3
货仓式 自选商场 11.280967 3 3 3
管理课 课长 11.280967 3 3 3
孔雀 开屏 11.280967 3 3 3
诸葛 仓麟 11.280967 3 3 3
上虞 风机厂 11.280967 3 3 3
宫内 节育器 11.280967 3 3 3
± ％ 11.280967 3 3 3


In [428]:
mi2 = Mutual_Information(uni4.unigram,stopword_bi)
result2 = mi2.rank()

In [431]:
for res in result2[:10]:
    bi_list = res[0].split(' ')
    I = round(res[1],6)
    print res[0],I,uni4.unigram[bi_list[0]],uni4.unigram[bi_list[1]],bi4.bigram[res[0]]#w1w2 I c1 c2 c12

管理课 课长 12.512658 3 3 3
孔雀 开屏 12.512658 3 3 3
胡图族 叛乱者 12.512658 3 3 3
波分 复用 12.512658 3 3 3
丹参 滴丸 12.512658 3 3 3
诸葛 仓麟 12.512658 3 3 3
上虞 风机厂 12.512658 3 3 3
文传 电讯社 12.512658 3 3 3
草浆 书写纸 12.512658 3 3 3
货仓式 自选商场 12.512658 3 3 3
