In [1]:
import random
import jieba

class ProcessDocument:
    def __init__(self, documents,stopwords_path):
        self.documents = documents
        self.words = {}
        self.stopwords = set()
        self.load_stopwords(stopwords_path)
    def load_stopwords(self, path):
        with open(path, 'r') as fin:
            for line in fin:
                self.stopwords.add(line.strip())
                
    def is_chn(self, words):
        for w in words:
            if not (u'\u4e00' <= w <= u'\u9fff' or \
               u'a' <= w <= u'z' or u'A' <= w <= u'Z'or \
               u'0' <= w <= u'9'):
                return False
        return True
    
    def process(self):
        texts = []
        for document in self.documents:
            text = []
            document = document.split('\t')[-1]
            for word in jieba.cut(document.strip()):
                if not self.is_chn(word): continue
                if word in self.stopwords: continue
                if len(word) < 2: continue
                if word not in self.words:
                    self.words[word] = 0
                self.words[word] += 1
                text.append(word)
            texts.append(text)
            
        self.words = sorted(self.words.items(), key=lambda x:x[1], reverse=True)
        self.words = [word for word,count in self.words]
        return self.words,texts
    
    def get_word2id(self):
        return dict([(word,i) for i,word in enumerate(self.words)])
    def get_id2word(self):
        return dict(enumerate(self.words))
    
documents = []
with open('sougo_texts2.txt') as fin:
    for line in fin:
        documents.append(line)
pd = ProcessDocument(documents, 'stopwords.txt')
words,texts = pd.process()
word2id = pd.get_word2id()
id2word = pd.get_id2word()
print(len(word2id))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/pt/qq532lj10pl42v3d1vqpz67r0000gn/T/jieba.cache
Loading model cost 0.985 seconds.
Prefix dict has been built succesfully.


1095


In [2]:
class LDAModel:
    def __init__(self,num_topic, word2id,id2word,documents):
        self.K = num_topic
        self.alpha = 50/self.K
        self.beta = 0.01
        self.z = []
        self.word2id = word2id
        self.id2word = id2word
        self.documents = documents
        
    def init_topic(self, n_mk, n_m, n_kt, n_k):
        M = len(self.documents)
        V = len(self.word2id)
        for m in range(M):
            self.z.append([])
            for w in self.documents[m]:
                k = random.randint(0,self.K-1)
                t = word2id[w]
                self.z[m].append(k)
                n_mk[m][k] += 1
                n_m[m] += 1
                n_kt[k][t] += 1
                n_k[k] += 1 
                
    def reSampling(self, m, t, n_mk, n_m, n_kt, n_k):
        
        topic_alpha = self.alpha * self.K
        topic_beta = self.beta * len(self.word2id)
        
        p = [0] * self.K
        for k in range(self.K):
            p[k] = (n_kt[k][t] + self.beta)/(n_k[k] + topic_beta) * (n_mk[m][k] + self.alpha)/(n_m[m] + topic_alpha)
            if k > 1:
                p[k] += p[k-1]
        u = random.random() * p[self.K -1]
        
        new_k = 0
        for i in range(self.K):
            if p[i] >= u:
                new_k = i
                break
        return new_k
    
    def calc_theta(self, n_mk, n_m):
        topic_alpha = self.alpha * self.K
        theta = [[0 for i in range(self.K)] for i in range(len(self.documents))]
        for m in range(len(self.documents)):
            for k in range(self.K):
                theta[m][k] = (n_mk[m][k] + self.alpha) / (n_m[m] + topic_alpha)
        return theta
    
    def calc_phi(self, n_kt, n_k):
        topic_beta = self.beta * len(self.word2id)
        phi = [[0 for i in range(len(self.word2id))] for i in range(self.K)]
        for k in range(self.K):
            for t in range(len(self.word2id)):
                phi[k][t] = (n_kt[k][t] + self.beta) / (n_k[k] + topic_beta)
        return phi
    
    def lda(self, n_mk, n_m, n_kt, n_k, times):
        print('run lda')
        M = len(self.documents)
        for epoch in range(times):
            #print('epoch %d' % epoch)
            for m in range(M):
                for i in range(len(self.documents[m])):
                    k = self.z[m][i]
                    word = self.documents[m][i]
                    t = word2id[word]
                    n_mk[m][k] -= 1
                    n_m[m] -= 1
                    n_kt[k][t] -= 1
                    n_k[k] -= 1

                    new_k = self.reSampling(m, t, n_mk, n_m, n_kt, n_k)
                    n_mk[m][new_k] += 1
                    n_m[m] += 1
                    n_kt[new_k][t] += 1
                    n_k[new_k] += 1
                    self.z[m][i] = new_k
                
        theta = self.calc_theta(n_mk, n_m)
        phi = self.calc_phi(n_kt, n_k)
        return theta, phi
    
    def run_lda(self, times):
        n_mk = [[0 for i in range(self.K)] for i in range(len(self.documents))]
        n_m = [0 for i in range(len(self.documents))]
        n_kt = [[0 for i in range(len(words))] for i in range(len(self.documents))]
        n_k = [0 for i in range(self.K)]
        
        self.init_topic(n_mk, n_m, n_kt, n_k)
        theta,phi = self.lda(n_mk, n_m, n_kt, n_k, times)
        return theta,phi
    
    def get_topic_N(self,phi,N):
        topics_dic = {}
        for k in range(len(phi)):
            dic = {}
            for i in range(len(phi[k])):
                dic[i] = phi[k][i]
            word_prob_list = sorted(dic.items(), key = lambda d:d[1], reverse = True)
            topics_dic[k] = word_prob_list
            print('topic %d' % k, end='\t')
            for word_pro in word_prob_list[0:N]:
                print(self.id2word[word_pro[0]],word_pro[1], end=' ')
            print('\n')
        return topics_dic
    
    def get_doc_topic(self, theta, topics_dic):
        #doc_dic = {}
        for m in range(len(theta)):
            dic = {}
            for i in range(len(theta[m])):
                dic[i] = theta[m][i]
            topic_prob_list = sorted(dic.items(), key = lambda d:d[1], reverse=True)
            #doc_dic['doc ' + str(m)] = topic_prob_list
            print('doc %d ' % m, end = '\t')
            for top_prob in topic_prob_list[:1]:
                #print(topics_dic[top_prob[0]])
                for word_pro in topics_dic[top_prob[0]][0:15]:
                    print(self.id2word[word_pro[0]],word_pro[1], end=' ')
                print('')

In [4]:
lda = LDAModel(9, word2id, id2word, texts)
theta, phi = lda.run_lda(300)
top_dic = lda.get_topic_N(phi,50)
lda.get_doc_topic(theta, top_dic)

run lda
topic 0	办理 0.01986411176425681 宏碁 0.01375677532636079 时间 0.012229941216886784 市场 0.012229941216886784 审批 0.010703107107412778 美女 0.009176272997938772 努力 0.009176272997938772 建设 0.009176272997938772 海军陆战队 0.009176272997938772 监管局 0.009176272997938772 窗口 0.009176272997938772 中年 0.007649438888464768 女白领 0.007649438888464768 学习 0.007649438888464768 整编 0.007649438888464768 基地 0.007649438888464768 设施 0.007649438888464768 全球 0.007649438888464768 戴尔 0.007649438888464768 电脑 0.007649438888464768 办事 0.007649438888464768 正式 0.006122604778990762 转移 0.006122604778990762 大学 0.006122604778990762 选择 0.006122604778990762 告诉 0.006122604778990762 股票 0.006122604778990762 休息区 0.006122604778990762 未来 0.006122604778990762 MM 0.004595770669516756 面对 0.004595770669516756 小猪 0.004595770669516756 王香如 0.004595770669516756 担心 0.004595770669516756 发现 0.004595770669516756 美国 0.004595770669516756 全国 0.004595770669516756 竞争 0.004595770669516756 目标 0.004595770669516756 关系 0.004595770669516756 冲突 0.00459577066951

In [13]:
from gensim import corpora, models

In [14]:
dictionary = corpora.Dictionary(texts)

In [15]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [16]:
texts_tf_idf = models.TfidfModel(corpus)[corpus]
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=9,update_every=0,passes=20)
texts_lda = lda[texts_tf_idf]
res_dic = dict(lda.print_topics(num_topics=9, num_words=10))
print(res_dic)
print('-----------')
count = 0
for doc1 in texts_lda:
    #for doc in doc1:
    print('doc %d ' % count, res_dic[doc1[0][0]])
    count +=1
    print(count, '------------------')

{0: '0.001*"行政" + 0.001*"办理" + 0.001*"大厅" + 0.001*"业务" + 0.001*"工作" + 0.001*"受理" + 0.001*"中心" + 0.001*"审批" + 0.001*"国家" + 0.001*"窗口"', 1: '0.029*"行政" + 0.027*"办理" + 0.021*"工作" + 0.021*"受理" + 0.021*"大厅" + 0.018*"业务" + 0.016*"中心" + 0.014*"食品药品" + 0.014*"审批" + 0.012*"监管局"', 2: '0.028*"关岛" + 0.021*"海军陆战队" + 0.021*"建设" + 0.018*"设施" + 0.018*"整编" + 0.018*"基地" + 0.014*"日美军" + 0.011*"冲绳" + 0.011*"翻新" + 0.011*"预算"', 3: '0.015*"古力" + 0.011*"心理" + 0.011*"毛毛" + 0.008*"紧张" + 0.008*"美女" + 0.008*"家庭" + 0.007*"学习" + 0.007*"对局" + 0.007*"调整" + 0.007*"中年"', 4: '0.023*"桃花" + 0.019*"植物园" + 0.014*"周末" + 0.010*"开放" + 0.010*"游客" + 0.010*"天气" + 0.010*"玉兰" + 0.010*"湖边" + 0.010*"园内" + 0.005*"时间"', 5: '0.021*"免费" + 0.011*"裘园" + 0.011*"门票" + 0.007*"伦敦" + 0.007*"一张" + 0.007*"作家" + 0.007*"音乐剧" + 0.007*"奥斯汀" + 0.007*"剧场" + 0.007*"公共汽车"', 6: '0.001*"宏碁" + 0.001*"免费" + 0.001*"心理" + 0.001*"市场" + 0.001*"家庭" + 0.001*"女白领" + 0.001*"未来" + 0.001*"全球" + 0.001*"办理" + 0.001*"事业"', 7: '0.001*"古力" + 0.001*"毛毛" + 0.001*"美女" + 0.001