### ptm参考：https://github.com/dongwookim-ml/python-topic-model/tree/master/ptm

### LDA

In [None]:
import logging
import operator
import os
import random
import time
from collections import defaultdict

import numpy as np
from ptm import GibbsLDA
from ptm import vbLDA
from ptm.nltk_corpus import get_reuters_ids_cnt
from ptm.utils import convert_cnt_to_list, get_top_words

Loading the Reuters corpus from NLTK

In [None]:
# Number of documents requested from the Reuters loader.
n_doc = 1000
# The loader returns three values, unpacked as the vocabulary, per-document
# word ids and per-document counts (presumably aligned lists — confirm in ptm).
voca, doc_ids, doc_cnt = get_reuters_ids_cnt(num_doc=n_doc, max_voca=10000)
# Convert the (ids, counts) pair into the `docs` form passed to GibbsLDA.fit below.
docs = convert_cnt_to_list(doc_ids, doc_cnt)
n_voca = len(voca)
print('Vocabulary size:%d' % n_voca)

Inference through Gibbs sampling

In [None]:
# Iteration budget and topic count; both are reused by the variational run in the next cell.
max_iter=100
n_topic=10

# Stop records of the 'GibbsLDA' logger from propagating to ancestor loggers.
# NOTE(review): presumably this is the logger name ptm uses for GibbsLDA — confirm.
logger = logging.getLogger('GibbsLDA')
logger.propagate = False

# len(voca) is the same value as n_voca computed in the loading cell.
model = GibbsLDA(n_doc, len(voca), n_topic)
model.fit(docs, max_iter=max_iter)

Inference through variational Bayes

In [None]:
# Stop records of the 'vbLDA' logger from propagating to ancestor loggers.
logger = logging.getLogger('vbLDA')
logger.propagate = False

# Unlike the Gibbs model above, fit() here receives the (ids, counts) pair directly
# rather than the converted `docs` list.
vbmodel = vbLDA(n_doc, n_voca, n_topic)
vbmodel.fit(doc_ids, doc_cnt, max_iter=max_iter)

Print top 10 probability words for each topic

In [None]:
# For every topic index, ask get_top_words for 10 words based on the variational
# model's `_lambda` attribute and print them comma-separated.
for ti in range(n_topic):
    top_words = get_top_words(vbmodel._lambda, voca, ti, n_words=10)
    print('Topic', ti ,': ', ','.join(top_words))

### HMM-LDA

In [None]:
import logging
from ptm.nltk_corpus import get_reuters_token_list_by_sentence
from ptm import HMM_LDA
from ptm.utils import get_top_words

# Stop records of the 'HMM_LDA' logger from propagating to ancestor loggers.
logger = logging.getLogger('HMM_LDA')
logger.propagate=False

# Requested document count; the training cell later replaces n_docs with len(corpus).
n_docs = 1000
# Rebinds `voca` (previously the LDA vocabulary) and creates `corpus`.
voca, corpus = get_reuters_token_list_by_sentence(num_doc=n_docs)
print('Vocabulary size', len(voca))

Training HMM LDA

In [None]:
# Use the number of documents actually loaded rather than the requested 1000.
n_docs = len(corpus)
n_voca = len(voca)
n_topic = 50
n_class = 20
max_iter = 100
# Hyperparameters forwarded by keyword to HMM_LDA; their exact roles are defined
# inside ptm and are not visible in this notebook.
alpha = 0.1
beta = 0.01
gamma = 0.1
eta = 0.1
# Rebinds `model` (previously the GibbsLDA instance).
model = HMM_LDA(n_docs, n_voca, n_topic, n_class, alpha=alpha, beta=beta, gamma=gamma, eta=eta, verbose=False)
model.fit(corpus, max_iter=max_iter)

Print Top 10 words for each class and topic

In [None]:
def _show_top_words(label, weights, indices):
    """Print the 10 top words for each index in `indices`, one line per index."""
    for idx in indices:
        words = get_top_words(weights, voca, idx, n_words=10)
        print(label, idx, ': ', ','.join(words))


# Topics are listed from index 0; classes are listed from index 1 (index 0 is skipped,
# exactly as in the original loop bounds).
_show_top_words('Topic', model.TW, range(n_topic))
_show_top_words('Class', model.CW, range(1, n_class))

### AuthorTopicModel

In [None]:
from ptm import AuthorTopicModel

# Stop records of the 'AuthorTopicModel' logger from propagating to ancestor loggers.
logger = logging.getLogger('AuthorTopicModel')
logger.propagate=False

Load CORA dataset: https://people.cs.umass.edu/~mccallum/data.html

In [None]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code; only load these files if they come
# from a trusted copy of the CORA data set.
doc_ids = _load_pickle('../data/cora/doc_ids.pkl')
doc_cnt = _load_pickle('../data/cora/doc_cnt.pkl')
doc_author = _load_pickle('../data/cora/doc_authorid.pkl')
author_name = _load_pickle('../data/cora/authorid_authorname.pkl')
voca = _load_pickle('../data/cora/voca.pkl')

# Sizes are derived from the loaded data; these names are reused by the following cells.
corpus = convert_cnt_to_list(doc_ids, doc_cnt)
n_doc = len(corpus)
n_topic = 10
n_author = len(author_name)
n_voca = len(voca)
max_iter = 50

Fit author-topic model

In [None]:
# Rebinds `model`; fit() receives the token-list corpus together with the
# per-document author data loaded from doc_authorid.pkl.
model = AuthorTopicModel(n_doc, n_voca, n_topic, n_author)
model.fit(corpus, doc_author, max_iter=max_iter)

Print top 10 words for each topic

In [None]:
# Print 10 words per topic based on the fitted model's `TW` attribute.
# The trailing 10 is passed positionally here; other cells pass it as n_words=10.
for k in range(n_topic):
    top_words = get_top_words(model.TW, voca, k, 10)
    print('topic ', k , ','.join(top_words))

### Relational Topic Model

In [None]:
from ptm import RelationalTopicModel

# Note the argument order: the topic count comes first here, unlike the other ptm models above.
model = RelationalTopicModel(n_topic, n_doc, n_voca, verbose=True)
# NOTE(review): `doc_links` is not defined anywhere in this notebook — it must be
# loaded (presumably document-to-document link data for CORA) before this cell runs.
model.fit(doc_ids, doc_cnt, doc_links, max_iter=max_iter)

# Print 10 words per topic based on the fitted model's `beta` attribute.
for k in range(n_topic):
    top_words = get_top_words(model.beta, voca, k, 10)
    print('Topic', k, ':', ','.join(top_words))

### Supervised Topic Model

In [None]:
from ptm import GibbsSupervisedLDA

n_topic = 50
# Value passed to GibbsSupervisedLDA as `sigma`.
r_var = 0.01

model = GibbsSupervisedLDA(n_doc, n_voca, n_topic, sigma=r_var)
# NOTE(review): `ratings` is not defined anywhere in this notebook — presumably one
# response value per document in `corpus`; it must be provided before this cell runs.
model.fit(corpus, ratings)

# Visit topics in the order given by model.eta.argsort()
# (ascending eta if eta is a NumPy array — confirm).
for ti in model.eta.argsort():
    top_words = get_top_words(model.TW, voca, ti, n_words=10)
    print('Eta', model.eta[ti] ,'Topic', ti ,':\t', ','.join(top_words))

#### 以下参考 https://mp.weixin.qq.com/s/PMWAyytQjSGspAxXKsDx6w

### Biterm

In [None]:
class BTM(object):
    """Biterm Topic Model fitted with collapsed Gibbs sampling.

    The corpus is read from ``data_path`` (one whitespace-tokenised document
    per line). Every pair of distinct words inside a document forms a biterm,
    each biterm is assigned a topic, and the assignments are resampled for
    ``num_iter`` sweeps. Results are written as text files to ``output_dir``.

    Args:
        data_path: path of the input corpus file.
        alpha: smoothing added to the per-topic biterm counts.
        beta: smoothing added to the per-topic word counts.
        num_iter: number of Gibbs sweeps over all biterms.
        num_topic: number of topics.
        output_dir: directory that receives the ``model-final-*.txt`` files.
    """

    # A biterm (word1, word2) with word1 < word2 is packed into a single int as
    # word1 * _BITERM_BASE + word2, so word ids must stay below this base.
    _BITERM_BASE = 1000000

    def __init__(self, data_path, alpha, beta, num_iter, num_topic, output_dir):
        self.data_path = data_path
        self.alpha = alpha
        self.beta = beta
        self.num_iter = num_iter
        self.num_topic = num_topic
        self.output_dir = output_dir

        self.word2Id = {}
        self.Id2Word = {}
        self.vocab_size = 0

        self.wordId_corpus = []  # documents as lists of word ids

        self.biterms_in_doc = []  # per document: {encoded biterm: occurrences}
        self.num_doc_biterm = defaultdict(int)  # replaced by a list in load_data
        self.biterms = []  # every biterm occurrence, encoded

        self.topic_biterm = []  # topic currently assigned to each biterm occurrence
        self.topic_word_num = []  # becomes {word id: {topic id: count}} in init_model
        self.num_topic_biterm = []  # number of biterms assigned to each topic

        self.biterm_sum = {}  # cache: encoded biterm -> normaliser over topics

    def get_file_reader(self, path=None):
        """Open ``path`` (default: ``self.data_path``) for reading and return the file."""
        if path is None:
            path = self.data_path
        return open(path, 'r')

    def get_file_writer(self, path, append=False):
        """Open ``path`` inside ``output_dir`` for writing (or appending) and return the file."""
        if self.output_dir:
            os.makedirs(self.output_dir, exist_ok=True)
        mode = 'a' if append else 'w'
        return open(os.path.join(self.output_dir, path), mode)

    def print_params(self):
        """Print the hyperparameters and the current count tables."""
        params = ['alpha', 'beta', 'num_iter', 'num_topic', 'topic_word_num', 'num_topic_biterm', 'topic_biterm']
        for param in params:
            print(param, ':', getattr(self, param))
            print('-' * 40)

    def _split_biterm(self, biterm):
        """Return the ``(word1, word2)`` ids packed into an encoded biterm."""
        return divmod(biterm, self._BITERM_BASE)

    def _topic_weight(self, word1, word2, topic_id):
        """Return the unnormalised weight of ``topic_id`` for the biterm ``(word1, word2)``."""
        n_z = self.num_topic_biterm[topic_id]
        denom = (2 * n_z) + (self.vocab_size * self.beta)
        return ((n_z + self.alpha)
                * (self.topic_word_num[word1][topic_id] + self.beta)
                * (self.topic_word_num[word2][topic_id] + self.beta)
                / denom ** 2)

    def load_data(self):
        """Read the corpus, assigning word ids in order of first appearance."""
        with self.get_file_reader() as reader:
            for line in reader:
                curr_doc = []
                for word in line.split():
                    if word not in self.word2Id:
                        index = len(self.word2Id)
                        self.word2Id[word] = index
                        self.Id2Word[index] = word
                    curr_doc.append(self.word2Id[word])
                self.wordId_corpus.append(curr_doc)

        self.num_doc_biterm = [0] * len(self.wordId_corpus)

    def init_model(self):
        """Extract all biterms and give each one a uniformly random initial topic.

        Raises:
            ValueError: if the vocabulary is too large for the biterm encoding.
        """
        self.vocab_size = len(self.word2Id)
        if self.vocab_size > self._BITERM_BASE:
            raise ValueError('vocabulary size %d exceeds the biterm encoding limit %d'
                             % (self.vocab_size, self._BITERM_BASE))

        # Start from a clean state so that calling init_model twice does not duplicate biterms.
        self.biterms = []
        self.biterms_in_doc = []
        self.num_doc_biterm = [0] * len(self.wordId_corpus)
        self.biterm_sum = {}

        for doc_number, doc in enumerate(self.wordId_corpus):
            doc_counts = defaultdict(int)
            for word1 in doc:
                for word2 in doc:
                    # Keeping only word1 < word2 yields each pair of positions holding
                    # distinct words exactly once, in canonical order.
                    if word1 < word2:
                        biterm = word1 * self._BITERM_BASE + word2
                        doc_counts[biterm] += 1
                        self.biterms.append(biterm)
                        self.num_doc_biterm[doc_number] += 1
            self.biterms_in_doc.append(doc_counts)

        self.topic_biterm = [0] * len(self.biterms)
        self.topic_word_num = {j: {i: 0 for i in range(self.num_topic)} for j in range(self.vocab_size)}
        print(self.vocab_size, self.num_topic)
        # Counts start at zero and are incremented below, so they always equal the number
        # of biterms currently assigned to each topic (and never go negative while sampling).
        self.num_topic_biterm = [0] * self.num_topic

        for biterm_index, biterm in enumerate(self.biterms):
            topic_id = random.randint(0, self.num_topic - 1)
            word1, word2 = self._split_biterm(biterm)
            self.topic_word_num[word1][topic_id] += 1
            self.topic_word_num[word2][topic_id] += 1
            self.num_topic_biterm[topic_id] += 1
            self.topic_biterm[biterm_index] = topic_id

    def save_topic_words(self, topic_word_num=10):
        """Write the ``topic_word_num`` highest-scoring words of every topic."""
        with self.get_file_writer(path='model-final-topic-words.txt') as writer:
            for topic_id in range(self.num_topic):
                n_z = self.num_topic_biterm[topic_id]
                topic_line = {}
                for word_id in self.Id2Word:
                    # A topic without any biterm has no word evidence: score 0 instead of dividing by zero.
                    topic_line[word_id] = (self.topic_word_num[word_id][topic_id] / n_z / 2) if n_z else 0.0
                # Highest score first, so the slice below really holds the top words.
                sorted_topic_line = sorted(topic_line.items(), key=operator.itemgetter(1), reverse=True)
                writer.write("Topic:" + str(topic_id) + '\n')
                for topic_word, score in sorted_topic_line[:topic_word_num]:
                    writer.write("\t" + str(self.Id2Word[topic_word]) + "\t" + str(score) + '\n')

    def save_wordIds(self):
        """Write one ``word id`` pair per line."""
        with self.get_file_writer(path='model-final-wordIds.txt') as writer:
            for key, value in self.word2Id.items():
                writer.write(str(key) + ' ' + str(value) + '\n')

    def get_sum(self, biterm):
        """Return the sum of topic weights for ``biterm``, cached until the counts change."""
        if biterm not in self.biterm_sum:
            word1, word2 = self._split_biterm(biterm)
            total = 0
            for topic_id in range(self.num_topic):
                total += self._topic_weight(word1, word2, topic_id)
            self.biterm_sum[biterm] = total
        return self.biterm_sum[biterm]

    def save_theta(self):
        """Write one line per document holding its topic proportions."""
        with self.get_file_writer(path='model-final-theta.txt') as writer:
            for doc_index, line in enumerate(self.biterms_in_doc):
                for topic_id in range(self.num_topic):
                    one_sum = 0
                    for key in line:
                        word1, word2 = self._split_biterm(key)
                        # (share of this biterm in the document) * (topic weight / normaliser)
                        one_sum += ((line[key] / self.num_doc_biterm[doc_index])
                                    * self._topic_weight(word1, word2, topic_id)
                                    / self.get_sum(key))
                    writer.write(str(one_sum) + " ")
                writer.write('\n')

    def save_phi(self):
        """Write one line per topic holding the smoothed word probabilities."""
        with self.get_file_writer(path='model-final-phi.txt') as writer:
            for topic_id in range(self.num_topic):
                for word_id in self.Id2Word:
                    calculation = (self.topic_word_num[word_id][topic_id] + self.beta) / ((self.num_topic_biterm[topic_id] * 2) + (self.vocab_size * self.beta))
                    writer.write(str(calculation) + ' ')
                writer.write('\n')

    def build_model(self):
        """Run ``num_iter`` Gibbs sweeps, resampling the topic of every biterm."""
        # Sampling changes the counts, so previously cached normalisers are no longer valid.
        self.biterm_sum = {}
        for it in range(self.num_iter):
            start_time = time.time()
            for biterm_index, old_topic_id in enumerate(self.topic_biterm):
                word1, word2 = self._split_biterm(self.biterms[biterm_index])

                # Take the biterm out of the counts before computing its conditional.
                self.topic_word_num[word1][old_topic_id] -= 1
                self.topic_word_num[word2][old_topic_id] -= 1
                self.num_topic_biterm[old_topic_id] -= 1

                # Cumulative weights for inverse-CDF sampling.
                cumulative = []
                running = 0.0
                for k in range(self.num_topic):
                    running += self._topic_weight(word1, word2, k)
                    cumulative.append(running)

                u = random.random() * running
                # Fall back to the last topic if rounding prevents any bound from exceeding u.
                new_topic_id = self.num_topic - 1
                for k, bound in enumerate(cumulative):
                    if u < bound:
                        new_topic_id = k
                        break

                self.topic_word_num[word1][new_topic_id] += 1
                self.topic_word_num[word2][new_topic_id] += 1
                self.num_topic_biterm[new_topic_id] += 1

                self.topic_biterm[biterm_index] = new_topic_id

            print('Finished iteration:', it, 'Time taken:' + str(time.time() - start_time))

    def save_result(self):
        """Write all result files (top words, theta, word ids, phi)."""
        self.save_topic_words(20)
        self.save_theta()
        self.save_wordIds()
        self.save_phi()

    def run(self):
        """Load the corpus, initialise, train and save the results."""
        self.load_data()
        self.init_model()
        self.build_model()
        self.save_result()
        
# run() loads the data, initialises, trains and already ends with save_result(),
# so no separate save_result() call is needed afterwards.
btm = BTM(data_path='../Data/sample-data.txt',alpha=2,beta=0.001, num_iter=100, num_topic=50, output_dir='.')
btm.run()

version2

In [None]:
from itertools import combinations
from numpy import fromiter, zeros
from numpy.random import choice, randint
from collections import defaultdict, OrderedDict, Counter
from operator import itemgetter

from gensim.utils import simple_preprocess

class BitermModel:
    """
    Biterm model for small documents.

    The model is fitted inside the constructor: biterms are extracted from
    ``text`` and ``niter`` Gibbs sweeps are run immediately.

    Parameters are:
        text: a list of lists where each element is the tokens of a document
            gensim.utils.simple_tokenize is a good choice
            Note: best results will come from removing stopwords as well
        ntopics: the number of topics to infer
        alpha: dirichlet prior hyperparameter for topic distribution
        beta: dirichlet prior hyperparameter for word distribution
        niter: number of gibbs sampling steps

    Attributes set by the constructor:
        biterms: list of (word, word) tuples, each sorted alphabetically
        nwords: total number of tokens in ``text``
        vocab: frozenset of the distinct tokens
        topics: array of ``ntopics`` global topic probabilities
        topic_words: {topic: {word: probability}}
    """

    def __init__(self, text, ntopics=5, alpha=0.001, beta=0.001, niter=1):
        self.ntopics = ntopics
        self.alpha = alpha
        self.beta = beta
        self.text = text
        self.biterms, self.nwords, self.vocab = self._fit_corpus(text)
        self.topics, self.topic_words = self._gibbs_sample(niter)

    def _flatten(self, l):
        """Concatenate a list of lists into one flat list."""
        return [item for sublist in l for item in sublist]

    def _normalize(self, v):
        """Scale the array ``v`` so that its entries sum to one."""
        c = v.sum()
        return v / c

    def _ngrams(self, sequence, n):
        """Yield consecutive length-``n`` windows of ``sequence`` as tuples."""
        return zip(*[sequence[i:] for i in range(n)])

    def _skipgrams(self, sequence, n, k):
        """Return the ``n``-grams of ``sequence`` that skip at most ``k`` tokens."""
        grams = []
        # Padding with None lets the final tokens still act as heads; tails ending in
        # the padding value are discarded below.
        for ngram in self._ngrams(sequence + [None]*k, n + k):
            head = ngram[:1]
            tail = ngram[1:]
            for skip_tail in combinations(tail, n - 1):
                if skip_tail[-1] is None:
                    continue
                grams.append(head + skip_tail)
        return grams

    def _fit_corpus(self, text):
        """Return ``(biterms, token count, vocabulary)`` for the documents in ``text``."""
        biterms = []
        for doc in text:
            for i, j in self._skipgrams(doc, 2, 1):
                if i == j:
                    continue  # a word paired with itself is not a biterm
                biterms.append((i, j) if i < j else (j, i))

        nwords = sum(len(doc) for doc in text)
        vocab = frozenset(self._flatten(text))

        return biterms, nwords, vocab

    def _gibbs_sample(self, niter):
        """Run ``niter`` Gibbs sweeps and return ``(topic distribution, topic-word table)``."""
        a = self.alpha
        b = self.beta
        K = self.ntopics
        vocab = self.vocab
        # The word-distribution prior is spread over the vocabulary, so the smoothing
        # mass is (vocabulary size * beta). Using the token count here instead would
        # make each topic's word probabilities sum to less than one.
        smoothing = len(vocab) * b

        topic_counts = zeros(K)  # biterms per topic
        word_topic_counts = {word: zeros(K) for word in vocab}  # word occurrences per topic
        assignments = []
        for wi, wj in self.biterms:
            z_init = randint(K)
            assignments.append(z_init)
            topic_counts[z_init] += 1
            word_topic_counts[wi][z_init] += 1
            word_topic_counts[wj][z_init] += 1

        for _ in range(niter):
            for idx, (wi, wj) in enumerate(self.biterms):
                z_old = assignments[idx]
                # Remove the biterm from the counts before computing its conditional.
                topic_counts[z_old] -= 1
                word_topic_counts[wi][z_old] -= 1
                word_topic_counts[wj][z_old] -= 1

                base = 2 * topic_counts + smoothing
                weights = ((topic_counts + a)
                           * (word_topic_counts[wi] + b)
                           * (word_topic_counts[wj] + b)
                           / ((base + 1) * base))
                z_new = choice(K, p=self._normalize(weights))

                topic_counts[z_new] += 1
                word_topic_counts[wi][z_new] += 1
                word_topic_counts[wj][z_new] += 1
                assignments[idx] = z_new

        topic_words = {
            z: {word: float((word_topic_counts[word][z] + b) / (2 * topic_counts[z] + smoothing))
                for word in vocab}
            for z in range(K)
        }
        topic_dist = (topic_counts + a) / (topic_counts.sum() + K * a)

        return topic_dist, topic_words

    def get_topics(self):
        """
        the global topic distribution
        returns a list of K topic probabilities
        """
        return self.topics

    def get_topic_words(self, n=5):
        """
        the word distributions per topic
        returns an ordered dict with
        the top n most probable words of the topic
        """
        sort_words = []
        for z in range(self.ntopics):
            word_prob = sorted(self.topic_words[z].items(),
                               key=itemgetter(1), reverse=True)
            sort_words.append(word_prob)

        topn = OrderedDict((z, dict(t[:n])) for z, t in enumerate(sort_words))
        return topn

    def infer_documents(self):
        """
        the topic distribution of every document in the training text
        returns a dict mapping the document index to an array of K topic
        probabilities (all zeros for a document without biterms)
        """

        def p_zkbi(z, wi, wj):
            p_z = self.topics[z]
            p_wiz = self.topic_words[z][wi]
            p_wjz = self.topic_words[z][wj]
            return p_z * p_wiz * p_wjz

        doc_probs = {}
        for d, doc in enumerate(self.text):
            doc_biterms, _, _ = self._fit_corpus([doc])

            # P(b|d) is the relative frequency of each DISTINCT biterm. Iterating over
            # every occurrence instead would weight a biterm by its squared count.
            bcounts = Counter(doc_biterms)
            distinct = list(bcounts)
            nb = fromiter((bcounts[bt] for bt in distinct),
                          float, len(distinct))
            p_bd = self._normalize(nb)

            p_zb = zeros((self.ntopics, len(distinct)))
            for i, (wi, wj) in enumerate(distinct):
                zb = fromiter((p_zkbi(z, wi, wj) for z in range(self.ntopics)),
                              float, self.ntopics)
                p_zb[:, i] = self._normalize(zb)

            doc_probs[d] = p_zb.dot(p_bd)

        return doc_probs
    
if __name__ == '__main__':
    # One lower-cased document per line of the sample file.
    with open('sampletext.txt', encoding='utf-8') as f:
        raw = f.read()
    text = raw.lower().splitlines()

    # One stop word per line.
    with open('stopwords.txt') as s:
        stop = frozenset(s.read().splitlines())

    def clean(txt):
        """Tokenise one document with simple_preprocess and drop the stop words."""
        return [tok for tok in simple_preprocess(txt) if tok not in stop]

    preprocessed = list(map(clean, text))
    topic = BitermModel(preprocessed, ntopics=10, niter=20)

### Topic Modeling with Minimal Domain Knowledge

In [None]:
# Vocabulary list
# NOTE(review): `text_pred` is not defined in this notebook — presumably a fitted
# text vectorizer exposing get_feature_names(); confirm and define it before this cell.
words = list(np.asarray(text_pred.get_feature_names()))

# Anchor words: keyword lists for four prior topics about cars —
# fuel consumption, exterior styling, noise and interior space
anchor_words = [['油耗','省油'],
['外观','外形','颜值','线条','前脸','时尚','造型','流畅'],
['噪音','胎噪','噪音控制','隔音'],
['空间','座位','拥挤']]


# Train a topic model that incorporates the prior knowledge
# NOTE(review): neither `tp` nor `X_pro` is defined in this notebook; `tp.Coret` may be
# a typo for the `Corex` class of the corextopic package — confirm before running.
topic_model = tp.Coret(
                        n_hidden=20 ,
                        max_iter=100000,
                        verbose=0,
                        count='fraction',
                        seed=2019
                      )

topic_model.fit(X_pro , # input is the sparse term representation
               words=words,
               anchors = anchor_words,
               anchor_strength=10  # anchor strength: the larger the value, the more the trained topics are influenced by the anchor words
              )

### Dynamic Topic Models

https://github.com/derekgreene/dynamic-nmf

### Embedded Topic Model

https://github.com/adjidieng/ETM

### LDA2VEC

https://github.com/meereeum/lda2vec-tf