In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# The sampler below draws from NumPy's global RNG; seed it once so that
# "Restart Kernel -> Run All" reproduces the same topic assignments.
SEED = 0
np.random.seed(SEED)

# Training split of 20 Newsgroups with headers, footers and quoted replies
# stripped, so that only the message bodies are used.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

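Уменьшим размер словаря для лучшей сходимости: уберём английские стоп-слова и оставим только слова, которые встречаются не менее чем в 25 документах (`min_df=25`).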

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# ENGLISH_STOP_WORDS is part of the public sklearn.feature_extraction.text API.
# The old sklearn.feature_extraction.stop_words module was deprecated in
# scikit-learn 0.22 and removed in 0.24, so importing from it fails on
# current releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Binary bag-of-words: a feature is 1 if the word occurs in the document.
# min_df=25 drops words that appear in fewer than 25 documents.
vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, min_df=25)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(binary=True, min_df=25,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [4]:
# Number of distinct words kept in the fitted vocabulary
len(vectorizer.vocabulary_)

4983

Функция, которая сэмплирует индекс из дискретного распределения, заданного ненормированными весами

In [5]:
def generate_with_weight(weights):
    """Draw a random index with probability proportional to ``weights``.

    Parameters
    ----------
    weights : array-like of float
        Non-negative, unnormalised weights; ``weights[i]`` is the relative
        chance of returning ``i``. Entries equal to zero are never selected.

    Returns
    -------
    int
        An index in ``range(len(weights))``.

    Raises
    ------
    ValueError
        If ``weights`` is empty, contains a negative value, or its sum is
        not a positive finite number (all zeros, NaN, inf).
    """
    weights = np.asarray(weights, dtype=float)
    if weights.size == 0:
        raise ValueError("weights must not be empty")

    cumulative = np.cumsum(weights)
    total = cumulative[-1]
    # NaN/inf entries propagate into `total`, so one check covers them.
    if not np.isfinite(total) or total <= 0 or weights.min() < 0:
        raise ValueError("weights must be non-negative with a positive, finite sum")

    # Scale the uniform draw instead of normalising the weights: one
    # multiplication instead of a full division pass.
    draw = np.random.rand() * total

    # First position whose cumulative weight is strictly greater than the
    # draw. side='right' skips zero-weight entries that tie with the draw.
    index = int(np.searchsorted(cumulative, draw, side='right'))
    if index >= weights.size:
        # Only reachable through floating-point rounding (draw == total);
        # fall back to the last entry that carries positive weight.
        index = int(np.searchsorted(cumulative, total, side='left'))
    return index

In [7]:
# Sizes are derived from the data rather than hard-coded (the original
# literal 20 silently assumed the number of newsgroups).
n_topics = len(newsgroups_train.target_names)
n_words = len(vectorizer.vocabulary_)
n_docs = len(newsgroups_train.data)

# z: topic currently assigned to each vocabulary word, initialised uniformly
# at random over the topics.
word_to_topic = np.random.randint(n_topics, size=n_words)
num_topic = np.zeros(n_topics)                  # counter n_k
num_topic_word = np.zeros((n_topics, n_words))  # counter n_{k,w}
num_text_topic = np.zeros((n_docs, n_topics))   # counter n_{d,k}
alpha = np.zeros(n_topics)                      # per-topic term for the document side
beta = np.zeros((n_topics, n_words))            # per-topic, per-word term for the word side

In [8]:
# Initialise the counters from the corpus.
# Reset first so that re-running this cell does not double-count.
for counter in (alpha, beta, num_topic, num_topic_word, num_text_topic):
    counter.fill(0)

# Vectorise the whole corpus once instead of transforming every document
# three times inside the loop.
doc_term = vectorizer.transform(newsgroups_train.data).tocsr()

for doc_id in range(doc_term.shape[0]):
    label = newsgroups_train.target[doc_id]
    word_ids = doc_term[doc_id].indices      # vocabulary indices present in this document
    topics = word_to_topic[word_ids]         # current topic of each of those words

    # alpha counts documents per newsgroup label; beta counts, per label,
    # the documents that contain each word (the matrix is binary).
    alpha[label] += 1
    beta[label, word_ids] += 1

    # n_{d,k} and n_k: how many words of this document sit in each topic.
    per_topic = np.bincount(topics, minlength=len(num_topic))
    num_text_topic[doc_id] = per_topic
    num_topic += per_topic

    # n_{k,w}: word indices are unique within a document, so every
    # (topic, word) pair occurs once and plain fancy-index += is safe.
    num_topic_word[topics, word_ids] += 1

In [9]:
N_SWEEPS = 50   # number of full passes over the corpus, for stability

# Loop-invariant work hoisted out of the sweeps: the document-term matrix
# and the per-topic totals of beta never change while sampling.
doc_term = vectorizer.transform(newsgroups_train.data).tocsr()
doc_word_ids = np.split(doc_term.indices, doc_term.indptr[1:-1])   # word indices per document
beta_topic_totals = beta.sum(axis=1)

# NOTE(review): the topic assignment is stored per vocabulary word
# (word_to_topic), not per (document, word) occurrence. Reassigning a word in
# one document therefore leaves the counters of other documents containing
# it out of sync, and counters can go negative; np.abs below masks that.
# A per-occurrence assignment would be the standard formulation - confirm
# whether the simplification is intentional.
for count in range(N_SWEEPS):
    for i, word_ids in enumerate(doc_word_ids):
        for word in word_ids:
            # Remove the word's current assignment from the counters.
            topic = word_to_topic[word]
            num_text_topic[i, topic] -= 1
            num_topic[topic] -= 1
            num_topic_word[topic, word] -= 1

            # Unnormalised conditional over all topics, computed at once.
            p = ((num_text_topic[i] + alpha)
                 * (num_topic_word[:, word] + beta[:, word])
                 / (num_topic + beta_topic_totals))

            # Sample the new topic and add it back to the counters.
            topic = generate_with_weight(np.abs(p))
            word_to_topic[word] = topic
            num_text_topic[i, topic] += 1
            num_topic[topic] += 1
            num_topic_word[topic, word] += 1

In [12]:
# Top-10 words for every topic.
# Topics are labelled by their 1-based index rather than by newsgroup name.
inverse_dict = {index: term for term, index in vectorizer.vocabulary_.items()}
for topic_id in range(len(newsgroups_train.target_names)):
    print('    Top 10 words in the Topic = '+str(topic_id+1))
    print()
    ranked = np.argsort(num_topic_word[topic_id])             # word indices, ascending by count
    assigned = ranked[word_to_topic[ranked] == topic_id]      # keep words currently assigned to this topic
    top_words = assigned[:-11:-1]                             # last ten, highest count first
    for word_id in top_words:
        print(inverse_dict.get(word_id), end = ' ')
    print()
    print('\n\n')

    Top 10 words in the Topic = 1

long thing mail great god ago means comes man tried 



    Top 10 words in the Topic = 2

place example today including message major disk early size thinking 



    Top 10 words in the Topic = 3

think got old second following run computer large source told 



    Top 10 words in the Topic = 4

fact lot list note running deal started body bible process 



    Top 10 words in the Topic = 5

little trying works change short went involved considered purpose screen 



    Top 10 words in the Topic = 6

does work year looking times news similar wouldn evidence rights 



    Top 10 words in the Topic = 7

yes doing getting kind 30 file article cost national level 



    Top 10 words in the Topic = 8

come day non cause certain past fast friend half 21 



    Top 10 words in the Topic = 9

people make ll mean problems small instead matter simple especially 



    Top 10 words in the Topic = 10

problem possible idea software experience offer box va

In [13]:
# Original topics: the newsgroup names provided with the dataset
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

Мы видим, что некоторые исходные топики можно соотнести с темами, найденными алгоритмом.
Например:
Topic = 12 - 'rec.autos'
Topic = 10 - 'comp.os.ms-windows.misc'
Topic = 13 - 'comp.windows.x'
Topic = 19 - 'talk.politics.mideast'
Topic = 3 - 'sci.electronics'
Topic = 7 - 'talk.politics.misc'
Topic = 15 - 'sci.med'
Topic = 4 - 'talk.religion.misc'
Topic = 18 - 'alt.atheism'
Topic = 2 - 'comp.sys.ibm.pc.hardware'
Topic = 5 - 'comp.graphics'
Однако разбиение получается лишь частичным, что говорит о необходимости большего числа итераций.