In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# Training split of the 20 Newsgroups corpus, fetched with headers, footers
# and quoted text removed from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

Уменьшим размер словаря для лучшей сходимости

In [3]:
# `sklearn.feature_extraction.stop_words` was deprecated in scikit-learn 0.22 and
# removed in 0.24; the public home of ENGLISH_STOP_WORDS is the `text` module.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer

# Binary bag-of-words: each entry records whether a word occurs in a document.
# stop_words='english' selects the built-in ENGLISH_STOP_WORDS list; the string
# form is accepted by every scikit-learn version, whereas newer releases reject
# a frozenset for this parameter.
# min_df=25 drops words that occur in fewer than 25 documents, which keeps the
# vocabulary small.
vectorizer = CountVectorizer(lowercase=True, stop_words='english',
                             analyzer='word', binary=True, min_df=25)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(binary=True, min_df=25,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [10]:
# Number of distinct words the fitted vectorizer kept in its vocabulary.
len(vectorizer.vocabulary_)

4983

Сэмплируем из данного распределения

In [4]:
def generate_with_weight(weights):
    """Draw one index at random with probability proportional to its weight.

    Parameters
    ----------
    weights : array-like of float
        One-dimensional, non-negative weights; they do not have to sum to 1.

    Returns
    -------
    int
        Position in ``weights`` of the sampled element.

    Raises
    ------
    ValueError
        If ``weights`` is empty, contains a negative value, or its sum is not
        a finite positive number.
    """
    weights = np.asarray(weights, dtype=float)
    if weights.size == 0:
        raise ValueError("weights must not be empty")
    if np.any(weights < 0):
        raise ValueError("weights must be non-negative")
    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError("weights must have a finite, positive sum")

    # Walk the weights in ascending order (computed once) and build the
    # cumulative distribution over that ordering.
    order = np.argsort(weights)
    cumulative = np.cumsum(weights[order] / total)

    rand = np.random.rand()
    # First position whose cumulative bound is strictly greater than `rand`.
    position = np.searchsorted(cumulative, rand, side='right')
    # Rounding can leave cumulative[-1] slightly below 1, so `rand` may exceed
    # every bound; clamp to the last bucket instead of returning a non-index.
    position = min(position, weights.size - 1)
    return int(order[position])

In [5]:
# Problem sizes, derived from the data instead of being hard-coded.
n_topics = len(newsgroups_train.target_names)   # one topic per newsgroup
n_words = len(vectorizer.vocabulary_)
n_docs = len(newsgroups_train.data)

word_to_topic = np.zeros(n_words, dtype=int)     # z: topic currently assigned to each vocabulary word
num_topic = np.zeros(n_topics)                   # counter n_k
num_topic_word = np.zeros((n_topics, n_words))   # counter n_{k,w}
num_text_topic = np.zeros((n_docs, n_topics))    # counter n_{d,k}
alpha = np.zeros(n_topics)                       # topic weights over texts
beta = np.zeros((n_topics, n_words))             # topic weights over words

# Start from a uniformly random topic for every word. The weight vector is the
# same for every draw, so build it once.
uniform_weights = np.full(n_topics, 1 / n_topics)
for i in range(n_words):
    word_to_topic[i] = generate_with_weight(uniform_weights)

In [6]:
# Update the counters
# Zero the accumulators first so that re-running this cell does not double-count
# (num_text_topic is assigned, not accumulated, so it needs no reset).
for counter in (alpha, beta, num_topic, num_topic_word):
    counter.fill(0)

# Vectorize the whole corpus once; row i of the CSR matrix holds the vocabulary
# indices of the words present in document i.
doc_term = vectorizer.transform(newsgroups_train.data)

for i in range(doc_term.shape[0]):
    label = newsgroups_train.target[i]
    start, stop = doc_term.indptr[i], doc_term.indptr[i + 1]
    word_ids = doc_term.indices[start:stop]      # words occurring in document i
    topics = word_to_topic[word_ids]             # their current topic assignments
    per_topic = np.bincount(topics, minlength=len(num_topic))

    alpha[label] += 1                                  # documents per labelled newsgroup
    beta[label, word_ids] += doc_term.data[start:stop] # word occurrences per labelled newsgroup
    num_text_topic[i] = per_topic                      # n_{d,k}
    num_topic += per_topic                             # n_k
    num_topic_word[topics, word_ids] += 1              # n_{k,w}; word_ids are unique within a row

In [7]:
N_SWEEPS = 5    # several passes over the corpus, for stability

# Loop-invariant work: the corpus is vectorized once, and beta is never
# modified below, so its per-topic row sums are computed once.
doc_term = vectorizer.transform(newsgroups_train.data)
beta_row_sums = beta.sum(axis=1)

# NOTE(review): word_to_topic stores one topic per vocabulary word, not per
# (document, word) occurrence. The topic decremented for a document can
# therefore differ from the one that was counted for it, counters may become
# negative, and np.abs(p) below only masks that. Confirm whether per-token
# assignments were intended.
for count in range(N_SWEEPS):
    for i in range(doc_term.shape[0]):
        word_ids = doc_term.indices[doc_term.indptr[i]:doc_term.indptr[i + 1]]
        for word in word_ids:                        # vocabulary index of the word
            # Remove the word's current assignment from the counters.
            topic = word_to_topic[word]
            num_text_topic[i, topic] -= 1
            num_topic[topic] -= 1
            num_topic_word[topic, word] -= 1

            # Unnormalised weight of every topic for this word in this document.
            p = ((num_text_topic[i] + alpha)
                 * (num_topic_word[:, word] + beta[:, word])
                 / (num_topic + beta_row_sums))

            # Sample a new topic and add it back into the counters.
            topic = generate_with_weight(np.abs(p))
            word_to_topic[word] = topic
            num_text_topic[i, topic] += 1
            num_topic[topic] += 1
            num_topic_word[topic, word] += 1

In [11]:
# Top-10 words for every tag according to the algorithm

# Reverse lookup: vocabulary index -> word.
inverse_dict = dict((index, term) for term, index in vectorizer.vocabulary_.items())
for i, topic_name in enumerate(newsgroups_train.target_names):
    print('    Top 10 words in the Topic = {0}'.format(topic_name))
    print()
    ranked = np.argsort(num_topic_word[i])              # ascending by n_{k,w}
    assigned_here = ranked[word_to_topic[ranked] == i]  # keep words assigned to topic i
    x = assigned_here[:-11:-1]                          # last ten, largest first
    for word_id in x:
        print(inverse_dict.get(word_id), end=' ')
    print()
    print('\n\n')

    Top 10 words in the Topic = alt.atheism

old based person makes works large today running net advance 



    Top 10 words in the Topic = comp.graphics

really post support file space original hear children lines center 



    Top 10 words in the Topic = comp.os.ms-windows.misc

time want come group email free control agree instead buy 



    Top 10 words in the Topic = comp.sys.ibm.pc.hardware

comes posting files fast 93 future coming office dead players 



    Top 10 words in the Topic = comp.sys.mac.hardware

way doesn hard questions making john single pay 21 build 



    Top 10 words in the Topic = comp.windows.x

hope computer feel open clear sense asked ones sale sun 



    Top 10 words in the Topic = misc.forsale

tell left team evidence numbers friend 19 sent bought text 



    Top 10 words in the Topic = rec.autos

sure long line 15 public money low pc claim sell 



    Top 10 words in the Topic = rec.motorcycles

used ll life start game version certainly nice home

In [12]:
# Top-10 words for every tag according to the dataset counts

for i, topic_name in enumerate(newsgroups_train.target_names):
    print('    Top 10 words in the Topic = {0}\n'.format(topic_name))
    ranked = np.argsort(beta[i])                        # ascending by count in beta
    assigned_here = ranked[word_to_topic[ranked] == i]  # keep words assigned to topic i
    x = assigned_here[:-11:-1]                          # last ten, largest first
    for word_id in x:
        print(inverse_dict.get(word_id), end=' ')
    print()
    print('\n\n')

    Top 10 words in the Topic = alt.atheism

say atheism makes actually person based islam away book christian 



    Top 10 words in the Topic = comp.graphics

graphics file run post appreciated vga support lines original programming 



    Top 10 words in the Topic = comp.os.ms-windows.misc

know using dos want hi time microsoft got anybody group 



    Top 10 words in the Topic = comp.sys.ibm.pc.hardware

speed 16 data port right uses fast vlb cache screen 



    Top 10 words in the Topic = comp.sys.mac.hardware

apple video scsi way hard help doesn lc machine powerbook 



    Top 10 words in the Topic = comp.windows.x

window windows code program sun x11r5 look mail send xlib 



    Top 10 words in the Topic = misc.forsale

shipping price use brand contact case address obo 250 trade 



    Top 10 words in the Topic = rec.autos

engine problem better dealer sure long bad oil honda model 



    Top 10 words in the Topic = rec.motorcycles

bike ride going ll motorcycle used ro

Алгоритм достаточно быстро ставит в соответствие каждому слову определенный тэг, который на самом деле соответствует основной тематике слова. Однако проведенных итераций явно недостаточно для получения распределения тэгов над словами, удовлетворяющего критерию стабильности и представлению о смысле слов. Это происходит потому, что в алгоритме на каждом шаге счётчики меняются на 1, в то время как объём слов, помноженный на количество тэгов, очень велик. Таким образом, для получения осмысленных результатов для распределения тэгов над словами необходимо провести намного больше итераций.