#Задача:
запустить модель LDA и Gibbs Sampling с числов тегов 20. Вывести топ-10 слов по каждому тегу. Соотнести полученные теги с тегами из датасета, сделать выводы.

In [3]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                            analyzer='word',min_df = 0.002)

X_train = vectorizer.fit_transform(newsgroups_train.data)
u = X_train.toarray()

w = []
for e in range(len(u)):
    k = []
    t = u[e].nonzero()[0]
    for i in range(len(t)):
        for j in range(u[e][t[i]]):
            k.append(t[i])
    w.append(k)

q = np.array(w)

def FormInitModel(arr, words_by_key, z, chosen_words, q):
    for d, text in enumerate(q):
        lst = []

        for w in text:
            p = np.divide(np.multiply(arr[d, :], words_by_key[:, w]), z)
            a = np.random.multinomial(1, p / p.sum()).argmax()
            lst.append(a)

            arr[d, a] += 1
            words_by_key[a, w] += 1
            z[a] += 1

        chosen_words.append(lst)


def sampling(arr,words_by_key,z,chosen_words,q):
    for d, text in enumerate(q):
        for i, w in enumerate(text):
            a = chosen_words[d][i]
            arr[d, a] -= 1
            words_by_key[a, w] -= 1
            z[a] -= 1
            p = np.divide(np.multiply(arr[d, :], words_by_key[:, w]), z)
            a = np.random.multinomial(1, p / p.sum()).argmax()
            chosen_words[d][i] = a
            arr[d, a] += 1
            words_by_key[a, w] += 1
            z[a] += 1

def GetKey(dictionary, value):
    Keys = list()
    ItemList = dictionary.items()

    for Item in ItemList:
        if Item[1] == value:
            Keys.append(Item[0])

    return Keys

num = len(vectorizer.vocabulary_)
alpha_param = 5
beta_param = 0.1

chosen_words = []
arr = np.zeros([len(q), 20]) + alpha_param
words_by_key = np.zeros([20, num]) + beta_param
z = np.zeros([20]) + num * beta_param

FormInitModel(arr,words_by_key,z,chosen_words,q)

for i in range(50):
  sampling(arr,words_by_key,z,chosen_words,q)

#набираем топ слов
words_by_topic = []

for topic in range(20):
  index = words_by_key[topic, :].argsort()[:10]
  words = []
 
  for i in index:
    words.append(GetKey(vectorizer.vocabulary_, i))
  words_by_topic.append(list(reversed(words)))

for i in range (20):
    print(i+1, words_by_topic[i],'\n')



1 [['87'], ['rights'], ['archive'], ['email'], ['arabs'], ['los'], ['title'], ['problem'], ['alt'], ['expansion']] 

2 [['friend'], ['run'], ['manager'], ['cars'], ['versions'], ['completely'], ['postscript'], ['84'], ['bell'], ['won']] 

3 [['identity'], ['rights'], ['dept'], ['attacks'], ['sounds'], ['period'], ['regular'], ['results'], ['posts'], ['english']] 

4 [['testament'], ['clinton'], ['greatly'], ['nsa'], ['tests'], ['carry'], ['room'], ['int'], ['client'], ['resurrection']] 

5 [['crypto'], ['signal'], ['hardware'], ['apply'], ['90'], ['sales'], ['operation'], ['typical'], ['checked'], ['method']] 

6 [['homosexuals'], ['charge'], ['operating'], ['actually'], ['adam'], ['normally'], ['record'], ['check'], ['university'], ['house']] 

7 [['consider'], ['internet'], ['processing'], ['buying'], ['apps'], ['looks'], ['report'], ['interface'], ['functions'], ['colormap']] 

8 [['background'], ['event'], ['meaning'], ['tried'], ['et'], ['postscript'], ['trouble'], ['truly'], ['po

In [5]:
l = newsgroups_train.target_names

for i in range(len(l)):
  print(l[i],'\n')

alt.atheism 

comp.graphics 

comp.os.ms-windows.misc 

comp.sys.ibm.pc.hardware 

comp.sys.mac.hardware 

comp.windows.x 

misc.forsale 

rec.autos 

rec.motorcycles 

rec.sport.baseball 

rec.sport.hockey 

sci.crypt 

sci.electronics 

sci.med 

sci.space 

soc.religion.christian 

talk.politics.guns 

talk.politics.mideast 

talk.politics.misc 

talk.religion.misc 



Видно, что большинство тем легко угадываются по набору часто используемых слов. К примеру, 5 топик -- это явно sci.crypt (тут встречаются слова "crypto", "method", "operation"). Тем не менее, некоторые получившиеся наборы слов трудно различить, потому что темы близки по смыслу, а потому таковы и используемые термины. (для меня такими наборами оказались  15 и 11 -- оба связаны с оружием -- RKBA и firearms наводят мысли об этом, но какой именно топик подходит к talks.politics.guns у меня не получилось определить)