In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import WordNetLemmatizer 
from tqdm import tqdm

In [2]:
# Load the training split of 20 Newsgroups; the `remove` argument asks the
# loader to strip headers, footers and quoted text from every post.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
# Class (newsgroup) names that ship with the dataset.
nameOfTag = newsgroups_train.target_names

In [3]:
# Settings for the bag-of-words representation and the topic model.
n_features = 4700    # upper bound on vocabulary size (passed as max_features)
n_components = 20    # presumably the intended number of topics -- not referenced in this cell
n_top_words = 10     # presumably the number of words to show per topic -- not referenced in this cell

# Binary bag-of-words: each entry records only whether a word occurs in a
# document. Words appearing in more than 95% of documents or in fewer than
# two documents are dropped before the vocabulary is capped at n_features.
vectorizer = CountVectorizer(
    lowercase=True,
    # 'english' selects scikit-learn's built-in English stop-word list through
    # the public API. Passing the frozenset from the private `_stop_words`
    # module selects the same list, but newer scikit-learn releases validate
    # this parameter and accept only 'english', a list, or None.
    stop_words='english',
    analyzer='word',
    binary=True,
    max_df=0.95,
    min_df=2,
    max_features=n_features,
)
# Build the vocabulary and vectorize the documents in a single pass.
# NOTE: .toarray() materialises the full dense (documents x vocabulary)
# matrix, which is memory-hungry; the cells shown here only use its .shape
# and .nonzero().
X_train = vectorizer.fit_transform(newsgroups_train.data).toarray()

In [4]:
# Size of the fitted vocabulary (bounded above by max_features=n_features).
len(vectorizer.vocabulary_)

4700

In [5]:
def _customLDA(n_d_k, n_k_w, n_k, _z, _document, _word, _alpha, _beta, _topic,  max_iter=10):
    for i in tqdm(range(max_iter)):
        for j in range(len(_document)):
            cur_word = _word[j]
            cur_document = _document[j]
            cur_topic = _z[j]
            n_d_k[cur_document, cur_topic] -= 1
            n_k_w[cur_topic, cur_word] -= 1
            n_k[cur_topic] -= 1
            p = (n_d_k[cur_document, :] + _alpha) * (n_k_w[:, cur_word] + _beta[cur_word]) / (n_k + _beta.sum())
            _z[j] = np.random.choice(np.arange(_topic), p = p / p.sum())
            n_d_k[cur_document, _z[j]] += 1
            n_k_w[_z[j], cur_word] += 1
            n_k[_z[j]] += 1
    return n_d_k, n_k_w, n_k, _z


In [7]:
topic = 20
# Seed NumPy's global RNG so the random initial assignment below -- and any
# later draws that also use the global RNG -- are the same on every
# Restart & Run All.
np.random.seed(42)
# Count tables, all starting at zero:
#   n_d_k[d, k] -- tokens of document d assigned to topic k
#   n_k_w[k, w] -- occurrences of word w assigned to topic k
#   n_k[k]      -- total tokens assigned to topic k
n_d_k = np.zeros((X_train.shape[0], topic))
n_k_w = np.zeros((topic, X_train.shape[1]))
n_k = np.zeros(topic)
# One (document, word) index pair per non-zero entry of the document-term
# matrix; these pairs are the "tokens" the sampler iterates over.
document, word = X_train.nonzero()
# Uniformly random initial topic for every token.
z = np.random.choice(topic, len(document))

In [8]:
# Tally the initial assignment into the three count tables without a
# per-token Python loop. np.add.at accumulates unbuffered, so index pairs
# that occur more than once (e.g. several tokens of one document sharing a
# topic) are each counted.
# NOTE: this cell adds to the existing tables; re-running it without first
# re-running the cell that zeroes them double-counts every token.
np.add.at(n_d_k, (document, z), 1)
np.add.at(n_k_w, (z, word), 1)
np.add.at(n_k, z, 1)
    

In [10]:
# Run 30 Gibbs sweeps with all-ones smoothing vectors. The vector lengths and
# the topic count come from `topic` and the vocabulary width, so they cannot
# drift out of step with the count tables if `topic` is changed.
n_d_k, n_k_w, n_k, z = _customLDA(
    n_d_k, n_k_w, n_k, z, document, word,
    np.ones(topic), np.ones(X_train.shape[1]), topic,
    max_iter=30,
)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [09:28<00:00, 18.95s/it]


In [11]:
# Column indices of the n_top_words largest counts in every topic row
# (argsort is ascending, so the largest counts sit at the end of each row).
result = np.argsort(n_k_w, axis=1)[:, -n_top_words:]
for i in range(n_k_w.shape[0]):
    # Mark the selected words in a single indicator row and let the
    # vectorizer map it back to terms. The words appear to come back in
    # vocabulary order rather than ranked by count.
    matrix = np.zeros((1, X_train.shape[1]))
    matrix[0, result[i]] = 1
    print('Tag {} \t{}'.format(i + 1, '\t'.join(vectorizer.inverse_transform(matrix)[0])))


Tag 1 	answer	different	far	new	posting	really	say	sure	think	way
Tag 2 	16	apple	bit	board	card	memory	software	speed	support	video
Tag 3 	10	11	15	18	30	game	games	play	team	year
Tag 4 	earth	high	large	low	nasa	power	small	space	time	use
Tag 5 	come	does	just	know	ll	look	people	right	think	way
Tag 6 	children	country	government	israel	jews	killed	people	state	war	world
Tag 7 	chip	clipper	don	encryption	government	just	key	law	people	use
Tag 8 	available	code	file	files	following	ftp	list	program	using	write
Tag 9 	does	don	good	know	like	look	looking	time	use	want
Tag 10 	buy	car	good	interested	new	offer	price	sale	sell	used
Tag 11 	believe	bible	christian	christians	god	jesus	life	people	religion	say
Tag 12 	american	april	clinton	federal	government	national	president	public	state	states
Tag 13 	ago	day	don	edu	good	know	like	need	soon	years
Tag 14 	agree	case	did	does	doesn	point	say	think	try	wrong
Tag 15 	better	don	just	like	lot	probably	really	think	ve	way
Tag 16 	did	don	g