Выполняем весь код с семинара для получения корпуса


In [22]:
from pprint import pprint

import re
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import spacy
from gensim.utils import simple_preprocess
import pandas as pd

import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

data = df.content.values.tolist()

data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

data = [re.sub('\s+', ' ', sent) for sent in data]

data = [re.sub("\'", "", sent) for sent in data]


def sent_to_words(sentences):
    for sentence in sentences:
        yield simple_preprocess(str(sentence), deacc=True)


data_words = list(sent_to_words(data))

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)


def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]


def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


def lemmatization(texts, allowed_postags):
    """https://spacy.io/api/annotation"""
    if allowed_postags is None:
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


data_words_nostops = remove_stopwords(data_words)

data_words_bigrams = make_bigrams(data_words_nostops)

nlp = spacy.load('en', disable=['parser', 'ner'])

data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

[nltk_data] Downloading package stopwords to /Users/st/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Качаем Mallet.

In [23]:
mallet_path = 'mallet-2.0.8/bin/mallet'

Запускаем вот такую функцию сначала в широком диапазоне [10,30), потом сужаем до [15,25)

In [24]:
# max_coh = 0
# best_groups = 0
#
# for i in range(10,30):
#     ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=i, id2word=id2word)
#     coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
#     coherence_ldamallet = coherence_model_ldamallet.get_coherence()
#     print(i, coherence_ldamallet)
#     if max_coh < coherence_ldamallet:
#         max_coh = coherence_ldamallet
#         best_groups = i
#
# print()
# print(max_coh, best_groups)

Получаем в первом случае число групп 23, во втором 21. Во втором прогоне было больше, чем в первом, так что берем 21

На самом деле, значения на прогонах разные даже для одного и того же числа групп, максимум наблюдается на промежутке от 19 до 23 где-то.

Далее получаем топики по генсим-модели, проходим по всем текстам и для каждого в словаре считаем суммы весов слов из топиков, если они встречаются.
Ключ с максимальным значением в таком словаре - широкий топик для текста.
Тексты по широким топикам тоже кидаем в общий словарь.

In [25]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=21,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)
topics = lda_model.show_topics(formatted=False)

import operator

def findTextCommonTopic(text, all_topics=topics):
    text_topics = {}
    for word in text:
        for topic in all_topics:
            for topic_word in topic[1]:
                if word == topic_word[0]:
                    if topic[0] in text_topics:
                        text_topics[topic[0]]+=topic_word[1]
                    else:
                        text_topics[topic[0]]=topic_word[1]
    if text_topics:
        return max(text_topics.items(), key=operator.itemgetter(1))[0]
    else:
        return None


groups = {}

for text in texts:
    text_topic = findTextCommonTopic(text)
    if text_topic:
        if text_topic in groups:
            groups[text_topic].append(" ".join([word for word in text]))
        else:
            groups[text_topic] = [" ".join([word for word in text])]

Теперь ищем TF-IDF в cловах для каждой группы, сохраняем топ5 для текста

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

def processGroup(group_topic, group):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(group)
    feature_names = vectorizer.get_feature_names()
    denseList = vectors.todense().tolist()
    tfidfs = {}
    data = []
    for i in range(len(group)):
        for num, word in enumerate(feature_names):
            tfidfs[word] = denseList[i][num]
        top5 = sorted(tfidfs, key=tfidfs.get, reverse=True)[:5]
        data.append([group[i], group_topic, top5])
    return data


data = []
for group_topic in groups:
    data.extend(processGroup(group_topic, groups[group_topic]))

Создадим массив, в котором сложим слова топиков по порядку наших данных.
Сложим это все в датафрейм

In [27]:
topic_words = [topic[1] for row in data for topic in topics if row[1] == topic[0]]

dataframe = pd.DataFrame(data, columns=['Text', 'Topic_id', 'top5'])
dataframe.insert(2, 'topic words', topic_words)

In [28]:
dataframe

Unnamed: 0,Text,Topic_id,topic words,top5
0,where thing car nntp_poste host park line wond...,15,"[(line, 0.2247845), (host, 0.09429961), (nntp_...","[car, door, lerxst, funky, where]"
1,si poll final summary final call si clock repo...,15,"[(line, 0.2247845), (host, 0.09429961), (nntp_...","[clock, si, poll, upgrade, final]"
2,question engineering computer network distribu...,15,"[(line, 0.2247845), (host, 0.09429961), (nntp_...","[display, machine, bunch, hear, store]"
3,division line host amber write write article k...,15,"[(line, 0.2247845), (host, 0.09429961), (nntp_...","[division, quadrilateral, weitek, chip, amber]"
4,man tumor treatment thank people respond reque...,15,"[(line, 0.2247845), (host, 0.09429961), (nntp_...","[treatment, astrocytoma, bouncing, prob, tumor]"
...,...,...,...,...
11282,horse breeding sale dear sir private agricultu...,9,"[(number, 0.022235667), (may, 0.02095375), (al...","[horse, firm, class, breed, breeding]"
11283,write look program algorithm use computation l...,13,"[(sorry, 0.0945596), (utility, 0.04175744), (b...","[day, length, time, long, able]"
11284,sale sale machine condition scratch fully oper...,13,"[(sorry, 0.0945596), (utility, 0.04175744), (b...","[ask, manual, sale, scratch, shipping]"
11285,kick write question teach long time ago really...,13,"[(sorry, 0.0945596), (utility, 0.04175744), (b...","[angel, dragon, fight, hurl, kick]"


P.S.
Coherence определяет, насколько действительно семантически близки слова внутри одного топика