In [28]:
import pandas as pd
import numpy as np

In [29]:
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora, models

data = fetch_20newsgroups(subset='train')
data = data['data']
data = data[:50]

# Preprocessing

In [63]:
import nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))



In [69]:
# tokinize
tokenizer = nltk.tokenize.WhitespaceTokenizer()
doc = [tokenizer.tokenize(document) for document in data]

In [98]:
# lematization
lematizer = nltk.stem.WordNetLemmatizer()
doc_lem = [
     [lematizer.lemmatize(token) for token in document]
     for document in doc]

In [99]:
# drop stop words
texts = [
     [word for word in document if word not in stopwords]
     for document in doc_lem
 ]

In [101]:
#  создаем словарь 
dictionary = corpora.Dictionary(texts)

In [102]:
# проводим векторизацию
corpus = [dictionary.doc2bow(text) for text in texts]

In [103]:
# tfidf model
tfidf_model = models.TfidfModel(corpus)  

In [104]:
# tfidf 
tfidf = tfidf_model[corpus]

Here we can implement
- RpModel
- LsiModel
- HdpModel
- LdaModel

# LSI

In [105]:
# init model
lsi = models.LsiModel(tfidf, id2word=dictionary, num_topics=20)

In [106]:
corpus_lsi = lsi[tfidf]

In [107]:
lsi.print_topics()[0]

(0,
 '0.253*">" + 0.236*">>" + 0.117*"-" + 0.099*"wa" + 0.097*"I" + 0.093*"*" + 0.091*"insurance" + 0.089*"car" + 0.087*"The" + 0.081*"revolver"')

# LDA

In [38]:
lda = models.LdaModel(tfidf, id2word=dictionary, num_topics=20)

In [39]:
lda.id2word

<gensim.corpora.dictionary.Dictionary at 0x1ac4a67fa58>

In [44]:
# for LDA only
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary=lda.id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [42]:
vis

In [109]:
lda_model = models.LdaMulticore(tfidf, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [111]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.001*">" + 0.001*"lefthanded" + 0.001*"starter" + 0.001*"root@ncube.com" + 0.001*"0" + 0.001*":)" + 0.001*"Computer" + 0.001*"lefty" + 0.001*"righthanded" + 0.001*"Viola"
Topic: 1 Word: 0.001*"captain" + 0.001*"TIFF" + 0.001*"3.0" + 0.001*"traded" + 0.001*"NL" + 0.001*"acne" + 0.001*">>" + 0.001*"cover" + 0.001*"wa" + 0.001*"Space"
Topic: 2 Word: 0.001*"font" + 0.001*">" + 0.001*"DOS" + 0.001*"bullpen" + 0.001*"8514/A" + 0.001*"size" + 0.001*"window" + 0.001*"small" + 0.000*"mode" + 0.000*"seven"
Topic: 4 Word: 0.001*"purchased" + 0.001*"revolver" + 0.001*">" + 0.001*"3V" + 0.001*"48V" + 0.001*"Toronto" + 0.001*"===" + 0.001*"$10" + 0.001*"semi" + 0.001*"|"
Topic: 5 Word: 0.001*"*" + 0.001*"thank" + 0.001*"movie" + 0.001*"Virginia" + 0.001*"Hinckley)" + 0.001*"kph2q@onyx.cs.Virginia.EDU" + 0.001*"input" + 0.001*"-------------------------------------------------------------------------------" + 0.001*"voice" + 0.001*"(Kenneth"
Topic: 7 Word: 0.001*"\" + 0.001*">>" + 0.00

In [114]:
for index, score in sorted(lda_model[tfidf[1]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8971303701400757	 
Topic: 0.001*"\" + 0.001*">>" + 0.001*"/" + 0.001*"SI" + 0.001*"clock" + 0.001*"plant" + 0.001*"SSF" + 0.001*"-" + 0.001*"water" + 0.001*"option"

Score: 0.011432066559791565	 
Topic: 0.001*"captain" + 0.001*"TIFF" + 0.001*"3.0" + 0.001*"traded" + 0.001*"NL" + 0.001*"acne" + 0.001*">>" + 0.001*"cover" + 0.001*"wa" + 0.001*"Space"

Score: 0.01142990030348301	 
Topic: 0.001*"*" + 0.001*"thank" + 0.001*"movie" + 0.001*"Virginia" + 0.001*"Hinckley)" + 0.001*"kph2q@onyx.cs.Virginia.EDU" + 0.001*"input" + 0.001*"-------------------------------------------------------------------------------" + 0.001*"voice" + 0.001*"(Kenneth"

Score: 0.011429856531322002	 
Topic: 0.001*"purchased" + 0.001*"revolver" + 0.001*">" + 0.001*"3V" + 0.001*"48V" + 0.001*"Toronto" + 0.001*"===" + 0.001*"$10" + 0.001*"semi" + 0.001*"|"

Score: 0.01142982579767704	 

Score: 0.01142975315451622	 

Score: 0.011429657228291035	 
Topic: 0.001*">>" + 0.001*")>>" + 0.001*"mask" + 0.001*"mass" + 0

### Performance evaluation

In [116]:
for index, score in sorted(lda_model[tfidf[1]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8971301317214966	 
Topic: 0.001*"\" + 0.001*">>" + 0.001*"/" + 0.001*"SI" + 0.001*"clock" + 0.001*"plant" + 0.001*"SSF" + 0.001*"-" + 0.001*"water" + 0.001*"option"

Score: 0.01143224909901619	 
Topic: 0.001*"captain" + 0.001*"TIFF" + 0.001*"3.0" + 0.001*"traded" + 0.001*"NL" + 0.001*"acne" + 0.001*">>" + 0.001*"cover" + 0.001*"wa" + 0.001*"Space"

Score: 0.01142991241067648	 
Topic: 0.001*"*" + 0.001*"thank" + 0.001*"movie" + 0.001*"Virginia" + 0.001*"Hinckley)" + 0.001*"kph2q@onyx.cs.Virginia.EDU" + 0.001*"input" + 0.001*"-------------------------------------------------------------------------------" + 0.001*"voice" + 0.001*"(Kenneth"

Score: 0.011429868638515472	 
Topic: 0.001*"purchased" + 0.001*"revolver" + 0.001*">" + 0.001*"3V" + 0.001*"48V" + 0.001*"Toronto" + 0.001*"===" + 0.001*"$10" + 0.001*"semi" + 0.001*"|"

Score: 0.01142983790487051	 

Score: 0.011429766193032265	 

Score: 0.011429669335484505	 
Topic: 0.001*">>" + 0.001*")>>" + 0.001*"mask" + 0.001*"mass" + 0

#### Задание для самостоятельно работы

1) данные voted.csv <br>
2) провести тематическое моделирование, использовать весь препроцессинг <br>
3) подобровать адекватный метод, и адекватное количество топиков. <br>
4) расшифровать эти топики

### Домашнее задание (option 1)

1) Файл HW1.txt <br>
2) обработать файл с помощью регулярных выражений, убрать лишние символы, оставить только абстракты <br>
3) провести предобработку текста <br>
4) провести тематическое моделирование (выбрать метод и нужное количество топиков)<br>
5) Визулизировать темы гистограммой (статическая визуализация)

### Домашнее задание (option 2)*

1) Сайт http://zpp.rospotrebnadzor.ru/Forum/Appeals - скачать не менее 300 страниц<br>
2) провести предобработку текста <br>
3) провести тематическое моделирование (LSI) не используя пакет gensim. Можно использовать только numpy. [Инструкция](https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.linalg.svd.html).<br>
4) (optional) Визуализация тем.