In [1]:
import numpy as np
from gensim.models import Word2Vec, LdaMulticore
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
import pyLDAvis
from pyLDAvis import gensim 
import codecs

In [2]:
def preprocessing(sentence):
    """Turn a raw text line into a list of tokens without stop words.

    The text is tokenized with gensim's ``simple_preprocess`` and every
    token that appears in gensim's ``STOPWORDS`` collection is dropped.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        list: The remaining tokens, in their original order.
    """
    kept_tokens = []
    for token in simple_preprocess(sentence):
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [3]:
def read_sentences(filename):
    """Lazily yield one preprocessed token list per line of a data file.

    The file is looked up inside the ``data/`` directory and decoded as
    latin-1. Lines are read and preprocessed one at a time, so the raw
    file is never held in memory as a whole.

    Args:
        filename: Name of the file inside ``data/``.

    Yields:
        list: The result of ``preprocessing`` for each line, in file order.
    """
    path = f'data/{filename}'
    with open(path, encoding='latin-1') as handle:
        yield from map(preprocessing, handle)

In [4]:
# Materialise the whole corpus in memory: one token list per line of the input file.
# %time reports how long reading and preprocessing take.
%time sentences = list(read_sentences('reviews_data.txt'))

CPU times: user 1min 2s, sys: 709 ms, total: 1min 3s
Wall time: 1min 3s


In [5]:
# Number of documents in the corpus (one per line of the input file).
len(sentences)

256263

In [6]:
# Train word embeddings on the full corpus: 100-dimensional vectors, a context
# window of 5, and min_count=2 as the word-frequency cut-off.
# NOTE(review): `size` looks like the gensim 3.x keyword (later releases call it
# `vector_size`) — confirm against the installed gensim version.
model = Word2Vec(
    sentences,
    size=100,
    min_count=2,
    window=5,
)

In [7]:
# Optional shortcut: uncomment to load a previously saved model instead of
# using the one trained in this notebook.
# NOTE(review): the file name mentions window10, whereas the model trained in
# this notebook uses window=5 — confirm which configuration is intended.
#model = Word2Vec.load('models/hotel_model_size100_window10_mincount2.model')

# Reprezentacja wektorowa

In [8]:
# Words whose vectors are most similar to 'good', as (word, similarity) pairs.
model.wv.most_similar('good')

[('decent', 0.8274618983268738),
 ('great', 0.8233014345169067),
 ('excellent', 0.8061910271644592),
 ('ok', 0.6639823317527771),
 ('fair', 0.6505758762359619),
 ('average', 0.6288623809814453),
 ('reasonable', 0.6256632804870605),
 ('nice', 0.6243017911911011),
 ('fantastic', 0.6124641299247742),
 ('terrific', 0.6112827658653259)]

In [9]:
# Words whose vectors are most similar to 'love', as (word, similarity) pairs.
model.wv.most_similar('love')

[('loved', 0.6971858143806458),
 ('favorite', 0.618948757648468),
 ('awesome', 0.5361564755439758),
 ('favourite', 0.5238020420074463),
 ('wonderful', 0.5195814371109009),
 ('liked', 0.517303466796875),
 ('fabulous', 0.5163142085075378),
 ('enjoy', 0.5141449570655823),
 ('amazing', 0.493054062128067),
 ('dream', 0.4659866690635681)]

In [10]:
# Analogy-style query: words close to 'bad' and 'terrible' while being pushed
# away from 'good'.
model.wv.most_similar(positive=['bad', 'terrible'], negative=['good'])

[('horrible', 0.8491475582122803),
 ('awful', 0.8014451265335083),
 ('horrific', 0.7025386095046997),
 ('horrendous', 0.6988598108291626),
 ('shocking', 0.696241021156311),
 ('dreadful', 0.6770745515823364),
 ('horrid', 0.6746914386749268),
 ('appalling', 0.6689745783805847),
 ('nasty', 0.6572792530059814),
 ('disgusting', 0.6528801918029785)]

In [11]:
# Words whose vectors are most similar to 'london', as (word, similarity) pairs.
model.wv.most_similar('london')

[('nyc', 0.8362125158309937),
 ('sf', 0.8164799809455872),
 ('manhattan', 0.7946785688400269),
 ('ny', 0.7853796482086182),
 ('chicago', 0.7533805966377258),
 ('dubai', 0.7417941093444824),
 ('beijing', 0.7366599440574646),
 ('montreal', 0.7160376906394958),
 ('manhatten', 0.6859848499298096),
 ('city', 0.6763434410095215)]

# Modelowanie tematów - LDA

In [12]:
# Draw a reproducible random sample of at most 2500 documents for the LDA part.
# The indices are shuffled rather than the documents themselves: handing the
# ragged list of token lists to np.random.permutation makes NumPy build an
# array from it, which recent NumPy releases reject for rows of unequal length.
sample_rng = np.random.RandomState(42)  # fixed seed: same sample on every run
sample_idx = sample_rng.permutation(len(sentences))[:2500]
# Keep the sample as a plain list of token lists, the same element type as `sentences`.
sentences_light = [sentences[i] for i in sample_idx]

In [13]:
# Build the token -> integer id mapping from the sampled documents.
%time dictionary = Dictionary(sentences_light)

CPU times: user 374 ms, sys: 0 ns, total: 374 ms
Wall time: 373 ms


In [14]:
# Vocabulary size of the sampled documents (number of distinct tokens).
len(dictionary)

14430

In [15]:
%time bow_corpus = [dictionary.doc2bow(sent) for sent in sentences_light]

CPU times: user 196 ms, sys: 8 ms, total: 204 ms
Wall time: 203 ms


In [16]:
# Sanity check of the bag-of-words format: a token repeated twice comes back
# as a single (token_id, count) pair with count 2.
dictionary.doc2bow(['car', 'car'])

[(799, 2)]

In [17]:
%time lda_model = LdaMulticore(bow_corpus, id2word=dictionary, num_topics=100, workers=4)

CPU times: user 9.89 s, sys: 925 ms, total: 10.8 s
Wall time: 8.64 s


In [18]:
# Print every topic (num_topics=-1 requests all of them) together with its
# weighted top words.
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print(f'Topic: {topic_id}\nWords: {topic_terms}')

Topic: 0
Words: 0.023*"hotel" + 0.013*"room" + 0.013*"stay" + 0.011*"great" + 0.006*"service" + 0.005*"time" + 0.005*"place" + 0.004*"night" + 0.004*"went" + 0.004*"good"
Topic: 1
Words: 0.028*"hotel" + 0.023*"room" + 0.018*"great" + 0.013*"stay" + 0.009*"quot" + 0.008*"stayed" + 0.008*"breakfast" + 0.008*"night" + 0.007*"staff" + 0.007*"service"
Topic: 2
Words: 0.044*"hotel" + 0.017*"room" + 0.016*"great" + 0.012*"location" + 0.010*"stay" + 0.009*"rooms" + 0.008*"good" + 0.007*"day" + 0.007*"nice" + 0.006*"staff"
Topic: 3
Words: 0.014*"hotel" + 0.014*"room" + 0.010*"conrad" + 0.006*"view" + 0.006*"like" + 0.006*"great" + 0.006*"bed" + 0.006*"stay" + 0.006*"bathroom" + 0.006*"friendly"
Topic: 4
Words: 0.034*"hotel" + 0.020*"room" + 0.013*"good" + 0.012*"staff" + 0.011*"location" + 0.011*"nice" + 0.010*"great" + 0.009*"stay" + 0.009*"rooms" + 0.008*"time"
Topic: 5
Words: 0.032*"hotel" + 0.020*"room" + 0.010*"stayed" + 0.010*"clean" + 0.009*"rooms" + 0.009*"good" + 0.008*"location" + 0.0

# Wizualizacja tematów

In [19]:
# Prepare the pyLDAvis data structure from the fitted LDA model, the
# bag-of-words corpus it was trained on, and the matching dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module under a
# different name (`pyLDAvis.gensim_models`) — confirm for the installed version.
lda_vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
# Show the prepared visualisation as the cell output.
pyLDAvis.display(lda_vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
