In [1]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Visualize the topics
# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Timestamped log lines; only ERROR and above are emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
# Inputs for every model below: the hyper-parameter tuning table, the
# serialized Matrix Market corpus, and the token dictionary.
tuning = pd.read_csv('2_lda_tuning_results.csv', encoding="utf-8")
corpus = corpora.MmCorpus("corpus_dict/corpus")
id2word = corpora.Dictionary.load("corpus_dict/dict")

In [53]:
def _coerce_alpha(alpha):
    """Return ``alpha`` as a float when it is a numeric string, otherwise unchanged.

    NOTE(review): if the ``Alpha`` column of the tuning CSV mixes numbers with
    names such as 'symmetric', pandas reads the whole column back as strings
    (e.g. '0.31'). gensim only accepts a few named string priors, so numeric
    strings are converted back to floats here. Confirm against the CSV.
    """
    if isinstance(alpha, str):
        try:
            return float(alpha)
        except ValueError:
            # A named prior (not parseable as a number): pass it through as-is.
            return alpha
    return alpha


def _best_alpha(tuning_subset):
    """Return the ``Alpha`` value of the row with the highest ``Coherence``.

    Raises
    ------
    ValueError
        If ``tuning_subset`` has no rows (no tuning run for that topic count).
    """
    if tuning_subset.empty:
        raise ValueError("no tuning results available for the requested number of topics")
    best_row = tuning_subset['Coherence'].idxmax()
    return _coerce_alpha(tuning_subset.loc[best_row, 'Alpha'])


# Tuning results split by number of topics.
tuning_5 = tuning.loc[tuning['Topics'] == 5]
tuning_10 = tuning.loc[tuning['Topics'] == 10]
tuning_20 = tuning.loc[tuning['Topics'] == 20]
tuning_30 = tuning.loc[tuning['Topics'] == 30]

# Alpha of the most coherent configuration for each topic count.
alpha_5 = _best_alpha(tuning_5)
alpha_10 = _best_alpha(tuning_10)
alpha_20 = _best_alpha(tuning_20)
alpha_30 = _best_alpha(tuning_30)

# K = 5

In [54]:
# Settings for the 5-topic model: alpha comes from `alpha_5`, eta is fixed at 1/K,
# and random_state pins the seed.
lda_params_5 = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    chunksize=100,
    passes=40,
    iterations=1000,
    alpha=alpha_5,
    eta=1/5,
    eval_every=None,
)
lda_model_5 = gensim.models.LdaMulticore(**lda_params_5)

In [55]:
# List up to 30 words for each of the 5 topics, one pretty-printed line group per topic.
topics_5 = lda_model_5.show_topics(num_topics=5, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_5:
    top_words = ', '.join(word for word, _ in word_weights)
    pprint(f'Topic {topic_id}: {top_words}')
    print(" ")

('Topic 0: play, spotify, music, issue, echo, alexa, ask, time, say, song, '
 'alarm, work, problem, try, get, happen, stop, listen, go, start, tell, way, '
 'playlist, amazon_music, want, fix, like, seem, wake, hear')
 
('Topic 1: echo, dot, use, speaker, echo_dot, get, connect, sound, pair, want, '
 'bluetooth, audio, well, go, device, plug, like, buy, room, work, phone, '
 'music, house, good, kitchen, think, way, need, come, make')
 
('Topic 2: turn, alexa, light, routine, use, control, set, work, tv, device, '
 'want, switch, command, group, get, able, way, name, room, echo, bulb, hub, '
 'go, need, trigger, app, add, make, like, say')
 
('Topic 3: device, app, echo, phone, set, alexa_app, use, work, account, try, '
 'connect, call, go, show, issue, get, need, see, wifi, able, problem, '
 'echo_show, update, drop, check, link, setting, change, router, thank')
 
('Topic 4: alexa, get, amazon, use, like, skill, show, make, say, time, think, '
 'want, thing, see, go, know, ask, give,

In [56]:
# Interactive pyLDAvis panel for the 5-topic model, shown as the cell output.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_5,
    corpus,
    id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [57]:
# Write the current `vis` object to an HTML file.
lda_vis_html_5 = 'LDAvis/lda_model_5.html'
pyLDAvis.save_html(vis, lda_vis_html_5)

# K = 10

In [58]:
# Settings for the 10-topic model: alpha comes from `alpha_10`, eta is fixed at 1/K,
# and random_state pins the seed.
lda_params_10 = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=40,
    iterations=1000,
    alpha=alpha_10,
    eta=1/10,
    eval_every=None,
)
lda_model_10 = gensim.models.LdaMulticore(**lda_params_10)

In [59]:
# List up to 30 words for each of the 10 topics, one pretty-printed line group per topic.
topics_10 = lda_model_10.show_topics(num_topics=10, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_10:
    top_words = ', '.join(word for word, _ in word_weights)
    pprint(f'Topic {topic_id}: {top_words}')
    print(" ")

('Topic 0: play, music, spotify, song, listen, ask, echo, playlist, want, '
 'amazon_music, play_music, work, get, way, alexa, like, stream, try, station, '
 'radio, say, find, alexa_play, tell, podcast, apple_music, add, name, track, '
 'service')
 
('Topic 1: issue, problem, say, happen, hear, time, fix, try, seem, respond, '
 'start, echo, go, alexa, second, sometimes, day, change, thing, think, tell, '
 'back, notice, talk, fine, see, mine, reset, still, sound')
 
('Topic 2: light, alexa, turn, work, group, name, room, switch, device, '
 'control, bulb, use, hub, bedroom, living_room, set, plug, house, get, '
 'kitchen, want, go, add, fan, turn_lights, scene, echo, alexa_turn, change, '
 'echo_plus')
 
('Topic 3: device, phone, app, echo, alexa_app, set, use, account, connect, '
 'call, wifi, try, work, able, need, go, router, link, network, see, setup, '
 'change, amazon_account, internet, access, setting, iphone, contact, drop, '
 'home')
 
('Topic 4: use, like, alexa, make, well

In [60]:
# Interactive pyLDAvis panel for the 10-topic model, shown as the cell output.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_10,
    corpus,
    id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [61]:
# Write the current `vis` object to an HTML file.
lda_vis_html_10 = 'LDAvis/lda_model_10.html'
pyLDAvis.save_html(vis, lda_vis_html_10)

# K = 20

In [62]:
# Settings for the 20-topic model: alpha comes from `alpha_20`, eta is fixed at 1/K,
# and random_state pins the seed.
lda_params_20 = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    chunksize=100,
    passes=40,
    iterations=1000,
    alpha=alpha_20,
    eta=1/20,
    eval_every=None,
)
lda_model_20 = gensim.models.LdaMulticore(**lda_params_20)

In [63]:
# List up to 30 words for each of the 20 topics, one pretty-printed line group per topic.
topics_20 = lda_model_20.show_topics(num_topics=20, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_20:
    top_words = ', '.join(word for word, _ in word_weights)
    pprint(f'Topic {topic_id}: {top_words}')
    print(" ")

('Topic 0: play, music, spotify, song, listen, playlist, amazon_music, '
 'play_music, want, stream, ask, station, radio, alexa_play, podcast, like, '
 'apple_music, track, album, tunein, service, play_song, playing, find, '
 'playing_music, pandora, artist, request, library, shuffle')
 
('Topic 1: echo, hear, voice, talk, respond, room, echos, pick, kitchen, '
 'close, speak, house, listen, wake_word, record, office, mic, microphone, '
 'move, living_room, command, like, mute, noise, loud, maybe, bedroom, know, '
 'sit, make')
 
('Topic 2: turn, light, group, alexa, switch, room, bulb, hub, control, '
 'bedroom, living_room, want, fan, alexa_turn, scene, turn_lights, use, '
 'echo_plus, set, hue, color, smartthing, smart_home, add, get, dim, '
 'hue_lights, say_alexa_turn, come, name')
 
('Topic 3: connect, wifi, try, setup, echo, use, router, computer, network, '
 'internet, pc, run, connection, connected, need, disconnect, channel, able, '
 'error, server, get, password, laptop, set

In [64]:
# Interactive pyLDAvis panel for the 20-topic model, shown as the cell output.
# (An alternative layout can be requested with mds='tsne'.)
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_20,
    corpus,
    id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [65]:
# Write the current `vis` object to an HTML file.
lda_vis_html_20 = 'LDAvis/lda_model_20.html'
pyLDAvis.save_html(vis, lda_vis_html_20)

# K = 30

In [66]:
# Settings for the 30-topic model: alpha comes from `alpha_30`, eta is fixed at 1/K,
# and random_state pins the seed.
lda_params_30 = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,
    random_state=100,
    chunksize=100,
    passes=40,
    iterations=1000,
    alpha=alpha_30,
    eta=1/30,
    eval_every=None,
)
lda_model_30 = gensim.models.LdaMulticore(**lda_params_30)

In [67]:
# List up to 30 words for each of the 30 topics, one pretty-printed line group per topic.
topics_30 = lda_model_30.show_topics(num_topics=30, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_30:
    top_words = ', '.join(word for word, _ in word_weights)
    pprint(f'Topic {topic_id}: {top_words}')
    print(" ")

('Topic 0: play, spotify, music, song, listen, playlist, amazon_music, '
 'play_music, stream, station, radio, alexa_play, podcast, apple_music, track, '
 'album, tunein, ask, play_song, playing, artist, pandora, shuffle, library, '
 'service, playing_music, play_spotify, like, spotify_app, skip')
 
('Topic 1: know, maybe, like, talk, echos, far, kind, case, pick, let, place, '
 'one, time, believe, unless, record, course, mute, mic, sure, noise, '
 'different, area, together, assume, hope, near, somehow, monitor, one_echo')
 
('Topic 2: show, echo_show, see, screen, video, display, camera, clock, spot, '
 'setting, option, youtube, keep, watch, tablet, photo, use, like, '
 'home_screen, view, recipe, picture, card, background, black, ring_doorbell, '
 'scroll, bottom, front, able')
 
('Topic 3: connect, wifi, setup, router, network, internet, try, pc, go, '
 'connection, set, connected, work, disconnect, solution, online, password, '
 'laptop, make_sure, wifi_network, reconnect, use, 

In [68]:
# Interactive pyLDAvis panel for the 30-topic model, shown as the cell output.
pyLDAvis.enable_notebook()
# R=30 and sort_topics=False match the 5-, 10- and 20-topic cells. With sorting
# disabled pyLDAvis keeps the model's own topic order rather than reordering
# topics by prevalence, so the panel lines up with the printed topic list.
# (An alternative layout can be requested with mds='tsne'.)
vis = gensimvis.prepare(lda_model_30, corpus, id2word, R=30, sort_topics=False)
vis

  default_term_info = default_term_info.sort_values(


In [69]:
# Write the current `vis` object to an HTML file.
lda_vis_html_30 = 'LDAvis/lda_model_30.html'
pyLDAvis.save_html(vis, lda_vis_html_30)

In [70]:
from gensim.test.utils import datapath

# Persist each trained model under model/ so later cells can reload it.
for fitted_model, model_path in (
    (lda_model_5, "model/lda_model_5"),
    (lda_model_10, "model/lda_model_10"),
    (lda_model_20, "model/lda_model_20"),
    (lda_model_30, "model/lda_model_30"),
):
    fitted_model.save(model_path)

# Finding the dominant topic in each document

In [3]:
# Reload the saved 10-topic model and the merged CSV used for per-document analysis.
lda = gensim.models.LdaMulticore.load("model/lda_model_10")
df = pd.read_csv('amazonecho_merged.csv', encoding="utf-8")

In [4]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Build a per-document table of the dominant topic, its weight and its keywords.

    Parameters
    ----------
    ldamodel : trained LDA model
        Indexed with ``corpus`` to obtain one topic distribution per document;
        ``ldamodel.show_topic`` supplies the keywords of a topic.
    corpus : iterable of documents, optional
        When omitted, the notebook-level ``corpus`` is looked up at call time.
    texts : sequence or pandas.Series, optional
        Values appended as the last column, paired with the documents by
        position. When omitted, the notebook-level ``df['content']`` is looked
        up at call time.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic`` (integer topic
        id), ``Perc_Contribution`` (weight rounded to 4 decimals) and
        ``Topic_Keywords`` (comma-separated words), followed by the ``texts``
        column. A document whose topic list is empty gets NaN / NaN / ''
        so that rows stay aligned with ``texts``.
    """
    # Late-bound defaults: defining the function no longer requires the
    # notebook globals (or a 'content' column) to exist yet.
    if corpus is None:
        corpus = globals()["corpus"]
    if texts is None:
        texts = globals()["df"]["content"]

    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled the topic distribution is the first element.
        row = row_list[0] if ldamodel.per_word_topics else row_list

        if not row:
            # Keep a placeholder row so later documents do not shift upwards.
            records.append((float("nan"), float("nan"), ""))
            continue

        # Dominant topic = highest weight (first one wins on ties).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )

        records.append(
            (topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once instead of appending row by row
    # (DataFrame.append was removed in pandas 2.0 and was quadratic).
    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
    )

    # Add original text to the end of the output, matched by position.
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)

# Tag every document with its dominant topic. The text column passed here is
# `publish_date`, so the column later labelled 'Content' holds timestamps.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda,
    corpus=corpus,
    texts=df['publish_date'],
)

# Promote the row index to a document-number column and apply display names.
dominant_topic_columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Content',
]
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = dominant_topic_columns
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Content
0,0,7.0,0.3446,"dot, echo, echo_dot, speaker, use, connect, so...",2016-06-01 00:08:22
1,1,9.0,0.229,"amazon, get, work, buy, go, order, car, update...",2016-06-01 01:42:04
2,2,1.0,0.4328,"issue, problem, say, happen, hear, time, fix, ...",2016-06-01 02:59:31
3,3,5.0,0.2247,"alexa, skill, ask, question, list, thank, answ...",2016-06-01 03:29:19
4,4,4.0,0.4332,"use, like, alexa, make, well, thing, think, ge...",2016-06-01 05:36:07
5,5,6.0,0.1902,"show, echo_show, alarm, time, go, screen, way,...",2016-06-01 06:28:52
6,6,5.0,0.3067,"alexa, skill, ask, question, list, thank, answ...",2016-06-01 06:44:26
7,7,4.0,0.3349,"use, like, alexa, make, well, thing, think, ge...",2016-06-01 14:39:37
8,8,4.0,0.3092,"use, like, alexa, make, well, thing, think, ge...",2016-06-01 15:28:30
9,9,4.0,0.4087,"use, like, alexa, make, well, thing, think, ge...",2016-06-01 15:39:33


In [5]:
# Export the per-document dominant-topic table without the index column.
df_dominant_topic.to_csv(
    "5_df_dominant_topic.csv",
    index=False,
    encoding='utf-8',
)

In [74]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Link"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Link
0,0.0,0.7702,"play, music, spotify, song, listen, ask, echo, playlist, want, amazon_music",When I ask Alexa to “play some music” it will continue the track I was previously playing anywhe...
1,1.0,0.6838,"issue, problem, say, happen, hear, time, fix, try, seem, respond","Alexa abruptly stoping when speaking (Echo Dot 3) Hi, today I got my Echo Dot 3, and sometimes w..."
2,2.0,0.7627,"light, alexa, turn, work, group, name, room, switch, device, control",Configuring multiple Hue lights in one room - suggestions please In my kitchen I have a strip of...
3,3.0,0.7237,"device, phone, app, echo, alexa_app, set, use, account, connect, call",How can I configure my Amazon/Alexa account to drop in on my dad? I'm sure this question has bee...
4,4.0,0.6206,"use, like, alexa, make, well, thing, think, get, good, echo","Intercom system uses Alexa [""The price includes two of them, but even then seems rather expensi..."
5,5.0,0.8404,"alexa, skill, ask, question, list, thank, answer, add, check, say","Amazon Echo Dot 3rd generation ['Hey, Almansour1987! Thank you for your submission, unfortunate..."
6,6.0,0.7563,"show, echo_show, alarm, time, go, screen, way, set, wake, turn",Issue with the timer I like to set a timer for 20 minutes and then follow it with another timer ...
7,7.0,0.8376,"dot, echo, echo_dot, speaker, use, connect, sound, pair, bluetooth, audio",Echo Flex - want to have bluetooth speakers AND bluetooth mic in shower Hi\n\nI have several Ech...
8,8.0,0.7314,"routine, turn, tv, use, alexa, set, command, want, way, control",Turn up Volume Alexa and Harmony Hub How can we get Alexa to increase volume from an AV connecte...
9,9.0,0.8642,"amazon, get, work, buy, go, order, car, update, echo_auto, new",Got invited to buy the Echo Auto but can't get it shipped to me So I thought I would give it a t...


In [75]:
# Export the top document per topic without the index column.
sent_topics_sorteddf_mallet.to_csv(
    "4_sent_topics_sorteddf_mallet.csv",
    index=False,
    encoding='utf-8',
)