In [1]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Visualize the topics
# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis

# Notebook-wide display and logging configuration.
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Hide DeprecationWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Emit only ERROR-level (and above) log records, prefixed with timestamp and level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
# Inputs produced by earlier steps of the pipeline:
#   - the gensim dictionary and Matrix Market corpus stored under corpus_dict/
#   - the hyper-parameter tuning table (columns used below: Topics, Alpha, Coherence)
id2word = corpora.Dictionary.load("corpus_dict/dict")
corpus = corpora.MmCorpus("corpus_dict/corpus")
tuning = pd.read_csv('2_lda_tuning_results.csv', encoding="utf-8")

In [3]:
def _best_alpha(tuning_k):
    """Return the ``Alpha`` of the row with the highest ``Coherence``.

    Parameters
    ----------
    tuning_k : pandas.DataFrame
        Tuning results restricted to a single number of topics; must contain
        the columns ``Coherence`` and ``Alpha``.

    Returns
    -------
    float or str
        The alpha as a float when it is numeric (including numeric text such
        as ``"0.31"``), otherwise the value unchanged (e.g. ``'symmetric'``).

    Raises
    ------
    ValueError
        Propagated from ``idxmax`` when ``tuning_k`` has no rows.
    """
    alpha = tuning_k.loc[tuning_k['Coherence'].idxmax(), 'Alpha']
    # A CSV column that mixes numbers with prior names is read back entirely
    # as strings; turn numeric text into a real number so it can be used as a
    # scalar prior, and leave named priors as they are.
    try:
        return float(alpha)
    except (TypeError, ValueError):
        return alpha


# Tuning rows for each candidate number of topics.
tuning_5 = tuning.loc[tuning["Topics"] == 5]
tuning_10 = tuning.loc[tuning['Topics'] == 10]
tuning_20 = tuning.loc[tuning['Topics'] == 20]
tuning_30 = tuning.loc[tuning['Topics'] == 30]

# Alpha of the best-coherence run for each number of topics.
alpha_5 = _best_alpha(tuning_5)
alpha_10 = _best_alpha(tuning_10)
alpha_20 = _best_alpha(tuning_20)
alpha_30 = _best_alpha(tuning_30)

# K = 5

In [4]:
# Train the 5-topic model. alpha is the best-coherence value picked from the
# tuning table; eta is set to 1 / num_topics.
lda_model_5 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    alpha=alpha_5,
    eta=1/5,
    chunksize=100,
    passes=40,
    iterations=1000,
    eval_every=None,
    random_state=100,  # fixed seed
)

In [5]:
# Print the top 30 words of each of the 5 topics, one topic per block.
for topic_id, word_weights in lda_model_5.show_topics(
        num_topics=5, num_words=30, log=False, formatted=False):
    top_words = ', '.join(entry[0] for entry in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: device, phone, app, show, alexa_app, account, use, connect, work, '
 'issue, echo_show, go, try, set, alexa, get, amazon, echo, wifi, problem, '
 'see, link, buy, setting, change, able, option, still, network, update')
 
('Topic 1: echo, dot, use, speaker, echo_dot, alarm, sound, set, device, want, '
 'volume, house, connect, room, way, get, tv, group, music, reminder, audio, '
 'like, echos, pair, work, go, possible, bluetooth, kitchen, fire_tv')
 
('Topic 2: alexa, skill, use, call, get, amazon, make, like, know, way, want, '
 'feature, think, say, add, question, ask, voice, thank, give, notification, '
 'google, people, thing, see, list, enable, find, read, look')
 
('Topic 3: turn, light, alexa, routine, work, set, use, control, switch, tv, '
 'command, want, trigger, plug, get, group, device, go, way, say, room, time, '
 'bulb, like, change, add, name, hub, timer, run')
 
('Topic 4: play, alexa, say, ask, spotify, time, song, music, get, stop, '
 'happen, try, start, na

In [6]:
# Interactive pyLDAvis view of the 5-topic model, displayed as the cell output.
# sort_topics=False is passed so pyLDAvis does not re-order the topics.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_5,
    corpus,
    id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [7]:
# Write the visualisation currently held in `vis` (prepared in the previous
# cell from lda_model_5) to an HTML file. `vis` is reassigned for every K, so
# this cell must run right after the matching prepare cell.
# NOTE(review): assumes the LDAvis/ directory already exists — confirm.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_5.html')

# K = 10

In [8]:
# Train the 10-topic model. alpha is the best-coherence value picked from the
# tuning table; eta is set to 1 / num_topics.
lda_model_10 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    alpha=alpha_10,
    eta=1/10,
    chunksize=100,
    passes=40,
    iterations=1000,
    eval_every=None,
    random_state=100,  # fixed seed
)

In [9]:
# Print the top 30 words of each of the 10 topics, one topic per block.
for topic_id, word_weights in lda_model_10.show_topics(
        num_topics=10, num_words=30, log=False, formatted=False):
    top_words = ', '.join(entry[0] for entry in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: show, amazon, get, echo_show, buy, use, good, screen, see, order, '
 'go, come, well, keep, think, camera, put, month, display, purchase, already, '
 'list, new, year, item, car, take, like, still, look')
 
('Topic 1: phone, device, account, call, use, alexa, link, set, able, home, '
 'echo, house, drop, way, kid, want, need, access, alexa_device, wife, '
 'amazon_account, contact, alexa_app, household, iphone, possible, number, '
 'log, setup, add')
 
('Topic 2: app, skill, alexa_app, change, try, go, option, update, setting, '
 'see, enable, find, alexa, notification, add, check, list, delete, disable, '
 'available, remove, open, still, select, help, get, uk, feature, new, say')
 
('Topic 3: routine, set, time, way, alarm, go, trigger, want, start, day, '
 'volume, turn, wake, minute, stop, reminder, timer, command, hour, leave, '
 'possible, night, action, morning, use, sleep, run, make, second, activate')
 
('Topic 4: alexa, make, google, know, like, skill, people, use,

In [10]:
# Interactive pyLDAvis view of the 10-topic model, displayed as the cell output.
# sort_topics=False is passed so pyLDAvis does not re-order the topics.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_10,
    corpus,
    id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [11]:
# Write the visualisation currently held in `vis` (prepared in the previous
# cell from lda_model_10) to an HTML file. `vis` is reassigned for every K, so
# this cell must run right after the matching prepare cell.
# NOTE(review): assumes the LDAvis/ directory already exists — confirm.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_10.html')

# K = 20

In [12]:
# Train the 20-topic model. alpha is the best-coherence value picked from the
# tuning table; eta is set to 1 / num_topics.
lda_model_20 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha=alpha_20,
    eta=1/20,
    chunksize=100,
    passes=40,
    iterations=1000,
    eval_every=None,
    random_state=100,  # fixed seed
)

In [13]:
# Print the top 30 words of each of the 20 topics, one topic per block.
for topic_id, word_weights in lda_model_20.show_topics(
        num_topics=20, num_words=30, log=False, formatted=False):
    top_words = ', '.join(entry[0] for entry in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: show, see, echo_show, screen, video, ring, camera, close, display, '
 'take, purchase, door, car, come, top, tablet, receive, stay, buy, away, '
 'thought, spot, lock, go, include, device, fire, photo, upgrade, auto')
 
('Topic 1: phone, call, account, device, drop, set, house, kid, alexa_device, '
 'able, access, wife, home, amazon_account, contact, message, number, '
 'household, share, sign, log, family, profile, setup, person, register, '
 'announcement, need, parent, echo_device')
 
('Topic 2: option, feature, update, check, amazon, setting, new, support, '
 'available, still, live, yet, set, location, uk, choose, default, since, see, '
 'info, language, address, canada, english, country, wait, version, state, '
 'finally, us')
 
('Topic 3: time, start, happen, stop, back, sometimes, go, seem, keep, always, '
 'second, never, day, long, end, mine, move, month, week, year, mode, last, '
 'recently, take, still, though, temperature, cause, normal, first')
 
('Topic 4: ski

In [14]:
# Interactive pyLDAvis view of the 20-topic model, displayed as the cell output.
# sort_topics=False is passed so pyLDAvis does not re-order the topics.
# (An alternative layout, mds='tsne', was considered but is not enabled.)
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_20,
    corpus,
    id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [15]:
# Write the visualisation currently held in `vis` (prepared in the previous
# cell from lda_model_20) to an HTML file. `vis` is reassigned for every K, so
# this cell must run right after the matching prepare cell.
# NOTE(review): assumes the LDAvis/ directory already exists — confirm.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_20.html')

# K = 30

In [16]:
# Train the 30-topic model. alpha is the best-coherence value picked from the
# tuning table; eta is set to 1 / num_topics.
lda_model_30 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,
    alpha=alpha_30,
    eta=1/30,
    chunksize=100,
    passes=40,
    iterations=1000,
    eval_every=None,
    random_state=100,  # fixed seed
)

In [17]:
# Print the top 30 words of each of the 30 topics, one topic per block.
for topic_id, word_weights in lda_model_30.show_topics(
        num_topics=30, num_words=30, log=False, formatted=False):
    top_words = ', '.join(entry[0] for entry in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: phone, use, home, house, open, leave, ring, camera, car, door, '
 'automatically, require, receive, press, away, lock, drive, alert, window, '
 'auto, photo, walk, doorbell, battery, view, picture, security, need, '
 'background, system')
 
('Topic 1: call, account, drop, wife, set, kid, access, amazon_account, '
 'contact, number, household, share, log, able, sign, person, family, profile, '
 'register, house, use, parent, mom, dad, separate, allow, need, daughter, '
 'child, echo_device')
 
('Topic 2: amazon, option, feature, update, support, available, live, yet, '
 'location, uk, choose, default, currently, cant_find, set, version, language, '
 'canada, english, country, address, state, still, date, us, store, guess, '
 'base, local, release')
 
('Topic 3: happen, keep, back, put, check, maybe, never, year, mine, though, '
 'solution, month, move, least, week, last, normal, sort, enough, later, '
 'advice, bathroom, since, pull, bring, perhaps, side, audible, somehow, si

In [18]:
# Interactive pyLDAvis view of the 30-topic model, displayed as the cell output.
# R=30 and sort_topics=False are passed explicitly, exactly as for the
# K = 5 / 10 / 20 visualisations: without sort_topics=False pyLDAvis re-orders
# the topics, so the ids shown in the plot would no longer correspond to the
# gensim topic ids printed by show_topics() for this model.
# (An alternative layout, mds='tsne', was considered but is not enabled.)
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_30, corpus, id2word, R=30, sort_topics=False)
vis

  default_term_info = default_term_info.sort_values(


In [19]:
# Write the visualisation currently held in `vis` (prepared in the previous
# cell from lda_model_30) to an HTML file. `vis` is reassigned for every K, so
# this cell must run right after the matching prepare cell.
# NOTE(review): assumes the LDAvis/ directory already exists — confirm.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_30.html')

In [20]:
from gensim.test.utils import datapath

# Persist the four trained models under model/ so later cells (and later
# sessions) can reload them without retraining.
for _model, _path in (
    (lda_model_5, "model/lda_model_5"),
    (lda_model_10, "model/lda_model_10"),
    (lda_model_20, "model/lda_model_20"),
    (lda_model_30, "model/lda_model_30"),
):
    _model.save(_path)

# Finding the dominant topic in each document

In [4]:
# Source documents plus the saved 10-topic model used for the per-document
# analysis below. `corpus` is still expected to come from the loading cell
# near the top of the notebook.
df = pd.read_csv('alexa_merged.csv', encoding="utf-8")
lda = gensim.models.LdaMulticore.load("model/lda_model_10")

In [5]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=df['content']):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

# Dominant topic of every document, scored with the reloaded 10-topic model.
# The publish date (not the text) is attached as the per-document payload.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda,
    corpus=corpus,
    texts=df['publish_date'],
)

# Turn the positional index into an explicit Document_No column and give the
# columns their export names.
# NOTE(review): the column labelled 'Content' holds the publish_date values
# passed above — confirm the label is intended.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Content',
]
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Content
0,0,1.0,0.1984,"phone, device, account, call, use, alexa, link...",2016-06-01 03:25:15
1,1,0.0,0.1,"show, amazon, get, echo_show, buy, use, good, ...",2016-06-03 02:59:42
2,2,3.0,0.2268,"routine, set, time, way, alarm, go, trigger, w...",2016-06-04 02:19:44
3,3,6.0,0.1174,"alexa, use, tv, control, work, switch, need, p...",2016-06-04 02:55:29
4,4,4.0,0.1746,"alexa, make, google, know, like, skill, people...",2016-06-04 11:01:10
5,5,1.0,0.1949,"phone, device, account, call, use, alexa, link...",2016-06-07 04:33:25
6,6,4.0,0.414,"alexa, make, google, know, like, skill, people...",2016-06-08 13:30:47
7,7,6.0,0.1176,"alexa, use, tv, control, work, switch, need, p...",2016-06-11 06:27:51
8,8,4.0,0.1414,"alexa, make, google, know, like, skill, people...",2016-06-11 10:44:39
9,9,3.0,0.1625,"routine, set, time, way, alarm, go, trigger, w...",2016-06-13 01:09:31


In [6]:
# Export the per-document dominant-topic table (without the pandas index).
df_dominant_topic.to_csv("5_df_dominant_topic.csv", encoding='utf-8', index=False)

In [14]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Link"]

# Show
sent_topics_sorteddf_mallet.head

<bound method NDFrame.head of    Topic_Num  Topic_Perc_Contrib  \
0        0.0              0.6305   
1        1.0              0.7229   
2        2.0              0.5820   
3        3.0              0.7668   
4        4.0              0.6415   
5        5.0              0.5895   
6        6.0              0.6271   
7        7.0              0.6980   
8        8.0              0.7667   
9        9.0              0.7035   

                                                                            Keywords  \
0                   show, amazon, get, echo_show, buy, use, good, screen, see, order   
1                    phone, device, account, call, use, alexa, link, set, able, home   
2               app, skill, alexa_app, change, try, go, option, update, setting, see   
3                      routine, set, time, way, alarm, go, trigger, want, start, day   
4                  alexa, make, google, know, like, skill, people, use, amazon, look   
5           issue, connect, echo, echo_dot, d

In [15]:
# Export the most representative document of each topic (without the pandas index).
sent_topics_sorteddf_mallet.to_csv("4_sent_topics_sorteddf_mallet.csv", encoding='utf-8', index=False)