In [1]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Visualize the topics
# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis

# Notebook-wide configuration: use the inline matplotlib backend, hide
# DeprecationWarning messages, and only emit log records at ERROR level or above.
%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
# Load the inputs: the saved gensim dictionary, the saved Matrix Market corpus,
# and the CSV of LDA tuning results.
# NOTE(review): these files are presumably written by an earlier notebook in
# the pipeline; confirm they exist before running this one.
id2word = corpora.Dictionary.load("corpus_dict/dict")
corpus = corpora.MmCorpus("corpus_dict/corpus")
tuning=pd.read_csv('2_lda_tuning_results.csv',encoding="utf-8")

In [3]:
def _best_alpha_for(tuning_results, num_topics):
    """Return ``(rows, alpha)`` for the tuning runs that used ``num_topics`` topics.

    rows  -- the rows of ``tuning_results`` whose 'Topics' equals ``num_topics``.
    alpha -- the 'Alpha' value of the row in ``rows`` with the highest 'Coherence'.

    A numeric alpha that was read from the CSV as text (e.g. "0.31") is
    converted to ``float``; any other string (presumably a named prior such as
    'symmetric' or 'asymmetric' -- confirm against the tuning notebook) is
    returned unchanged.
    """
    rows = tuning_results.loc[tuning_results["Topics"] == num_topics]
    alpha = rows.loc[rows["Coherence"].idxmax(), "Alpha"]
    if isinstance(alpha, str):
        try:
            alpha = float(alpha)
        except ValueError:
            # Not a number: leave the string as it is.
            pass
    return rows, alpha


# Best-coherence alpha for every topic count that is trained below.
tuning_5, alpha_5 = _best_alpha_for(tuning, 5)
tuning_10, alpha_10 = _best_alpha_for(tuning, 10)
tuning_20, alpha_20 = _best_alpha_for(tuning, 20)
tuning_30, alpha_30 = _best_alpha_for(tuning, 30)

# K = 5

In [4]:
# Train the 5-topic LDA model. alpha is the value selected from the tuning
# results above; eta is fixed at 1/num_topics.
lda_model_5 = gensim.models.LdaMulticore(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=5,
    alpha=alpha_5,
    eta=1/5,
    # training schedule
    passes=40,
    iterations=1000,
    chunksize=100,
    eval_every=None,
    # fixed seed
    random_state=100,
)

In [5]:
# Print the 30 highest-weighted words of each of the 5 topics.
for topic_id, word_probs in lda_model_5.show_topics(num_topics=5, num_words=30, log=False, formatted=False):
    top_words = ', '.join(word for word, _ in word_probs)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: play, ask, say, time, google, spotify, music, tell, get, like, '
 'happen, listen, stop, try, alarm, hear, start, song, go, work, command, day, '
 'use, want, respond, sometimes, voice, playlist, thing, way')
 
('Topic 1: phone, app, device, work, issue, try, set, show, home, add, '
 'google_home, link, account, google, problem, go, home_app, see, fix, list, '
 'use, update, change, remove, get, find, say, setting, call, help')
 
('Topic 2: get, google, use, like, make, hub, go, think, good, see, buy, look, '
 'well, come, want, know, thing, alexa, nest_hub, feature, much, love, camera, '
 'display, people, need, year, great, nest, take')
 
('Topic 3: turn, light, routine, use, set, room, control, work, switch, '
 'command, want, way, google_home, name, bedroom, device, change, need, make, '
 'house, bulb, like, run, get, able, living_room, google_assistant, plug, '
 'trigger, say')
 
('Topic 4: speaker, connect, tv, device, use, mini, chromecast, wifi, cast, '
 'google_home

In [6]:
# Turn on pyLDAvis notebook rendering, prepare the visualisation data for the
# 5-topic model and display it as the cell output.
# sort_topics=False asks pyLDAvis to keep the model's topic order (see the
# pyLDAvis `prepare` documentation); R=30 matches the 30 words printed per topic.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_5, corpus, id2word, R=30, sort_topics=False)
vis

  default_term_info = default_term_info.sort_values(


In [7]:
# Write the 5-topic visualisation prepared in the previous cell to an HTML file.
# NOTE(review): `vis` is reassigned for every K, so this must run right after
# the matching prepare cell; the LDAvis/ directory is assumed to exist.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_5.html')

# K = 10

In [8]:
# Train the 10-topic LDA model. alpha is the value selected from the tuning
# results above; eta is fixed at 1/num_topics.
lda_model_10 = gensim.models.LdaMulticore(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=10,
    alpha=alpha_10,
    eta=1/10,
    # training schedule
    passes=40,
    iterations=1000,
    chunksize=100,
    eval_every=None,
    # fixed seed
    random_state=100,
)

In [9]:
# Print the 30 highest-weighted words of each of the 10 topics.
for topic_id, word_probs in lda_model_10.show_topics(num_topics=10, num_words=30, log=False, formatted=False):
    top_words = ', '.join(word for word, _ in word_probs)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: play, spotify, music, song, ask, playlist, play_music, use, '
 'youtube_music, listen, want, youtube, get, video, cast, like, album, work, '
 'news, start, way, podcast, try, radio, google_play, stop, stream, say, '
 'google_home, name')
 
('Topic 1: issue, work, problem, try, fix, device, happen, get, seem, update, '
 'say, google, reset, still, time, go, back, help, fine, google_home, start, '
 'factory_reset, error, reboot, today, notice, thing, see, bug, give')
 
('Topic 2: phone, use, google_assistant, call, list, work, assistant, way, '
 'find, google_home, add, want, reminder, get, able, google, app, ask, make, '
 'iphone, know, read, send, possible, say, try, name, help, android, '
 'notification')
 
('Topic 3: get, hub, buy, use, look, go, want, good, home_hub, like, order, '
 'plug, need, replace, work, display, nest, button, photo, see, thank, find, '
 'deal, think, screen, offer, take, cheap, make, purchase')
 
('Topic 4: connect, tv, chromecast, wifi, use, devic

In [10]:
# Turn on pyLDAvis notebook rendering, prepare the visualisation data for the
# 10-topic model and display it as the cell output.
# sort_topics=False asks pyLDAvis to keep the model's topic order (see the
# pyLDAvis `prepare` documentation); R=30 matches the 30 words printed per topic.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_10, corpus, id2word, R=30, sort_topics=False)
vis

  default_term_info = default_term_info.sort_values(


In [11]:
# Write the 10-topic visualisation prepared in the previous cell to an HTML file.
# NOTE(review): `vis` is reassigned for every K, so this must run right after
# the matching prepare cell; the LDAvis/ directory is assumed to exist.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_10.html')

# K = 20

In [12]:
# Train the 20-topic LDA model. alpha is the value selected from the tuning
# results above; eta is fixed at 1/num_topics.
lda_model_20 = gensim.models.LdaMulticore(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=20,
    alpha=alpha_20,
    eta=1/20,
    # training schedule
    passes=40,
    iterations=1000,
    chunksize=100,
    eval_every=None,
    # fixed seed
    random_state=100,
)

In [13]:
# Print the 30 highest-weighted words of each of the 20 topics.
for topic_id, word_probs in lda_model_20.show_topics(num_topics=20, num_words=30, log=False, formatted=False):
    top_words = ', '.join(word for word, _ in word_probs)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: issue, problem, work, fix, try, happen, seem, time, get, google, '
 'reset, start, reboot, still, back, say, fine, sometimes, thing, well, go, '
 'disconnect, restart, help, bug, factory_reset, month, day, week, fail')
 
('Topic 1: say, ask, google, command, respond, work, gh, get, voice, '
 'hey_google, weather, try, response, hear, time, tell, change, answer, like, '
 'assistant, talk, give, ok_google, use, understand, word, know, sometimes, '
 'think, thing')
 
('Topic 2: get, buy, order, deal, free, offer, pay, email, go, sale, uk, '
 'price, purchase, code, canada, sell, available, store, say, today, '
 'subscription, month, use, thank, receive, dad, day, gpm, wait, mom')
 
('Topic 3: use, control, switch, work, plug, get, google_home, bulb, hub, '
 'need, want, buy, look, light, smart, like, color, power, connect, go, make, '
 'ge, cheap, thermostat, sensor, system, smartthing, smart_plug, hue, set')
 
('Topic 4: tv, chromecast, cast, use, video, netflix, work, pc, pod

In [14]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_20, corpus, id2word, R=30, sort_topics=False) #mds='tsne' 
vis

  default_term_info = default_term_info.sort_values(


In [15]:
# Write the 20-topic visualisation prepared in the previous cell to an HTML file.
# NOTE(review): `vis` is reassigned for every K, so this must run right after
# the matching prepare cell; the LDAvis/ directory is assumed to exist.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_20.html')

# K = 30

In [16]:
# Train the 30-topic LDA model. alpha is the value selected from the tuning
# results above; eta is fixed at 1/num_topics.
lda_model_30 = gensim.models.LdaMulticore(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=30,
    alpha=alpha_30,
    eta=1/30,
    # training schedule
    passes=40,
    iterations=1000,
    chunksize=100,
    eval_every=None,
    # fixed seed
    random_state=100,
)

In [17]:
# Print the 30 highest-weighted words of each of the 30 topics.
for topic_id, word_probs in lda_model_30.show_topics(num_topics=30, num_words=30, log=False, formatted=False):
    top_words = ', '.join(word for word, _ in word_probs)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: issue, problem, fix, work, try, google, seem, happen, time, '
 'disconnect, update, still, fine, cause, solution, back, never, break, well, '
 'month, google_home, start, fail, thing, help, say, get, restart, resolve, '
 'respond')
 
('Topic 1: routine, set, time, use, night, change, set_alarm, run, morning, '
 'day, schedule, want, command, pm, way, start, turn, noise, set_routine, '
 'action, suddenly, like, go, rain, option, possible, brightness, part, make, '
 'german')
 
('Topic 2: thank, advance, stream, service, ad, free, parent, soundbar, '
 'channel, watch, hi, shuffle, pay, youtube_tv, mom, dad, son, block, history, '
 'way, auto, datum, wonder, help, announce, cost, recommend, anyone_know, '
 'gift, man')
 
('Topic 3: use, control, switch, work, google_home, bulb, plug, get, light, '
 'need, want, smart, set, connect, app, buy, like, look, turn, hub, scene, '
 'smartthing, able, setup, ge, go, smart_home, hue, outlet, thing')
 
('Topic 4: connect, device, wifi, ro

In [18]:
# Turn on pyLDAvis notebook rendering, prepare the visualisation data for the
# 30-topic model and display it as the cell output.
# R=30 and sort_topics=False are passed explicitly, exactly as for the K=5, 10
# and 20 visualisations: without sort_topics=False pyLDAvis re-sorts the topics,
# so their order would no longer follow the topic list printed for this model.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_30, corpus, id2word, R=30, sort_topics=False)  # optional alternative layout: mds='tsne'
vis

  default_term_info = default_term_info.sort_values(


In [19]:
# Write the 30-topic visualisation prepared in the previous cell to an HTML file.
# NOTE(review): `vis` is reassigned for every K, so this must run right after
# the matching prepare cell; the LDAvis/ directory is assumed to exist.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_30.html')

In [20]:
# NOTE(review): `datapath` is not used in any visible cell; the import looks like
# a leftover from the gensim documentation example.
from gensim.test.utils import datapath
# Save the four trained models to disk under model/ (the directory is assumed
# to exist already).
lda_model_5.save("model/lda_model_5")
lda_model_10.save("model/lda_model_10")
lda_model_20.save("model/lda_model_20")
lda_model_30.save("model/lda_model_30")

# Finding the dominant topic in each document

In [3]:
# Load the merged posts table and the previously saved 10-topic model, which is
# the model used for the dominant-topic analysis below.
# NOTE(review): the cells below also use `corpus`, which is not loaded here; it
# has to come from the corpus-loading cell near the top of the notebook.
df=pd.read_csv('googlehome_merged.csv',encoding="utf-8")
lda = gensim.models.LdaMulticore.load("model/lda_model_10")

In [4]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : trained topic model (required)
        Must support ``ldamodel[corpus]``, ``ldamodel.per_word_topics`` and
        ``ldamodel.show_topic(topic_id)``.
    corpus : iterable of bag-of-words documents, optional
        When omitted, the notebook-level ``corpus`` is looked up at call time.
    texts : sequence or pandas Series, optional
        One entry per document, appended as the last column. When omitted, the
        notebook-level ``df['content']`` is looked up at call time.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns 'Dominant_Topic' (integer topic
        id), 'Perc_Contribution' (topic weight rounded to 4 decimals) and
        'Topic_Keywords' (comma-separated words of ``show_topic``), followed by
        the ``texts`` column. ``texts`` is joined on the index, so it is
        expected to be indexed 0..n-1 in corpus order.

    Raises
    ------
    TypeError
        If ``ldamodel`` is not given.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires an 'ldamodel'")
    # Resolve the notebook defaults when the function is called rather than when
    # it is defined, so a reloaded `corpus` or `df` is never silently ignored.
    if corpus is None:
        corpus = globals()["corpus"]
    if texts is None:
        texts = globals()["df"]["content"]

    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []
    for doc_topics in ldamodel[corpus]:
        # With per_word_topics enabled each result is a tuple whose first item
        # is the (topic, probability) list.
        topic_dist = doc_topics[0] if ldamodel.per_word_topics else doc_topics
        if not topic_dist:
            # Keep a placeholder row so later rows stay aligned with `texts`.
            records.append((None, None, None))
            continue
        # Dominant topic = highest probability (first one wins on ties).
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        # Convert to a Python float before rounding: the probability may be a
        # NumPy float32, which would not keep a clean 4-decimal value.
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once instead of appending row by row
    # (DataFrame.append was removed in pandas 2.0).
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

# Compute the dominant topic of every document with the loaded 10-topic model.
# NOTE(review): `texts` is df['publish_date'], so the last column holds the
# publish date of each post even though it is labelled 'Content' below --
# confirm that this is intended rather than df['content'].
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda, corpus=corpus, texts=df['publish_date'])

# Format: expose the row index as 'Document_No' and give the columns readable names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Content']
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Content
0,0,8.0,0.4686,"google, like, get, feature, thing, think, make...",2016-06-01 20:53:18
1,1,6.0,0.24,"speaker, mini, google_home_mini, group, house,...",2016-06-06 15:04:12
2,2,1.0,0.2304,"issue, work, problem, try, fix, device, happen...",2016-08-26 01:45:35
3,3,3.0,0.2159,"get, hub, buy, use, look, go, want, good, home...",2016-09-01 03:19:28
4,4,3.0,0.2498,"get, hub, buy, use, look, go, want, good, home...",2016-09-24 02:41:27
5,5,2.0,0.2262,"phone, use, google_assistant, call, list, work...",2016-10-02 18:25:46
6,6,8.0,0.396,"google, like, get, feature, thing, think, make...",2016-10-04 18:19:08
7,7,8.0,0.2199,"google, like, get, feature, thing, think, make...",2016-10-04 22:44:21
8,8,8.0,0.5384,"google, like, get, feature, thing, think, make...",2016-10-05 01:06:01
9,9,8.0,0.366,"google, like, get, feature, thing, think, make...",2016-10-05 01:28:39


In [5]:
# Persist the per-document dominant-topic table as CSV (without the index column).
df_dominant_topic.to_csv("5_df_dominant_topic.csv",encoding = 'utf-8',index=False) 

In [6]:
# Display setting to show more characters in column
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

# For every dominant topic keep the single document with the highest topic
# contribution. The per-topic rows are collected first and concatenated once,
# instead of growing a DataFrame inside the loop.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
sent_topics_sorteddf_mallet = pd.concat(
    [
        grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
        for _, grp in sent_topics_outdf_grpd
    ],
    axis=0,
).reset_index(drop=True)  # fresh 0..n-1 index, no in-place mutation

# Format
# NOTE(review): the fourth column is whatever was passed as `texts` when
# df_topic_sents_keywords was built; it is labelled "Link" here -- confirm the
# label matches that column.
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Link"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Link
0,0.0,0.8163,"play, spotify, music, song, ask, playlist, play_music, use, youtube_music, listen",Play specific song from specific album (on Spotify) There's a children's song my daughter really...
1,1.0,0.7755,"issue, work, problem, try, fix, device, happen, get, seem, update",i cant link my wemo account to google home Hi every time I try linking my Wemo Account it says c...
2,2.0,0.8707,"phone, use, google_assistant, call, list, work, assistant, way, find, google_home",[Pre-ownership] How flexible is the coding? Example I'm thinking about intext. I don't own a Goo...
3,3.0,0.8104,"get, hub, buy, use, look, go, want, good, home_hub, like","Haven’t seen too much posted about this deal, Home Hub for $20 with a Nest purchase I just bough..."
4,4.0,0.7559,"connect, tv, chromecast, wifi, use, device, router, cast, network, google_home",Can Google Home cross wifi frequencies? Like the title says. Most of my devices are connected to...
5,5.0,0.7332,"set, device, show, link, account, app, home, add, home_app, option",Delete a house from home app How do I delete an old home/house from the google home app? I don't...
6,6.0,0.7692,"speaker, mini, google_home_mini, group, house, device, room, home, audio, sound",Reducing delay on a google home mini A trick to reduce the delay substantially on your google ho...
7,7.0,0.8964,"turn, light, routine, set, control, room, switch, command, use, name","Google home help, multiple lights in one room Hi all\n\nHoping someone can help out. I've got a ..."
8,8.0,0.8475,"google, like, get, feature, thing, think, make, alexa, well, much",So we Know updates are coming but...WHEN? It's been almost a month now since Google I/O and the ...
9,9.0,0.8539,"time, say, ask, tell, alarm, google, go, hear, voice, command",Alarm loop So I set an alarm for 4 minutes and immediately after my daughter also set an alarm f...


In [7]:
# Persist the table of the most representative document per topic as CSV
# (without the index column).
sent_topics_sorteddf_mallet.to_csv("4_sent_topics_sorteddf_mallet.csv", encoding = 'utf-8',index=False) 