In [1]:
import glob
import os
import re
import sys
from pprint import pprint

import numpy as np
import pandas as pd
import tqdm
from cleantext import clean

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Visualize the topics
# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Hide DeprecationWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Log only ERROR and above, formatted as "<time> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)



In [2]:
# Hyper-parameter tuning results; the cells below read its
# 'Topics', 'Coherence' and 'Alpha' columns.
tuning = pd.read_csv('2_lda_tuning_results.csv', encoding="utf-8")

# Serialized gensim dictionary (id -> token mapping) and the bag-of-words
# corpus stored in Matrix Market format.
id2word = corpora.Dictionary.load("corpus_dict/dict")
corpus = corpora.MmCorpus("corpus_dict/corpus")

In [3]:
def _best_alpha(results, num_topics):
    """Return the 'Alpha' of the highest-coherence tuning run for ``num_topics``.

    Parameters
    ----------
    results : pandas.DataFrame
        Tuning table with 'Topics', 'Coherence' and 'Alpha' columns.
    num_topics : int
        Number of topics (K) whose runs should be considered.

    Returns
    -------
    float or str
        A numeric alpha, or a named prior such as 'symmetric'.

    Raises
    ------
    ValueError
        If the table contains no run for ``num_topics``.
    """
    runs = results.loc[results['Topics'] == num_topics]
    if runs.empty:
        raise ValueError("no tuning results for Topics == {}".format(num_topics))
    alpha = runs.loc[runs['Coherence'].idxmax(), 'Alpha']
    # After a CSV round trip a column that mixes numbers with named priors is
    # read back as strings, so a numeric alpha can arrive as e.g. "0.31".
    # gensim only accepts strings that name a prior, so convert numeric
    # strings back to float and leave real names untouched.
    if isinstance(alpha, str):
        try:
            alpha = float(alpha)
        except ValueError:
            pass  # a named prior ('symmetric', 'asymmetric', ...)
    return alpha


# Per-K slices of the tuning table, kept under their original names.
tuning_5 = tuning.loc[tuning['Topics'] == 5]
tuning_10 = tuning.loc[tuning['Topics'] == 10]
tuning_20 = tuning.loc[tuning['Topics'] == 20]
tuning_30 = tuning.loc[tuning['Topics'] == 30]

# Alpha of the most coherent run for each K.
alpha_5 = _best_alpha(tuning, 5)
alpha_10 = _best_alpha(tuning, 10)
alpha_20 = _best_alpha(tuning, 20)
alpha_30 = _best_alpha(tuning, 30)

# K = 5

In [4]:
# Train the K = 5 model. alpha is the best value found during tuning and
# eta is fixed at 1/K; random_state pins the seed so the run is repeatable.
lda_model_5 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    alpha=alpha_5,
    eta=1/5,
    passes=40,
    iterations=1000,
    chunksize=100,
    eval_every=None,
    random_state=100,
)

In [5]:
# List the 30 top words of each of the 5 topics: one pretty-printed line per
# topic, followed by a spacer line.
for topic_id, word_weights in lda_model_5.show_topics(
        num_topics=5, num_words=30, formatted=False, log=False):
    top_words = ', '.join(word for word, _weight in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: play, google_assistant, google, app, use, list, song, music, '
 'spotify, add, like, ask, link, assistant, note, good, service, news, get, '
 'show, work, way, shopping_list, video, want, keep, open, playlist, help, say')
 
('Topic 1: phone, turn, device, light, screen, google_assistant, tv, language, '
 'assistant, speaker, set, work, command, use, google_home, connect, change, '
 'home, control, way, english, switch, respond, chromecast, unlock, setting, '
 'want, bluetooth, room, volume')
 
('Topic 2: set, routine, time, assistant, command, tell, alarm, want, start, '
 'say, way, work, action, open, go, use, read, get, minute, day, trigger, '
 'need, calendar, give, app, make, weather, ask, timer, like')
 
('Topic 3: google_assistant, get, phone, work, google, assistant, reminder, '
 'issue, problem, update, fix, happen, use, try, android, still, go, setting, '
 'pixel, google_app, notification, app, location, see, new, button, show, '
 'back, stop, seem')
 
('Topic 4: go

In [6]:
# Interactive pyLDAvis view of the K = 5 model. sort_topics=False keeps the
# model's own topic order; R=30 terms are passed for the term bar chart.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_5,
    corpus,
    id2word,
    sort_topics=False,
    R=30,
)
vis

  default_term_info = default_term_info.sort_values(


In [7]:
# Export the K = 5 visualization as a standalone HTML page.
# save_html writes to the given path, so create the output directory first;
# on a fresh checkout the write would otherwise fail with FileNotFoundError.
os.makedirs('LDAvis', exist_ok=True)
pyLDAvis.save_html(vis, 'LDAvis/lda_model_5.html')

# K = 10

In [8]:
# Train the K = 10 model. alpha is the best value found during tuning and
# eta is fixed at 1/K; random_state pins the seed so the run is repeatable.
lda_model_10 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    alpha=alpha_10,
    eta=1/10,
    passes=40,
    iterations=1000,
    chunksize=100,
    eval_every=None,
    random_state=100,
)

In [9]:
# List the 30 top words of each of the 10 topics: one pretty-printed line per
# topic, followed by a spacer line.
for topic_id, word_weights in lda_model_10.show_topics(
        num_topics=10, num_words=30, formatted=False, log=False):
    top_words = ', '.join(word for word, _weight in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: play, google_assistant, ask, music, spotify, get, news, question, '
 'location, video, drive, listen, issue, youtube, say, work, speaker, use, '
 'close, different, launch, live, like, audio, home, play_music, '
 'youtube_music, correct, medium, simply')
 
('Topic 1: device, change, language, tv, phone, home, google_home, english, '
 'assistant, google_assistant, connect, voice, chromecast, support, speak, '
 'setting, control, different, set, try, work, switch, available, cast, '
 'understand, think, use, google_home_mini, respond, able')
 
('Topic 2: set, routine, command, start, time, alarm, tell, want, say, action, '
 'trigger, assistant, weather, need, go, way, get, give, make, use, stop, day, '
 'work, create, morning, specific, dont_want, podcast, certain, end')
 
('Topic 3: problem, issue, phone, fix, try, update, google, google_assistant, '
 'assistant, happen, setting, still, google_app, work, android, button, pixel, '
 'get, samsung, stop, back, go, galaxy, headph

In [10]:
# Interactive pyLDAvis view of the K = 10 model. sort_topics=False keeps the
# model's own topic order; R=30 terms are passed for the term bar chart.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_10,
    corpus,
    id2word,
    sort_topics=False,
    R=30,
)
vis

  default_term_info = default_term_info.sort_values(


In [11]:
# Export the K = 10 visualization as a standalone HTML page.
# save_html writes to the given path, so create the output directory first;
# on a fresh checkout the write would otherwise fail with FileNotFoundError.
os.makedirs('LDAvis', exist_ok=True)
pyLDAvis.save_html(vis, 'LDAvis/lda_model_10.html')

# K = 20

In [12]:
# Train the K = 20 model. alpha is the best value found during tuning and
# eta is fixed at 1/K; random_state pins the seed so the run is repeatable.
lda_model_20 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha=alpha_20,
    eta=1/20,
    passes=40,
    iterations=1000,
    chunksize=100,
    eval_every=None,
    random_state=100,
)

In [13]:
# List the 30 top words of each of the 20 topics: one pretty-printed line per
# topic, followed by a spacer line.
for topic_id, word_weights in lda_model_20.show_topics(
        num_topics=20, num_words=30, formatted=False, log=False):
    top_words = ', '.join(word for word, _weight in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: show, news, google_assistant, ask, question, never, page, '
 'different, get, google, notice, yet, work, say, sure, live, simply, link, '
 'weird, random, im_use, whole, fail, choose, source, cant_get, top, feedback, '
 'ask_google, multiple')
 
('Topic 1: change, voice, language, assistant, setting, english, speak, set, '
 'google_assistant, voice_match, support, think, understand, mine, different, '
 'available, spanish, train, option, later, german, recognize, uk, dutch, '
 'find, accent, work, phone, normal, english_us')
 
('Topic 2: routine, start, set, want, get, weather, go, say, specific, tell, '
 'need, action, assistant, morning, make, seem, run, give, find, end, command, '
 'step, exle, figure, daily, way, certain, cant_find, assistant_settings, '
 'create')
 
('Topic 3: happen, fix, problem, stop, assistant, still, try, back, old, '
 'month, get, google, come, pop, google_assistant, recently, week, new, bug, '
 'anyone_know, ive_tried, seem, work, button, maybe, 

In [14]:
# Interactive pyLDAvis view of the K = 20 model. sort_topics=False keeps the
# model's own topic order; R=30 terms are passed for the term bar chart.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_20,
    corpus,
    id2word,
    sort_topics=False,
    R=30,
)
vis

  default_term_info = default_term_info.sort_values(


In [15]:
# Export the K = 20 visualization as a standalone HTML page.
# save_html writes to the given path, so create the output directory first;
# on a fresh checkout the write would otherwise fail with FileNotFoundError.
os.makedirs('LDAvis', exist_ok=True)
pyLDAvis.save_html(vis, 'LDAvis/lda_model_20.html')

# K = 30

In [16]:
# Train the K = 30 model. alpha is the best value found during tuning and
# eta is fixed at 1/K; random_state pins the seed so the run is repeatable.
lda_model_30 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,
    alpha=alpha_30,
    eta=1/30,
    passes=40,
    iterations=1000,
    chunksize=100,
    eval_every=None,
    random_state=100,
)

In [17]:
# List the 30 top words of each of the 30 topics: one pretty-printed line per
# topic, followed by a spacer line.
for topic_id, word_weights in lda_model_30.show_topics(
        num_topics=30, num_words=30, formatted=False, log=False):
    top_words = ', '.join(word for word, _weight in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: play, youtube_music, ask, google_assistant, music, spotify, '
 'speaker, netflix, try, help, live, service, google_play_music, issue, '
 'artist, youtube, ask_google, perhaps, alexa, radio, request, stream, '
 'playing_music, medium, want, random, reply, play_song, tip, say')
 
('Topic 1: language, english, phone, change, set, assistant, google_assistant, '
 'speak, support, spanish, german, dutch, available, answer, '
 'google_home_device, english_us, work, voice_model, retrain_voice, '
 'new_google_assistant, understand, assistant_doesnt, talk, think, device, '
 'want, setting, try, way, switch')
 
('Topic 2: ga, play_music, music, hub, get, suggestion, work, nest_hub, '
 'google_assistant, select, want, display, use, play, iphone, '
 'use_google_assistant, google_nest, like, find, option, specify, '
 'play_playlist, command, pay, pro, say, simply, device, say_hey, cant_get')
 
('Topic 3: hey_google, try, work, google, fix, listen, ok_google, mic, use, '
 'respond, word, p

In [18]:
# Interactive pyLDAvis view of the K = 30 model.
# Pass the same options as the K = 5 / 10 / 20 cells. Without
# sort_topics=False pyLDAvis reorders topics by prevalence, so the topic
# numbers in the view would no longer match the ids printed from the model.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_30, corpus, id2word, R=30, sort_topics=False)
vis

  default_term_info = default_term_info.sort_values(


In [19]:
# Export the K = 30 visualization as a standalone HTML page.
# save_html writes to the given path, so create the output directory first;
# on a fresh checkout the write would otherwise fail with FileNotFoundError.
os.makedirs('LDAvis', exist_ok=True)
pyLDAvis.save_html(vis, 'LDAvis/lda_model_30.html')

In [20]:
# NOTE(review): `datapath` is not used by any visible cell; the import is
# kept only in case code outside this view relies on it.
from gensim.test.utils import datapath

# Save model to disk.
# The models are written below "model/", so make sure that directory exists;
# on a fresh checkout the first save would otherwise fail.
os.makedirs("model", exist_ok=True)
lda_model_5.save("model/lda_model_5")
lda_model_10.save("model/lda_model_10")
lda_model_20.save("model/lda_model_20")
lda_model_30.save("model/lda_model_30")

# Finding the dominant topic in each document

In [21]:
# Reload the K = 5 model from disk; it is the model used for the
# dominant-topic analysis below.
lda = gensim.models.LdaMulticore.load("model/lda_model_5")

# Source documents; the 'content' column is read by the cells below.
df = pd.read_csv('googleassistant_merged.csv', encoding="utf-8")

In [22]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Build a table with the dominant LDA topic of every document.

    Parameters
    ----------
    ldamodel : trained LDA model
        Must support ``ldamodel[corpus]``, ``show_topic(topic_id)`` and the
        ``per_word_topics`` attribute.
    corpus : iterable of bag-of-words documents, optional
        When omitted, the notebook-level ``corpus`` is looked up at call time.
    texts : sequence or pandas.Series, optional
        Original texts, one per document. When omitted, the notebook-level
        ``df['content']`` is looked up at call time.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic' (float), 'Perc_Contribution' (rounded to four
        decimals) and 'Topic_Keywords', followed by the ``texts`` column.
        There is one row per document in ``corpus``; a document for which the
        model returns no topic gets NaN / None so rows stay aligned with
        ``texts``.

    Raises
    ------
    TypeError
        If ``ldamodel`` is not supplied.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() needs a trained LDA model for `ldamodel`")
    # Resolve the defaults at call time rather than at definition time, so a
    # reloaded `corpus` / `df` is picked up and the function can be defined
    # before those globals exist.
    if corpus is None:
        corpus = globals()['corpus']
    if texts is None:
        texts = globals()['df']['content']

    keywords_by_topic = {}  # topic id -> "word, word, ..." (computed once per topic)
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if len(row) == 0:
            # Keep a placeholder so later rows still line up with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue

        # Dominant topic = highest probability; ties keep the first entry.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _prob in ldamodel.show_topic(topic_num)
            )
        # The topic id is stored as float so the column can also hold NaN
        # placeholders; exported values therefore look like 0.0, 3.0, ...
        records.append((float(topic_num),
                        round(float(prop_topic), 4),
                        keywords_by_topic[topic_num]))

    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

# Dominant topic, its contribution and keywords for every document, with the
# document text attached as the last column.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda,
    corpus=corpus,
    texts=df['content'],
)

# Format: reset_index() adds the row position as a leading column, which is
# then labelled 'Document_No' together with friendlier names for the rest.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Content',
]
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Content
0,0,0.0,0.6028,"play, google_assistant, google, app, use, list...",What is Google Assistant and how does it work?...
1,1,3.0,0.2647,"google_assistant, get, phone, work, google, as...","What is Google Assistant, how does it work and..."
2,2,3.0,0.2372,"google_assistant, get, phone, work, google, as...",Allo Easter Egg (All your base) ['nan']
3,3,3.0,0.4343,"google_assistant, get, phone, work, google, as...",Google Assistant in Nexus Will google assistan...
4,4,0.0,0.2,"play, google_assistant, google, app, use, list...",Offical Site ['nan']
5,5,4.0,0.3623,"google, voice, say, call, google_assistant, tr...","What can I say, it’s a Labrador with super pow..."
6,6,0.0,0.3837,"play, google_assistant, google, app, use, list...",The big bet by Google for the next ten years i...
7,7,3.0,0.4026,"google_assistant, get, phone, work, google, as...",Why I Switched To Assistant... And Back Again!...
8,8,3.0,0.3092,"google_assistant, get, phone, work, google, as...",Enable Google Assistant on Android 6.0 Marshma...
9,9,3.0,0.3092,"google_assistant, get, phone, work, google, as...",How To Enable Google Assistant on Android 7.0 ...


In [23]:
# Write the per-document dominant-topic table to CSV without the index.
df_dominant_topic.to_csv(
    "3_df_dominant_topic.csv",
    index=False,
    encoding='utf-8',
)

In [24]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

# For each dominant topic keep the single document with the highest topic
# contribution. The per-topic rows are collected first and concatenated once,
# instead of growing a DataFrame with concat inside the loop.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
sent_topics_sorteddf_mallet = pd.concat(
    [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
     for _, grp in sent_topics_outdf_grpd],
    axis=0,
).reset_index(drop=True)  # Reset Index

# Format
# NOTE(review): the last column is labelled "Link" although the displayed
# output shows document text; the label is kept so the exported CSV header
# does not change.
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Link"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Link
0,0.0,0.8906,"play, google_assistant, google, app, use, list, song, music, spotify, add",Why can't Google play a particular song? I have been trying a few times to get my Google home to...
1,1.0,0.8128,"phone, turn, device, light, screen, google_assistant, tv, language, assistant, speaker",Grouping devices within a room? With Apple HomeKit and Alexa I’m able to group devices together ...
2,2.0,0.8396,"set, routine, time, assistant, command, tell, alarm, want, start, say",Cancelling alarms I have been playing with Google assistant routines and I am unable to figure o...
3,3.0,0.9158,"google_assistant, get, phone, work, google, assistant, reminder, issue, problem, update","""Ok Google"" Model Update causing Google App to crash So, a few hours ago, my phone received this..."
4,4.0,0.884,"google, voice, say, call, google_assistant, try, ga, ask, like, make","Voice to text name spelling My wife's name is Mackenzie. Whenever I use voice to text, her name ..."


In [25]:
# Write the most representative document per topic to CSV without the index.
sent_topics_sorteddf_mallet.to_csv(
    "4_sent_topics_sorteddf_mallet.csv",
    index=False,
    encoding='utf-8',
)