In [1]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Visualize the topics
# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)



In [2]:
# Load the saved gensim dictionary and Matrix Market corpus from corpus_dict/,
# and the LDA tuning results table (its Topics/Coherence/Alpha columns are used below).
id2word = corpora.Dictionary.load("corpus_dict/dict")
corpus = corpora.MmCorpus("corpus_dict/corpus")
tuning=pd.read_csv('2_lda_tuning_results.csv',encoding="utf-8")

In [3]:
def _coerce_alpha(value):
    """Turn a numeric string such as "0.31" into a float; return anything else unchanged.

    The tuning table is read from CSV. If its Alpha column mixes numbers with
    keywords (e.g. "symmetric"), pandas loads the whole column as strings, and a
    numeric string is not a valid alpha for the LDA model. Keyword strings and
    values that are already numeric pass through untouched.
    """
    if isinstance(value, str):
        try:
            return float(value)
        except ValueError:
            return value
    return value


# Tuning rows for each candidate number of topics.
tuning_5 = tuning.loc[tuning["Topics"] == 5]
tuning_10 = tuning.loc[tuning["Topics"] == 10]
tuning_20 = tuning.loc[tuning["Topics"] == 20]
tuning_30 = tuning.loc[tuning["Topics"] == 30]

# For each K, take the Alpha of the row with the highest Coherence.
alpha_5 = _coerce_alpha(tuning_5.loc[tuning_5["Coherence"].idxmax(), "Alpha"])
alpha_10 = _coerce_alpha(tuning_10.loc[tuning_10["Coherence"].idxmax(), "Alpha"])
alpha_20 = _coerce_alpha(tuning_20.loc[tuning_20["Coherence"].idxmax(), "Alpha"])
alpha_30 = _coerce_alpha(tuning_30.loc[tuning_30["Coherence"].idxmax(), "Alpha"])

# K = 5

In [4]:
# Train the 5-topic LDA model with the alpha selected for K=5 and eta = 1/5.
lda_model_5 = gensim.models.LdaMulticore(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=5,
    alpha=alpha_5,
    eta=1/5,
    # training schedule
    chunksize=100,
    passes=40,
    iterations=1000,
    eval_every=None,
    # fixed seed
    random_state=100,
)

In [5]:
# Print the 30 words show_topics returns for each of the 5 topics, one block per topic.
topics_5 = lda_model_5.show_topics(num_topics=5, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_5:
    words = ', '.join(word for word, _weight in word_weights)
    pprint('Topic {}: {}'.format(topic_id, words))
    print(" ")

('Topic 0: play, ask, say, time, google, spotify, music, tell, get, like, '
 'happen, listen, stop, try, alarm, hear, start, song, go, work, command, day, '
 'use, want, respond, sometimes, voice, playlist, thing, way')
 
('Topic 1: phone, app, device, work, issue, try, set, show, home, add, '
 'google_home, link, account, google, problem, go, home_app, see, fix, list, '
 'use, update, change, remove, get, find, say, setting, call, help')
 
('Topic 2: get, google, use, like, make, hub, go, think, good, see, buy, look, '
 'well, come, want, know, thing, alexa, nest_hub, feature, much, love, camera, '
 'display, people, need, year, great, nest, take')
 
('Topic 3: turn, light, routine, use, set, room, control, work, switch, '
 'command, want, way, google_home, name, bedroom, device, change, need, make, '
 'house, bulb, like, run, get, able, living_room, google_assistant, plug, '
 'trigger, say')
 
('Topic 4: speaker, connect, tv, device, use, mini, chromecast, wifi, cast, '
 'google_home

In [6]:
# Enable pyLDAvis notebook rendering and build the visualization for the 5-topic model.
# R=30 is the number of terms shown per topic; sort_topics=False keeps the model's own
# topic order instead of letting pyLDAvis re-order the topics.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_5, corpus, id2word, R=30, sort_topics=False)
vis  # bare last expression so the notebook displays the visualization

  default_term_info = default_term_info.sort_values(


In [7]:
# Write the visualization currently held in `vis` to an HTML file.
# NOTE: `vis` is reassigned for every K, so run this right after the K=5 prepare cell.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_5.html')

# K = 10

In [8]:
# Train the 10-topic LDA model with the alpha selected for K=10 and eta = 1/10.
lda_model_10 = gensim.models.LdaMulticore(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=10,
    alpha=alpha_10,
    eta=1/10,
    # training schedule
    chunksize=100,
    passes=40,
    iterations=1000,
    eval_every=None,
    # fixed seed
    random_state=100,
)

In [9]:
# Print the 30 words show_topics returns for each of the 10 topics, one block per topic.
topics_10 = lda_model_10.show_topics(num_topics=10, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_10:
    words = ', '.join(word for word, _weight in word_weights)
    pprint('Topic {}: {}'.format(topic_id, words))
    print(" ")

('Topic 0: play, spotify, music, song, ask, playlist, play_music, use, '
 'youtube_music, listen, want, youtube, get, video, cast, like, album, work, '
 'news, start, way, podcast, try, radio, google_play, stop, stream, say, '
 'google_home, name')
 
('Topic 1: issue, work, problem, try, fix, device, happen, get, seem, update, '
 'say, google, reset, still, time, go, back, help, fine, google_home, start, '
 'factory_reset, error, reboot, today, notice, thing, see, bug, give')
 
('Topic 2: phone, use, google_assistant, call, list, work, assistant, way, '
 'find, google_home, add, want, reminder, get, able, google, app, ask, make, '
 'iphone, know, read, send, possible, say, try, name, help, android, '
 'notification')
 
('Topic 3: get, hub, buy, use, look, go, want, good, home_hub, like, order, '
 'plug, need, replace, work, display, nest, button, photo, see, thank, find, '
 'deal, think, screen, offer, take, cheap, make, purchase')
 
('Topic 4: connect, tv, chromecast, wifi, use, devic

In [10]:
# Enable pyLDAvis notebook rendering and build the visualization for the 10-topic model.
# R=30 is the number of terms shown per topic; sort_topics=False keeps the model's own
# topic order instead of letting pyLDAvis re-order the topics.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_10, corpus, id2word, R=30, sort_topics=False)
vis  # bare last expression so the notebook displays the visualization

  default_term_info = default_term_info.sort_values(


In [11]:
# Write the visualization currently held in `vis` to an HTML file.
# NOTE: `vis` is reassigned for every K, so run this right after the K=10 prepare cell.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_10.html')

# K = 20

In [None]:
# Train the 20-topic LDA model with the alpha selected for K=20 and eta = 1/20.
lda_model_20 = gensim.models.LdaMulticore(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=20,
    alpha=alpha_20,
    eta=1/20,
    # training schedule
    chunksize=100,
    passes=40,
    iterations=1000,
    eval_every=None,
    # fixed seed
    random_state=100,
)

In [None]:
# Print the 30 words show_topics returns for each of the 20 topics, one block per topic.
topics_20 = lda_model_20.show_topics(num_topics=20, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_20:
    words = ', '.join(word for word, _weight in word_weights)
    pprint('Topic {}: {}'.format(topic_id, words))
    print(" ")

In [None]:
# Enable pyLDAvis notebook rendering and build the visualization for the 20-topic model.
# R=30 is the number of terms shown per topic; sort_topics=False keeps the model's own
# topic order instead of letting pyLDAvis re-order the topics.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_20, corpus, id2word, R=30, sort_topics=False)  # optional layout override: mds='tsne'
vis  # bare last expression so the notebook displays the visualization

In [None]:
# Write the visualization currently held in `vis` to an HTML file.
# NOTE: `vis` is reassigned for every K, so run this right after the K=20 prepare cell.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_20.html')

# K = 30

In [None]:
# Train the 30-topic LDA model with the alpha selected for K=30 and eta = 1/30.
lda_model_30 = gensim.models.LdaMulticore(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and priors
    num_topics=30,
    alpha=alpha_30,
    eta=1/30,
    # training schedule
    chunksize=100,
    passes=40,
    iterations=1000,
    eval_every=None,
    # fixed seed
    random_state=100,
)

In [None]:
# Print the 30 words show_topics returns for each of the 30 topics, one block per topic.
topics_30 = lda_model_30.show_topics(num_topics=30, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_30:
    words = ', '.join(word for word, _weight in word_weights)
    pprint('Topic {}: {}'.format(topic_id, words))
    print(" ")

In [None]:
# Enable pyLDAvis notebook rendering and build the visualization for the 30-topic model.
pyLDAvis.enable_notebook()
# Pass the same options as the K=5/10/20 cells: R=30 terms per topic, and
# sort_topics=False so pyLDAvis keeps the model's own topic order. Without it the
# topics are re-ordered by size and their numbering no longer lines up with the
# indices printed by show_topics.
vis = gensimvis.prepare(lda_model_30, corpus, id2word, R=30, sort_topics=False)  # optional layout override: mds='tsne'
vis  # bare last expression so the notebook displays the visualization

In [None]:
# Write the visualization currently held in `vis` to an HTML file.
# NOTE: `vis` is reassigned for every K, so run this right after the K=30 prepare cell.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_30.html')

In [None]:
from gensim.test.utils import datapath  # imported here but not used in this cell

# Save each trained model to disk under model/.
for trained_model, target_path in (
    (lda_model_5, "model/lda_model_5"),
    (lda_model_10, "model/lda_model_10"),
    (lda_model_20, "model/lda_model_20"),
    (lda_model_30, "model/lda_model_30"),
):
    trained_model.save(target_path)

# Finding the dominant topic in each document

In [None]:
# Read the merged source table (its 'content' column supplies the document texts below)
# and reload the 5-topic model from the path it was saved to.
df=pd.read_csv('googlehome_merged.csv',encoding="utf-8")
lda = gensim.models.LdaMulticore.load("model/lda_model_5")

In [None]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Build a per-document table of dominant topic, its weight and its keywords.

    Parameters
    ----------
    ldamodel : trained topic model
        Must support ``ldamodel[corpus]`` and ``show_topic(topic_id)`` and expose
        a ``per_word_topics`` attribute.
    corpus : iterable of bag-of-words documents, optional
        When omitted, the notebook-level ``corpus`` is looked up at call time.
    texts : sequence or pandas.Series, optional
        Original documents, attached as the last column (aligned on index, as
        ``pd.concat(axis=1)`` does). When omitted, the notebook-level
        ``df['content']`` is looked up at call time.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` and ``Topic_Keywords``
        followed by the text column. There is one row per document in ``corpus``;
        a document for which the model reports no topics gets NaN / NaN / "" so
        that later rows stay aligned with ``texts``.

    Raises
    ------
    ValueError
        If ``ldamodel`` is not provided.
    """
    if ldamodel is None:
        raise ValueError("format_topics_sentences() requires a trained ldamodel")
    # Resolve the defaults at call time rather than at definition time, so a
    # corpus or dataframe reloaded after this cell ran is not silently ignored.
    if corpus is None:
        corpus = globals()["corpus"]
    if texts is None:
        texts = globals()["df"]["content"]

    keywords_by_topic = {}  # topic id -> "word, word, ..." (computed once per topic)
    records = []

    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled each result is a tuple whose first element
        # is the document's (topic, probability) list.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if len(row) == 0:
            # Keep a placeholder so row i still corresponds to document i.
            records.append((float("nan"), float("nan"), ""))
            continue

        # Dominant topic = highest probability; max() returns the first of any
        # ties, the same element a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _prob in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x and
    # growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Label every document with its dominant topic, using the reloaded model in `lda`.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda,
    corpus=corpus,
    texts=df['content'],
)

# Turn the index into a document-number column and apply the presentation column names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Content',
]
df_dominant_topic.head(10)

In [None]:
# Write the per-document dominant-topic table to CSV (UTF-8, without the index column).
df_dominant_topic.to_csv("3_df_dominant_topic.csv",encoding = 'utf-8',index=False) 

In [None]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Link"]

# Show
sent_topics_sorteddf_mallet.head(10)

In [None]:
# Write the one-row-per-topic table of top documents to CSV (UTF-8, without the index column).
sent_topics_sorteddf_mallet.to_csv("4_sent_topics_sorteddf_mallet.csv", encoding = 'utf-8',index=False) 