In [1]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Visualize the topics
# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
# Hyper-parameter search results written by an earlier tuning step.
tuning = pd.read_csv(
    '2_lda_tuning_results.csv',
    encoding="utf-8",
)

# Bag-of-words corpus (Matrix Market format) and its matching dictionary.
corpus = corpora.MmCorpus("corpus_dict/corpus")
id2word = corpora.Dictionary.load("corpus_dict/dict")

In [4]:
def _best_alpha(tuning_k):
    """Return the ``Alpha`` of the row with the highest ``Coherence``.

    Parameters
    ----------
    tuning_k : pandas.DataFrame
        Tuning results already filtered to a single number of topics.

    Returns
    -------
    float or str
        The alpha as a float when the stored value is numeric (or a
        numeric-looking string), otherwise the stored value unchanged.

    Raises
    ------
    ValueError
        If ``tuning_k`` has no rows.
    """
    if tuning_k.empty:
        raise ValueError("no tuning results for the requested number of topics")
    best = tuning_k.loc[tuning_k['Coherence'].idxmax(), 'Alpha']
    # If the Alpha column mixes numbers with text labels, pandas reads every
    # value back from CSV as a string; turn numeric-looking values into floats
    # so they reach the model as numbers, and leave text labels untouched.
    try:
        return float(best)
    except (TypeError, ValueError):
        return best


# Tuning rows for each candidate number of topics.
tuning_5 = tuning.loc[(tuning["Topics"] == 5)]
tuning_10 = tuning.loc[(tuning['Topics'] == 10)]
tuning_20 = tuning.loc[(tuning['Topics'] == 20)]
tuning_30 = tuning.loc[(tuning['Topics'] == 30)]

# Best-coherence alpha for each number of topics.
alpha_5 = _best_alpha(tuning_5)
alpha_10 = _best_alpha(tuning_10)
alpha_20 = _best_alpha(tuning_20)
alpha_30 = _best_alpha(tuning_30)

# K = 5

In [5]:
# Train the 5-topic model; alpha comes from the tuning results, eta is 1/K.
lda_params_5 = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    chunksize=100,
    passes=40,
    iterations=1000,
    alpha=alpha_5,
    eta=1/5,
    eval_every=None,
)
lda_model_5 = gensim.models.LdaMulticore(**lda_params_5)

In [6]:
# Print the 30 highest-weighted words of each of the 5 topics.
topics_5 = lda_model_5.show_topics(num_topics=5, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_5:
    top_words = ', '.join(entry[0] for entry in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: siri, homepod, turn, ask, say, set, light, use, time, like, home, '
 'get, shortcut, want, phone, tell, homekit, voice, way, device, work, room, '
 'hey_siri, iphone, command, call, thing, make, alarm, name')
 
('Topic 1: homepod, play, airplay, music, use, apple_music, iphone, apple_tv, '
 'phone, song, app, work, spotify, audio, play_music, stream, control, way, '
 'able, mac, select, device, show, want, playlist, speaker, connect, video, '
 'possible, like')
 
('Topic 2: issue, homepod, work, update, problem, fix, try, home_app, wifi, '
 'happen, iphone, connect, set, phone, reset, get, device, network, router, '
 'go, seem, time, say, setting, turn, still, see, back, use, homepod_mini')
 
('Topic 3: homepod, sound, speaker, tv, use, pair, stereo_pair, room, mini, '
 'get, minis, apple_tv, well, stereo, think, setup, music, like, audio, '
 'homepods, good, go, bass, want, make, put, set, hear, living_room, much')
 
('Topic 4: apple, get, homepod, homepod_mini, buy, go, th

In [7]:
# Build the interactive pyLDAvis panel for the 5-topic model and show it inline.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    topic_model=lda_model_5,
    corpus=corpus,
    dictionary=id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [8]:
# Write the panel built in the previous cell to a standalone HTML file.
lda_vis_5_path = 'LDAvis/lda_model_5.html'
pyLDAvis.save_html(vis, lda_vis_5_path)

# K = 10

In [9]:
# Train the 10-topic model; alpha comes from the tuning results, eta is 1/K.
lda_params_10 = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=40,
    iterations=1000,
    alpha=alpha_10,
    eta=1/10,
    eval_every=None,
)
lda_model_10 = gensim.models.LdaMulticore(**lda_params_10)

In [10]:
# Print the 30 highest-weighted words of each of the 10 topics.
topics_10 = lda_model_10.show_topics(num_topics=10, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_10:
    top_words = ', '.join(entry[0] for entry in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: siri, homepod, turn, light, say, room, ask, time, hey_siri, '
 'respond, tell, volume, command, answer, way, call, name, get, like, talk, '
 'hear, want, speak, voice, living_room, thing, change, request, use, scene')
 
('Topic 1: homepod, iphone, phone, airplay, music, spotify, control, '
 'apple_music, device, app, stream, use, handoff, way, play_music, ios, tap, '
 'show, play, go, select, want, open, music_app, able, transfer, work, switch, '
 'ipad, button')
 
('Topic 2: issue, update, homepod, fix, problem, happen, try, work, seem, '
 'reset, still, time, back, since, go, restart, bug, say, start, ios, '
 'sometimes, thing, beta, notice, home_app, see, remove, stop, apple, second')
 
('Topic 3: sound, homepod, room, stereo_pair, speaker, pair, bass, music, put, '
 'think, stereo, hear, like, good, well, place, listen, love, get, make, much, '
 'wall, go, volume, bedroom, move, difference, really, stand, use')
 
('Topic 4: play, song, music, shortcut, apple_music, like,

In [11]:
# Build the interactive pyLDAvis panel for the 10-topic model and show it inline.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    topic_model=lda_model_10,
    corpus=corpus,
    dictionary=id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [12]:
# Write the panel built in the previous cell to a standalone HTML file.
lda_vis_10_path = 'LDAvis/lda_model_10.html'
pyLDAvis.save_html(vis, lda_vis_10_path)

# K = 20

In [13]:
# Train the 20-topic model; alpha comes from the tuning results, eta is 1/K.
lda_params_20 = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    chunksize=100,
    passes=40,
    iterations=1000,
    alpha=alpha_20,
    eta=1/20,
    eval_every=None,
)
lda_model_20 = gensim.models.LdaMulticore(**lda_params_20)

In [14]:
# Print the 30 highest-weighted words of each of the 20 topics.
topics_20 = lda_model_20.show_topics(num_topics=20, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_20:
    top_words = ', '.join(entry[0] for entry in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: airplay, apple_music, app, homepod, spotify, support, able, stream, '
 'use, music, want, service, allow, apple, way, possible, switch, source, '
 'need, device, via_airplay, mean, use_airplay, pandora, ability, reason, '
 'actually, iphone_ipad, open, exle')
 
('Topic 1: homepod, iphone, phone, control, music, handoff, select, tap, show, '
 'pause, device, hold, button, music_app, open, transfer, ipad, airplay, play, '
 'work, hand, touch, control_center, continue, play_music, instead, sometimes, '
 'go, press, way')
 
('Topic 2: time, homepod, go, call, stop, make, way, alarm, day, start, use, '
 'leave, automation, wake, morning, yes, hour, sleep, minute, watch, timer, '
 'tell, night, set, airpod, think, get, like, possible, automatically')
 
('Topic 3: sound, listen, love, music, bass, good, like, hear, much, really, '
 'homepod, think, great, difference, get, well, make, amazing, feel, '
 'sound_quality, compare, try, definitely, test, clear, go, bit, much_better, '
 '

In [15]:
# Build the interactive pyLDAvis panel for the 20-topic model and show it inline.
# (mds='tsne' was tried as an alternative layout and is left disabled.)
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    topic_model=lda_model_20,
    corpus=corpus,
    dictionary=id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [16]:
# Write the panel built in the previous cell to a standalone HTML file.
lda_vis_20_path = 'LDAvis/lda_model_20.html'
pyLDAvis.save_html(vis, lda_vis_20_path)

# K = 30

In [17]:
# Train the 30-topic model; alpha comes from the tuning results, eta is 1/K.
lda_params_30 = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,
    random_state=100,
    chunksize=100,
    passes=40,
    iterations=1000,
    alpha=alpha_30,
    eta=1/30,
    eval_every=None,
)
lda_model_30 = gensim.models.LdaMulticore(**lda_params_30)

In [18]:
# Print the 30 highest-weighted words of each of the 30 topics.
topics_30 = lda_model_30.show_topics(num_topics=30, num_words=30, log=False, formatted=False)
for topic_id, word_weights in topics_30:
    top_words = ', '.join(entry[0] for entry in word_weights)
    pprint('Topic {}: {}'.format(topic_id, top_words))
    print(" ")

('Topic 0: homepod, issue, play, problem, sometimes, fix, airplay, music, '
 'pause, speaker, play_music, bug, airplaye, seem, restart, stop, music_app, '
 'show, happen, playing_music, audio, default, second, playing, go, start, '
 'macbook, whenever, press, hi')
 
('Topic 1: apple, homepod, transfer, amazon, make, need, next, mean, product, '
 'new, come, bring, battery, company, line, feature, standard, release, take, '
 'display, door, open, see, interesting, customer, really, allow, introduce, '
 'design, datum')
 
('Topic 2: call, alarm, set, morning, time, go, hour, homepod, way, day, '
 'sleep, make, kid, night, use, read, get, ring, tell, hit, finish, charge, '
 'stop, bed, recognise, minute, last, min, phone, ui')
 
('Topic 3: sound, volume, hear, loud, bass, listen, make, difference, low, '
 'level, test, music, noise, know, turn, audio, homepod, think, way, change, '
 'notice, single_homepod, microphone, neighbor, try, increase, night, play, '
 'say, much')
 
('Topic 4: tim

In [19]:
# Build the interactive pyLDAvis panel for the 30-topic model and show it inline.
# R=30 and sort_topics=False match the K=5/10/20 cells: 30 terms per topic, and
# topics kept in the model's own order rather than re-sorted by pyLDAvis.
# NOTE(review): this changes the topic numbering in the exported HTML compared
# with the previous default-sorted output; confirm that is wanted.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_30, corpus, id2word, R=30, sort_topics=False) #mds='tsne' 
vis

  default_term_info = default_term_info.sort_values(


In [20]:
# Write the panel built in the previous cell to a standalone HTML file.
lda_vis_30_path = 'LDAvis/lda_model_30.html'
pyLDAvis.save_html(vis, lda_vis_30_path)

In [21]:
from gensim.test.utils import datapath

# Save every trained model to disk, one file prefix per number of topics.
for trained_model, model_path in (
    (lda_model_5, "model/lda_model_5"),
    (lda_model_10, "model/lda_model_10"),
    (lda_model_20, "model/lda_model_20"),
    (lda_model_30, "model/lda_model_30"),
):
    trained_model.save(model_path)

# Finding the dominant topic in each document

In [6]:
# Reload the saved 10-topic model and the merged source table it is applied to.
lda = gensim.models.LdaMulticore.load("model/lda_model_10")
df = pd.read_csv(
    'homepod_merged.csv',
    encoding="utf-8",
)

In [7]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : trained LDA model
        Must support ``ldamodel[corpus]``, ``per_word_topics`` and
        ``show_topic(topic_id)``. Required.
    corpus : iterable, optional
        Corpus to score. When omitted, the notebook-level ``corpus`` is looked
        up at call time (not at definition time, so defining this function no
        longer requires ``corpus`` to exist already).
    texts : sequence or pandas.Series, optional
        Per-document values appended as the last column. When omitted, the
        notebook-level ``df['content']`` is looked up at call time.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (float), ``Perc_Contribution`` (share of the
        dominant topic rounded to 4 decimals) and ``Topic_Keywords``
        (comma-separated words from ``show_topic``), followed by the ``texts``
        column, joined on the index.

    Raises
    ------
    TypeError
        If ``ldamodel`` is not provided.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained `ldamodel`")
    if corpus is None:
        corpus = globals()["corpus"]
    if texts is None:
        texts = globals()["df"]["content"]

    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep a placeholder row so later documents stay aligned with `texts`.
            records.append((float("nan"), float("nan"), None))
            continue

        # Highest-probability topic; on ties the first one listed wins, which
        # is what a stable descending sort followed by taking item 0 gives.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        # float() first so the rounded value is a plain Python float.
        records.append(
            (float(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    ).astype({'Dominant_Topic': 'float64', 'Perc_Contribution': 'float64'})

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

# Dominant topic per document, with each document's publish date attached.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda,
    corpus=corpus,
    texts=df['publish_date'],
)

# Turn the index into a document number and give the columns display names.
# NOTE(review): the last column is labelled 'Content' but holds publish_date.
dominant_topic_columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Content']
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = dominant_topic_columns
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Content
0,0,5.0,0.2465,"get, buy, apple, order, look, go, new, homepod...",2017-06-06 03:13:54
1,1,4.0,0.1417,"play, song, music, shortcut, apple_music, like...",2017-06-06 03:13:58
2,2,6.0,0.416,"apple, mini, homepod_mini, homepod, minis, spe...",2017-06-06 03:27:39
3,3,2.0,0.1447,"issue, update, homepod, fix, problem, happen, ...",2017-06-06 03:32:30
4,4,7.0,0.2336,"set, homepod, home, work, home_app, setting, t...",2017-06-06 19:13:18
5,5,1.0,0.1693,"homepod, iphone, phone, airplay, music, spotif...",2017-06-06 19:14:49
6,6,9.0,0.1684,"apple_tv, homepod, tv, use, audio, airplay, sp...",2017-06-06 20:58:36
7,7,6.0,0.5202,"apple, mini, homepod_mini, homepod, minis, spe...",2017-06-07 02:51:16
8,8,1.0,0.2589,"homepod, iphone, phone, airplay, music, spotif...",2017-06-07 06:47:27
9,9,7.0,0.3989,"set, homepod, home, work, home_app, setting, t...",2017-06-07 22:40:37


In [8]:
# Export the per-document dominant-topic table without the index column.
df_dominant_topic.to_csv(
    "5_df_dominant_topic.csv",
    index=False,
    encoding='utf-8',
)

In [25]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Link"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Link
0,0.0,0.8889,"siri, homepod, turn, ask, say, set, light, use, time, like",HomeKit commands are aware of what room HomePod is in and let you say “turn on the lights in her...
1,1.0,0.8813,"homepod, play, airplay, music, use, apple_music, iphone, apple_tv, phone, song","If you Airplay a song from Apple Music to the HomePod, does the HomePod play that song directly ..."
2,2.0,0.9004,"issue, homepod, work, update, problem, fix, try, home_app, wifi, happen",homepod mini personal requests or anything iPhone related not working. Everytime I try to use my...
3,3.0,0.9278,"homepod, sound, speaker, tv, use, pair, stereo_pair, room, mini, get",Two HomePods for TV Viewing I currently have an ok Vizio soundbar with a subwoofer and two rear ...
4,4.0,0.9379,"apple, get, homepod, homepod_mini, buy, go, think, like, see, come",HomePod mini ordering from Apple-shipping estimates Hi all-\n\nOrdered 4 minis for my nephews as...


In [26]:
# Export the most representative row per topic without the index column.
sent_topics_sorteddf_mallet.to_csv(
    "4_sent_topics_sorteddf_mallet.csv",
    index=False,
    encoding='utf-8',
)