In [1]:
import glob
import os
import re
import sys
from pprint import pprint

import numpy as np
import pandas as pd
import tqdm
from cleantext import clean

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Visualize the topics
# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Suppress every DeprecationWarning.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Configure logging: timestamped records, ERROR level and above only.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)



In [2]:
# Load the three inputs used below: the LDA tuning results table, the
# serialized Matrix Market corpus, and the saved gensim dictionary.
tuning = pd.read_csv("2_lda_tuning_results.csv", encoding="utf-8")
corpus = corpora.MmCorpus("corpus_dict/corpus")
id2word = corpora.Dictionary.load("corpus_dict/dict")

In [4]:
def _best_alpha(results, num_topics):
    """Return the ``Alpha`` of the highest-``Coherence`` row for ``num_topics``.

    Parameters
    ----------
    results : pandas.DataFrame
        Tuning table with ``Topics``, ``Alpha`` and ``Coherence`` columns.
    num_topics : int
        Value of ``Topics`` to select.

    Returns
    -------
    float or str
        The winning alpha. Values that parse as numbers are returned as
        ``float``; anything else (e.g. a label) is returned unchanged.

    Raises
    ------
    ValueError
        If the table has no row for ``num_topics``.
    """
    subset = results.loc[results["Topics"] == num_topics]
    if subset.empty:
        raise ValueError("no tuning results for Topics == {}".format(num_topics))
    alpha = subset.loc[subset["Coherence"].idxmax(), "Alpha"]
    # NOTE(review): if the Alpha column mixes numbers with labels such as
    # 'symmetric', pandas reads the whole column back from CSV as strings and
    # gensim rejects a numeric string like '0.31'. Numeric values are therefore
    # converted to float here - confirm against the tuning notebook.
    try:
        return float(alpha)
    except (TypeError, ValueError):
        return alpha


# Per-K slices of the tuning table (kept for inspection).
tuning_5 = tuning.loc[tuning["Topics"] == 5]
tuning_10 = tuning.loc[tuning["Topics"] == 10]
tuning_20 = tuning.loc[tuning["Topics"] == 20]
tuning_30 = tuning.loc[tuning["Topics"] == 30]

# Alpha of the most coherent tuning run for each number of topics.
alpha_5 = _best_alpha(tuning, 5)
alpha_10 = _best_alpha(tuning, 10)
alpha_20 = _best_alpha(tuning, 20)
alpha_30 = _best_alpha(tuning, 30)

# K = 5

In [5]:
# Fit the 5-topic LDA model. alpha is the value selected from the tuning
# table; eta is set to 1/K.
lda_model_5 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    alpha=alpha_5,
    eta=1 / 5,
    passes=40,
    iterations=1000,
    chunksize=100,
    random_state=100,  # fixed seed
    eval_every=None,
)

In [6]:
# Print the 30 top words of each of the 5 topics, one topic per paragraph.
for idx, topic in lda_model_5.show_topics(
    num_topics=5, num_words=30, log=False, formatted=False
):
    pprint('Topic {}: {}'.format(idx, ', '.join(term for term, _ in topic)))
    print(" ")

('Topic 0: siri, get, shortcut, like, apple, ask, give, time, question, make, '
 'try, answer, thing, response, start, people, actually, google, reminder, '
 'work, app, timer, year, know, see, good, today, use, lot, want')
 
('Topic 1: siri, phone, turn, use, hey_siri, work, screen, apple, listen, '
 'make, sound, homepod, need, command, iphone, light, watch, device, happen, '
 'home, unlock, control, activate, go, car, thing, set, way, audio, like')
 
('Topic 2: siri, play, song, app, like, add, ask_siri, music, apple_music, '
 'list, way, playlist, tell, ask, spotify, want, library, come, create, '
 'calendar, go, help, search, note, find, able, link, apple, share, open')
 
('Topic 3: siri, say, call, name, set, alarm, use, contact, text, go, time, '
 'want, ask, message, read, try, word, send, way, speak, right, pm, think, '
 'wife, english, maybe, sure, mean, language, know')
 
('Topic 4: siri, iphone, voice, ios, work, issue, change, get, find, problem, '
 'use, phone, update, tr

In [7]:
# Build the pyLDAvis view of the 5-topic model and display it inline.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_5,
    corpus,
    id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [8]:
# Create the output folder if needed so the export also works on a fresh checkout.
os.makedirs('LDAvis', exist_ok=True)
# Write the visualization currently held in `vis` to a standalone HTML file.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_5.html')

# K = 10

In [9]:
# Fit the 10-topic LDA model. alpha is the value selected from the tuning
# table; eta is set to 1/K.
lda_model_10 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    alpha=alpha_10,
    eta=1 / 10,
    passes=40,
    iterations=1000,
    chunksize=100,
    random_state=100,  # fixed seed
    eval_every=None,
)

In [10]:
# Print the 30 top words of each of the 10 topics, one topic per paragraph.
for idx, topic in lda_model_10.show_topics(
    num_topics=10, num_words=30, log=False, formatted=False
):
    pprint('Topic {}: {}'.format(idx, ', '.join(term for term, _ in topic)))
    print(" ")

('Topic 0: app, shortcut, work, use, set, try, light, homekit, home, weather, '
 'timer, run, command, make, start, siri, use_siri, support, way, exle, '
 'number, siri_shortcut, get, leave, first, ask, action, easy, simple, phrase')
 
('Topic 1: siri, apple, apple_music, give, make, people, library, thing, like, '
 'year, google, actually, assistant, bad, know, alexa, user, release, lot, '
 'good, write, sorry, big, need, always, want, first, version, use, well')
 
('Topic 2: add, ask_siri, time, help, siri, calendar, post, way, reminder, '
 'able, tell, love, hour, create, thank, please, take, note, link, account, '
 'show, understand, remind, next, event, apple_watch, come, read, apple, app')
 
('Topic 3: siri, say, name, contact, text, speak, call, language, want, word, '
 'english, message, setting, way, go, wife, know, dictation, sure, edit, '
 'change, option, type, try, hey, siri_doesnt, pronounce, think, use, ok')
 
('Topic 4: ask, music, hey_siri, question, answer, find, siri

In [11]:
# Build the pyLDAvis view of the 10-topic model and display it inline.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_10,
    corpus,
    id2word,
    R=30,
    sort_topics=False,
)
vis

  default_term_info = default_term_info.sort_values(


In [12]:
# Create the output folder if needed so the export also works on a fresh checkout.
os.makedirs('LDAvis', exist_ok=True)
# Write the visualization currently held in `vis` to a standalone HTML file.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_10.html')

# K = 20

In [13]:
# Fit the 20-topic LDA model. alpha is the value selected from the tuning
# table; eta is set to 1/K.
lda_model_20 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    alpha=alpha_20,
    eta=1 / 20,
    passes=40,
    iterations=1000,
    chunksize=100,
    random_state=100,  # fixed seed
    eval_every=None,
)

In [14]:
# Print the 30 top words of each of the 20 topics, one topic per paragraph.
for idx, topic in lda_model_20.show_topics(
    num_topics=20, num_words=30, log=False, formatted=False
):
    pprint('Topic {}: {}'.format(idx, ', '.join(term for term, _ in topic)))
    print(" ")

('Topic 0: app, timer, weather, shortcut, work, start, use, next, watch, '
 'siri_shortcut, minute, create, yes, make, like, let, feature, face, '
 'use_siri, stop, set, phrase, action, mean, custom, hour, shortcuts, able, '
 'want, shortcuts_app')
 
('Topic 1: siri, phone, homepod, say, like, happen, turn, try, get, command, '
 'go, use, thing, light, issue, hey_siri, make, ask, start, talk, time, since, '
 'think, apple, fix, first, watch, work, really, know')
 
('Topic 2: apple, siri, homekit, link, siris, like, alexa, please, assistant, '
 'release, app, cool, know, integrate, home, ios, source, well, amazon, life, '
 'delete, question, kid, thank, future, offer, say_hey, google, random, device')
 
('Topic 3: name, siri, say, like, call, contact, video, hear, get, good, '
 'pronounce, try, lol, right, add, text, little, know, go, thing, info, setup, '
 'hello, process, pronunciation, respond, bit, find, sound, happen')
 
('Topic 4: hey_siri, voice, iphone, work, phone, issue, featu

In [15]:
# Build the pyLDAvis view of the 20-topic model and display it inline.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    lda_model_20,
    corpus,
    id2word,
    R=30,
    sort_topics=False,
)  # mds='tsne' was tried here and left disabled
vis

  default_term_info = default_term_info.sort_values(


In [16]:
# Create the output folder if needed so the export also works on a fresh checkout.
os.makedirs('LDAvis', exist_ok=True)
# Write the visualization currently held in `vis` to a standalone HTML file.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_20.html')

# K = 30

In [17]:
# Fit the 30-topic LDA model. alpha is the value selected from the tuning
# table; eta is set to 1/K.
lda_model_30 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,
    alpha=alpha_30,
    eta=1 / 30,
    passes=40,
    iterations=1000,
    chunksize=100,
    random_state=100,  # fixed seed
    eval_every=None,
)

In [18]:
# Print the 30 top words of each of the 30 topics, one topic per paragraph.
for idx, topic in lda_model_30.show_topics(
    num_topics=30, num_words=30, log=False, formatted=False
):
    pprint('Topic {}: {}'.format(idx, ', '.join(term for term, _ in topic)))
    print(" ")

('Topic 0: turn, light, get, weather, homepod, home, today, week, like, ask, '
 'hey_siri, control, try, morning, business, car, use, simply, connection, '
 'online, shortcuts_app, exle, device, know, go, experience, thing, '
 'play_music, mobile, joke')
 
('Topic 1: siri, say, apple, make, work, get, thing, phone, happen, need, try, '
 'homepod, use, like, see, think, ask, time, always, really, big, talk, '
 'response, since, want, go, google, way, first, day')
 
('Topic 2: read, answer, question, ask_siri, good, ask, text, lol, funny, '
 'message, make_siri, delete, able, say, pretty, source, get_siri, think, day, '
 'tip, get, siri_suggestion, voice_control, friend, want, announce, laugh, '
 'carplay, learn, like')
 
('Topic 3: say, siri, word, command, see, like, know, homekit, tap, time, '
 'work, setup, give, pick, either, turn, try, screen, action, scene, good, '
 'restart, look, able, set, thank, hello, hard, icon, error')
 
('Topic 4: iphone, issue, hey_siri, work, phone, prob

In [19]:
# Build the pyLDAvis view of the 30-topic model and display it inline.
pyLDAvis.enable_notebook()
# R and sort_topics are passed explicitly, exactly as in the K=5/10/20 cells.
# Without sort_topics=False pyLDAvis reorders the topics, so the numbering in
# the view would no longer follow the order printed by show_topics().
vis = gensimvis.prepare(lda_model_30, corpus, id2word, R=30, sort_topics=False) #mds='tsne'
vis

  default_term_info = default_term_info.sort_values(


In [20]:
# Create the output folder if needed so the export also works on a fresh checkout.
os.makedirs('LDAvis', exist_ok=True)
# Write the visualization currently held in `vis` to a standalone HTML file.
pyLDAvis.save_html(vis, 'LDAvis/lda_model_30.html')

In [21]:
from gensim.test.utils import datapath  # NOTE(review): not used in the visible cells
# Save model to disk.
# Create the target folder first: saving into a missing directory fails,
# which breaks a run on a fresh checkout.
os.makedirs("model", exist_ok=True)
lda_model_5.save("model/lda_model_5")
lda_model_10.save("model/lda_model_10")
lda_model_20.save("model/lda_model_20")
lda_model_30.save("model/lda_model_30")

# Finding the dominant topic in each document

In [23]:
# Reload the saved 5-topic model and read the merged documents table.
lda = gensim.models.LdaMulticore.load("model/lda_model_5")
df = pd.read_csv("siri_merged.csv", encoding="utf-8")

In [24]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Build a per-document table of the dominant LDA topic.

    For every document in ``corpus`` the topic with the highest probability is
    picked and reported with that probability and the topic's keywords;
    ``texts`` is attached as the trailing column(s).

    Parameters
    ----------
    ldamodel : LDA model
        Must support ``ldamodel[corpus]`` and ``show_topic(topic_id)`` and
        expose ``per_word_topics``. Required despite the ``None`` default.
    corpus : iterable of documents, optional
        When omitted, the notebook-level ``corpus`` is looked up at call time.
    texts : sequence or pandas.Series, optional
        Original documents. When omitted, the notebook-level ``df['content']``
        is looked up at call time. It is joined to the topic table by index
        label, so it should carry the default 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (float), ``Perc_Contribution`` (float,
        rounded to 4 decimals) and ``Topic_Keywords``, followed by ``texts``.
        A document for which the model yields no topic gets NaN/None in the
        three topic columns so that rows stay aligned with ``texts``.

    Raises
    ------
    TypeError
        If ``ldamodel`` is None.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires an ldamodel")
    # Late-bound defaults: resolve the notebook globals when the function is
    # called instead of freezing whatever they held when this cell was run.
    if corpus is None:
        corpus = globals()["corpus"]
    if texts is None:
        texts = globals()["df"]["content"]

    keyword_cache = {}  # topic id -> "kw1, kw2, ..." (computed once per topic)
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if len(row) == 0:
            # Keep a placeholder so later rows still line up with `texts`.
            records.append((float("nan"), float("nan"), None))
            continue
        # max() returns the first maximal entry, the same element that a
        # descending stable sort would place first.
        topic_num, prop_topic = max(row, key=lambda x: x[1])
        topic_num = int(topic_num)
        if topic_num not in keyword_cache:
            wp = ldamodel.show_topic(topic_num)
            keyword_cache[topic_num] = ", ".join([word for word, prop in wp])
        records.append((float(topic_num), round(float(prop_topic), 4), keyword_cache[topic_num]))

    # Build the frame in one go: DataFrame.append no longer exists in
    # pandas >= 2.0 and appending row by row is quadratic.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )
    # Topic ids are stored as floats (0.0, 1.0, ...), matching the output the
    # notebook recorded and leaving room for NaN placeholders.
    sent_topics_df = sent_topics_df.astype({'Dominant_Topic': float, 'Perc_Contribution': float})

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Dominant topic, its share and its keywords for every document, computed
# with the reloaded 5-topic model.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda,
    corpus=corpus,
    texts=df['content'],
)

# Format: expose the row index as a document number and relabel the columns.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Content',
]
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Content
0,0,0.0,0.4097,"siri, get, shortcut, like, apple, ask, give, t...",Apple is working on an AI system that wipes th...
1,1,0.0,0.4229,"siri, get, shortcut, like, apple, ask, give, t...","I asked my 5S Siri to flip a coin for me, had ..."
2,2,0.0,0.2493,"siri, get, shortcut, like, apple, ask, give, t...",If Apple's latest commercial was honest... ['...
3,3,3.0,0.2158,"siri, say, call, name, set, alarm, use, contac...",Worldwide exclusive interview with Siri ['nan']
4,4,3.0,0.3462,"siri, say, call, name, set, alarm, use, contac...","Siri responds to AT&amp;T ""Hostess with the Mo..."
5,5,0.0,0.2,"siri, get, shortcut, like, apple, ask, give, t...",Asked Siri What's up? ['nan']
6,6,4.0,0.2641,"siri, iphone, voice, ios, work, issue, change,...","She's probably a huge fan of Michael Bay, too...."
7,7,3.0,0.3327,"siri, say, call, name, set, alarm, use, contac...",Who are you voting for in the 2016 U.S. Electi...
8,8,0.0,0.3224,"siri, get, shortcut, like, apple, ask, give, t...","Apple's Siri calls ambulance for baby [""Well ..."
9,9,0.0,0.3557,"siri, get, shortcut, like, apple, ask, give, t...",3 ways Apple needs to improve Siri ['I hope A...


In [25]:
# Export the per-document dominant-topic table as UTF-8 CSV, without the index column.
df_dominant_topic.to_csv("3_df_dominant_topic.csv",encoding = 'utf-8',index=False) 

In [26]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

# For every dominant topic keep the single document with the highest
# contribution. The per-topic rows are collected in a list and concatenated
# once, instead of growing a DataFrame with pd.concat inside the loop.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
sent_topics_sorteddf_mallet = pd.concat(
    [
        grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
        for _, grp in sent_topics_outdf_grpd
    ],
    axis=0,
).reset_index(drop=True)  # renumber rows 0..n-1 without mutating in place

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Link"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Link
0,0.0,0.9496,"siri, get, shortcut, like, apple, ask, give, time, question, make","#GiveSiriAFace Let’s ask apple to give siri face on her 10th birthday \n\nAs you may, or may no..."
1,1.0,0.911,"siri, phone, turn, use, hey_siri, work, screen, apple, listen, make",Siri no longer making a sound when activated? I have a tech question in regards to Siri no longe...
2,2.0,0.9392,"siri, play, song, app, like, add, ask_siri, music, apple_music, list","Is there a way to have Siri read my Outlook events? I use Outlook for my business, so all of my ..."
3,3.0,0.9093,"siri, say, call, name, set, alarm, use, contact, text, go","Siri got way dumber about alarms a couple of days ago Used to be, if it was 2 PM, I could say, ""..."
4,4.0,0.8647,"siri, iphone, voice, ios, work, issue, change, get, find, problem","Best device for using ""Hey Siri"" in home Hello,\n\nI've been waiting for a while now to get my h..."


In [27]:
# Export the one-row-per-topic table as UTF-8 CSV, without the index column.
sent_topics_sorteddf_mallet.to_csv("4_sent_topics_sorteddf_mallet.csv", encoding = 'utf-8',index=False) 