In [23]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['https_www','seems_like','do','not','imgur','tkg','https','http','could','www','com','ever','doesnt_seem',
                  'xxxx','else','would','also','ea','&amp','#x200B','oh','etc','yeah','nan','however','even','dont_know','sa',
                  "looks_like",'especially','may','sounds_like'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [24]:
# LoadDataset
df=pd.read_csv('googlehome_merged.csv')
print(df.shape)
df.head(1)

(59823, 12)


Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,4m130w,"No surprise, Google Home is based on Chromecas...",,https://www.reddit.com/r/googlehome/comments/4...,seekweb,2,2016-06-01 20:53:18,4,/r/googlehome/comments/4m130w/no_surprise_goog...,,"[""Anybody actually use this sub, yet? Guess i...","No surprise, Google Home is based on Chromecas..."


In [25]:
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def preprocess_tweet(row):
    text = row['content']
    text = text.replace('r/','')
    text = p.clean(text)
    text = clean(text,     
                 fix_unicode=True,              # fix various unicode errors
                 to_ascii=True,                 # transliterate to closest ASCII representation
                 lower=True,                    # lowercase text
                 no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                 no_urls=True,                  # replace all URLs with a special token
                 no_emails=True,                # replace all email addresses with a special token
                 no_phone_numbers=True,         # replace all phone numbers with a special token
                 no_numbers=True,               # replace all numbers with a special token
                 no_digits=True,                # replace all digits with a special token
                 no_currency_symbols=True,      # replace all currency symbols with a special token
                 no_punct=True,                 # remove punctuations
                 lang="en",                     # set to 'de' for German special handling
                 replace_with_punct="",          # instead of removing punctuations you may replace them
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="",
                 replace_with_currency_symbol=""
                )
    text = text.replace('amp','')
    text = text.replace('nan','')
    return text

df['content'] = df.apply(preprocess_tweet, axis=1)
df

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,4m130w,"No surprise, Google Home is based on Chromecas...",,https://www.reddit.com/r/googlehome/comments/4...,seekweb,2,2016-06-01 20:53:18,4,/r/googlehome/comments/4m130w/no_surprise_goog...,,"[""Anybody actually use this sub, yet? Guess i...",no surprise google home is based on chromecast...
1,4mrqc7,IoT Is Getting Better With Google Home - An Am...,,https://www.reddit.com/r/googlehome/comments/4...,K2Bsolutions,1,2016-06-06 15:04:12,0,/r/googlehome/comments/4mrqc7/iot_is_getting_b...,,['nan'],iot is getting better with google home an amaz...
2,4zjsqh,Any update? Is thing happening?,,https://www.reddit.com/r/googlehome/comments/4...,[deleted],3,2016-08-26 01:45:35,3,/r/googlehome/comments/4zjsqh/any_update_is_th...,,"[""I've been wondering that myself. I can't fin...",any update is thing happening ive been wonderi...
3,50iyx8,Google is taking dozens of Nest engineers to w...,,https://www.reddit.com/r/googlehome/comments/5...,my_bday_is_tomorrow,2,2016-09-01 03:19:28,0,/r/googlehome/comments/50iyx8/google_is_taking...,,['nan'],google is taking dozens of nest engineers to w...
4,5469ci,Android Police: Google Home will cost $129,,https://www.reddit.com/r/googlehome/comments/5...,chopper_woot_woot,12,2016-09-24 02:41:27,0,/r/googlehome/comments/5469ci/android_police_g...,,['nan'],android police google home will cost
...,...,...,...,...,...,...,...,...,...,...,...,...
59818,np4n38,Just a little fun for the young kids - Use a t...,"I have speakers all throughout my house, so no...",https://www.reddit.com/r/googlehome/comments/n...,mywerkaccount,1,2021-05-31 22:27:27,1,/r/googlehome/comments/np4n38/just_a_little_fu...,Tips,['nan'],just a little fun for the young kids use a tex...
59819,np4xyn,I'm planning to get google nest mini (2nd Gen)...,i just wanna try a smart home speaker. this is...,https://www.reddit.com/r/googlehome/comments/n...,Embarrassed-Ad8685,1,2021-05-31 22:41:19,7,/r/googlehome/comments/np4xyn/im_planning_to_g...,,"['Yep, you should be fine, even if they should...",im planning to get google nest mini nd gen is ...
59820,np4zu4,Workday routine skipping all steps but last one,"For the past couple weeks, my morning routine ...",https://www.reddit.com/r/googlehome/comments/n...,LingonberryNarrow755,1,2021-05-31 22:43:42,0,/r/googlehome/comments/np4zu4/workday_routine_...,,['nan'],workday routine skipping all steps but last on...
59821,np5lzm,Forsage Busd Review 2021: Legit Or Scam? Read ...,,https://www.reddit.com/r/googlehome/comments/n...,Naijabizplug,1,2021-05-31 23:10:57,0,/r/googlehome/comments/np5lzm/forsage_busd_rev...,,['nan'],forsage busd review legit or scam read how it ...


In [26]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub(r'http\S+', '', sent) # remove http
        sent = re.sub(r'https\S+', '', sent) # remove https
        sent = re.sub('<[^>]+>', '', sent) # remove HTML tags
        sent = re.sub('<[^<]+?>', '', sent)
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub(r'[^\w\s]','',sent) # remove punctuations
        sent = gensim.utils.simple_preprocess(str(sent), min_len=2, deacc=True) 
        
        yield(sent)  

# # Convert to list
data = df.content.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['no', 'surprise', 'google', 'home', 'is', 'based', 'on', 'chromecast', 'not', 'android', 'anybody', 'actually', 'use', 'this', 'sub', 'yet', 'guess', 'it', 'kinda', 'hard', 'to', 'when', 'there', 'isnt', 'product', 'yetchromecast', 'is', 'based', 'more', 'on', 'android', 'than', 'on', 'chromeos', 'so', 'this', 'doesnt', 'mean', 'much']]


In [27]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=1,delimiter='_') # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=1, delimiter='_')  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Tag   Meaning                English Examples
# ADJ   adjective              new, good, high, special, big, local
# ADP   adposition             on, of, at, with, by, into, under
# ADV   adverb                 really, already, still, early, now
# CONJ  conjunction            and, or, but, if, while, although
# DET   determiner, article    the, a, some, most, every, no, which
# NOUN  noun                   year, home, costs, time, Africa
# NUM   numeral                twenty-four, fourth, 1991, 14:24
# PRT   particle               at, on, out, over per, that, up, with
# PRON  pronoun                he, their, her, its, my, I, us
# VERB  verb                   is, say, told, given, playing, would
# .     punctuation marks      . , ; !
# X     other                  ersatz, esprit, dunno, gr8, univeristy

# def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
def process_words(texts, stop_words=stop_words, disallowed_postags=['ADP', 'CONJ', 'DET', 'NUM', 'PRT','PRON','.','X']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ not in disallowed_postags])
#         texts_out.append([token.lemma_ for token in doc])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc), max_len=20) if word not in stop_words] for doc in texts_out] 
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!
print(data_ready[:1])

[['surprise', 'google_home', 'base', 'chromecast', 'android', 'actually_use', 'sub', 'yet', 'guess', 'kinda_hard', 'product', 'yetchromecast', 'base', 'android', 'chromeos', 'doesnt_mean', 'much']]


In [28]:
from gensim.corpora import Dictionary

# Create Dictionary
id2word = Dictionary(data_ready)
print('Number of unique words in initital documents:', len(id2word))

# Filter out words that occur less than 0.5% documents, or more than 20% of the documents.
id2word.filter_extremes(no_below = (round(((len(data_ready))*0.005))), no_above = 0.99)
print('Number of unique words after removing rare and common words:', len(id2word))

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
print('Number of documents: %d' % len(corpus))

Number of unique words in initital documents: 152005
Number of unique words after removing rare and common words: 1406
Number of documents: 59823


In [29]:
id2word.save("corpus_dict/dict")
corpora.MmCorpus.serialize("corpus_dict/corpus", corpus)

In [30]:
df['tokenz'] = [[(id2word[id]) for id, freq in cp] for cp in corpus[:]]
df.head(1)

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content,tokenz
0,4m130w,"No surprise, Google Home is based on Chromecas...",,https://www.reddit.com/r/googlehome/comments/4...,seekweb,2,2016-06-01 20:53:18,4,/r/googlehome/comments/4m130w/no_surprise_goog...,,"[""Anybody actually use this sub, yet? Guess i...",no surprise google home is based on chromecast...,"[android, base, chromecast, google_home, guess..."


In [31]:
df.to_csv('1_df_content_tokenz.csv',index=False, encoding='utf-8')

In [32]:
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('android', 2), ('base', 2), ('chromecast', 1), ('google_home', 1), ('guess', 1), ('much', 1), ('product', 1), ('sub', 1), ('surprise', 1), ('yet', 1)]]


In [33]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)
#     break

# # print('Number of unique tokens: %d' % len(id2word))
# # print('Number of documents: %d' % len(corpus))

In [34]:
# topWords = {}
# for doc in corpus:
#     for iWord, tf_idf in doc:
#         if iWord not in topWords:
#             topWords[iWord] = 0

#         if tf_idf > topWords[iWord]:
#             topWords[iWord] = tf_idf
# sum = 0
# term = []
# for i, item in enumerate(sorted(topWords.items(), key=lambda x: x[1], reverse=True), 1):
# #     print("%2s: %-13s %s" % (i, id2word[item[0]], item[1]))
#     term.append(id2word[item[0]])
#     sum += item[1]
# #     if i == 100: break
# # print (sum)
# mean = sum/i
# print ('Mean of tf-idf score: ' + str(mean))
# # print (term)

In [35]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# low_value = 0.271734994034526
# low_value_words = []

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     low_value_words += [id for id, value in tfidf[doc] if value < low_value]

In [36]:
# id2word.filter_tokens(bad_ids=low_value_words)
# print('Number of filtered unique tokens: %d' % len(id2word))
# print('Number of documents: %d' % len(corpus))

In [37]:
# corpus = [id2word.doc2bow(doc) for doc in data_ready]
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)

In [38]:
# print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]])

In [39]:
# # Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=10)

# pprint(lda_model.print_topics())

In [40]:
def compute_coherence_values(corpus, dictionary, num_topics, a, b):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics, 
                                                random_state=100,
                                                chunksize=100,
                                                passes=40,
                                                iterations=1000,
                                                alpha=a,
                                                eta=1/num_topics,
                                                eval_every = None)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['tokenz'], dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [41]:
# 1

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_1 = pd.DataFrame(model_results)
model_results_1.to_csv('tuning/00_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.46215727299492704
5
0.05
0.2
0.475949740884384
5
0.1
0.2
0.4725864403698793
5
0.2
0.2
0.462187508057516
5
0.5
0.2
0.5139876480935635
5
1
0.2
0.5226020993962525
10
0.01
0.1
0.44537656739566345
10
0.05
0.1
0.44249188779580917
10
0.1
0.1
0.42562174744265013
10
0.2
0.1
0.43234725698831583
10
0.5
0.1
0.48420929799217927
10
1
0.1
0.45106929489707104
20
0.01
0.05
0.5002477441153024
20
0.05
0.05
0.515560077269345
20
0.1
0.05
0.5190624611176899
20
0.2
0.05
0.4884948191470663
20
0.5
0.05
0.48577223614656584
20
1
0.05
0.49059211602464076
30
0.01
0.03333333333333333
0.5148438144572455
30
0.05
0.03333333333333333
0.510141542831873
30
0.1
0.03333333333333333
0.5057254134472735
30
0.2
0.03333333333333333
0.48142530321700044
30
0.5
0.03333333333333333
0.4640248946909255
30
1
0.03333333333333333
0.4685723248047855


In [42]:
# 2

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_2 = pd.DataFrame(model_results)
model_results_2.to_csv('tuning/11_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.46215727299492704
5
0.05
0.2
0.475949740884384
5
0.1
0.2
0.4725864403698793
5
0.2
0.2
0.462187508057516
5
0.5
0.2
0.5139876480935635
5
1
0.2
0.5226020993962525
10
0.01
0.1
0.44537656739566345
10
0.05
0.1
0.44249188779580917
10
0.1
0.1
0.42562174744265013
10
0.2
0.1
0.43234725698831583
10
0.5
0.1
0.48420929799217927
10
1
0.1
0.45106929489707104
20
0.01
0.05
0.5002477441153024
20
0.05
0.05
0.515560077269345
20
0.1
0.05
0.5190624611176899
20
0.2
0.05
0.4884948191470663
20
0.5
0.05
0.48577223614656584
20
1
0.05
0.49059211602464076
30
0.01
0.03333333333333333
0.5148438144572455
30
0.05
0.03333333333333333
0.510141542831873
30
0.1
0.03333333333333333
0.5057254134472735
30
0.2
0.03333333333333333
0.48142530321700044
30
0.5
0.03333333333333333
0.4640248946909255
30
1
0.03333333333333333
0.4685723248047855


In [43]:
# 3

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_3 = pd.DataFrame(model_results)
model_results_3.to_csv('tuning/22_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.46215727299492704
5
0.05
0.2
0.475949740884384
5
0.1
0.2
0.4725864403698793
5
0.2
0.2
0.462187508057516
5
0.5
0.2
0.5139876480935635
5
1
0.2
0.5226020993962525
10
0.01
0.1
0.44537656739566345
10
0.05
0.1
0.44249188779580917
10
0.1
0.1
0.42562174744265013
10
0.2
0.1
0.43234725698831583
10
0.5
0.1
0.48420929799217927
10
1
0.1
0.45106929489707104
20
0.01
0.05
0.5002477441153024
20
0.05
0.05
0.515560077269345
20
0.1
0.05
0.5190624611176899
20
0.2
0.05
0.4884948191470663
20
0.5
0.05
0.48577223614656584
20
1
0.05
0.49059211602464076
30
0.01
0.03333333333333333
0.5148438144572455
30
0.05
0.03333333333333333
0.510141542831873
30
0.1
0.03333333333333333
0.5057254134472735
30
0.2
0.03333333333333333
0.48142530321700044
30
0.5
0.03333333333333333
0.4640248946909255
30
1
0.03333333333333333
0.4685723248047855


In [44]:
# 4

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_4 = pd.DataFrame(model_results)
model_results_4.to_csv('tuning/33_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.46215727299492704
5
0.05
0.2
0.475949740884384
5
0.1
0.2
0.4725864403698793
5
0.2
0.2
0.462187508057516
5
0.5
0.2
0.5139876480935635
5
1
0.2
0.5226020993962525
10
0.01
0.1
0.44537656739566345
10
0.05
0.1
0.44249188779580917
10
0.1
0.1
0.42562174744265013
10
0.2
0.1
0.43234725698831583
10
0.5
0.1
0.48420929799217927
10
1
0.1
0.45106929489707104
20
0.01
0.05
0.5002477441153024
20
0.05
0.05
0.515560077269345
20
0.1
0.05
0.5190624611176899
20
0.2
0.05
0.4884948191470663
20
0.5
0.05
0.48577223614656584
20
1
0.05
0.49059211602464076
30
0.01
0.03333333333333333
0.5148438144572455
30
0.05
0.03333333333333333
0.510141542831873
30
0.1
0.03333333333333333
0.5057254134472735
30
0.2
0.03333333333333333
0.48142530321700044
30
0.5
0.03333333333333333
0.4640248946909255
30
1
0.03333333333333333
0.4685723248047855


In [45]:
# 5

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_5 = pd.DataFrame(model_results)
model_results_5.to_csv('tuning/44_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.46215727299492704
5
0.05
0.2
0.475949740884384
5
0.1
0.2
0.4725864403698793
5
0.2
0.2
0.462187508057516
5
0.5
0.2
0.5139876480935635
5
1
0.2
0.5226020993962525
10
0.01
0.1
0.44537656739566345
10
0.05
0.1
0.44249188779580917
10
0.1
0.1
0.42562174744265013
10
0.2
0.1
0.43234725698831583
10
0.5
0.1
0.48420929799217927
10
1
0.1
0.45106929489707104
20
0.01
0.05
0.5002477441153024
20
0.05
0.05
0.515560077269345
20
0.1
0.05
0.5190624611176899
20
0.2
0.05
0.4884948191470663
20
0.5
0.05
0.48577223614656584
20
1
0.05
0.49059211602464076
30
0.01
0.03333333333333333
0.5148438144572455
30
0.05
0.03333333333333333
0.510141542831873
30
0.1
0.03333333333333333
0.5057254134472735
30
0.2
0.03333333333333333
0.48142530321700044
30
0.5
0.03333333333333333
0.4640248946909255
30
1
0.03333333333333333
0.4685723248047855


In [46]:
# 6

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_6 = pd.DataFrame(model_results)
model_results_6.to_csv('tuning/55_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.46215727299492704
5
0.05
0.2
0.475949740884384
5
0.1
0.2
0.4725864403698793
5
0.2
0.2
0.462187508057516
5
0.5
0.2
0.5139876480935635
5
1
0.2
0.5226020993962525
10
0.01
0.1
0.44537656739566345
10
0.05
0.1
0.44249188779580917
10
0.1
0.1
0.42562174744265013
10
0.2
0.1
0.43234725698831583
10
0.5
0.1
0.48420929799217927
10
1
0.1
0.45106929489707104
20
0.01
0.05
0.5002477441153024
20
0.05
0.05
0.515560077269345
20
0.1
0.05
0.5190624611176899
20
0.2
0.05
0.4884948191470663
20
0.5
0.05
0.48577223614656584
20
1
0.05
0.49059211602464076
30
0.01
0.03333333333333333
0.5148438144572455
30
0.05
0.03333333333333333
0.510141542831873
30
0.1
0.03333333333333333
0.5057254134472735
30
0.2
0.03333333333333333
0.48142530321700044
30
0.5
0.03333333333333333
0.4640248946909255
30
1
0.03333333333333333
0.4685723248047855


In [47]:
# 7

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_7 = pd.DataFrame(model_results)
model_results_7.to_csv('tuning/66_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.46215727299492704
5
0.05
0.2
0.475949740884384
5
0.1
0.2
0.4725864403698793
5
0.2
0.2
0.462187508057516
5
0.5
0.2
0.5139876480935635
5
1
0.2
0.5226020993962525
10
0.01
0.1
0.44537656739566345
10
0.05
0.1
0.44249188779580917
10
0.1
0.1
0.42562174744265013
10
0.2
0.1
0.43234725698831583
10
0.5
0.1
0.48420929799217927
10
1
0.1
0.45106929489707104
20
0.01
0.05
0.5002477441153024
20
0.05
0.05
0.515560077269345
20
0.1
0.05
0.5190624611176899
20
0.2
0.05
0.4884948191470663
20
0.5
0.05
0.48577223614656584
20
1
0.05
0.49059211602464076
30
0.01
0.03333333333333333
0.5148438144572455
30
0.05
0.03333333333333333
0.510141542831873
30
0.1
0.03333333333333333
0.5057254134472735
30
0.2
0.03333333333333333
0.48142530321700044
30
0.5
0.03333333333333333
0.4640248946909255
30
1
0.03333333333333333
0.4685723248047855


In [48]:
# 8

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_8 = pd.DataFrame(model_results)
model_results_8.to_csv('tuning/77_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.46215727299492704
5
0.05
0.2
0.475949740884384
5
0.1
0.2
0.4725864403698793
5
0.2
0.2
0.462187508057516
5
0.5
0.2
0.5139876480935635
5
1
0.2
0.5226020993962525
10
0.01
0.1
0.44537656739566345
10
0.05
0.1
0.44249188779580917
10
0.1
0.1
0.42562174744265013
10
0.2
0.1
0.43234725698831583
10
0.5
0.1
0.48420929799217927
10
1
0.1
0.45106929489707104
20
0.01
0.05
0.5002477441153024
20
0.05
0.05
0.515560077269345
20
0.1
0.05
0.5190624611176899
20
0.2
0.05
0.4884948191470663
20
0.5
0.05
0.48577223614656584
20
1
0.05
0.49059211602464076
30
0.01
0.03333333333333333
0.5148438144572455
30
0.05
0.03333333333333333
0.510141542831873
30
0.1
0.03333333333333333
0.5057254134472735
30
0.2
0.03333333333333333
0.48142530321700044
30
0.5
0.03333333333333333
0.4640248946909255
30
1
0.03333333333333333
0.4685723248047855


In [None]:
# 9

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_9 = pd.DataFrame(model_results)
model_results_9.to_csv('tuning/88_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.46215727299492704
5
0.05
0.2
0.475949740884384
5
0.1
0.2
0.4725864403698793
5
0.2
0.2
0.462187508057516
5
0.5
0.2
0.5139876480935635
5
1
0.2
0.5226020993962525
10
0.01
0.1
0.44537656739566345
10
0.05
0.1
0.44249188779580917
10
0.1
0.1
0.42562174744265013
10
0.2
0.1
0.43234725698831583
10
0.5
0.1
0.48420929799217927
10
1
0.1
0.45106929489707104
20
0.01
0.05
0.5002477441153024
20
0.05
0.05
0.515560077269345
20
0.1
0.05
0.5190624611176899
20
0.2
0.05
0.4884948191470663
20
0.5
0.05
0.48577223614656584
20
1
0.05
0.49059211602464076
30
0.01
0.03333333333333333
0.5148438144572455
30
0.05
0.03333333333333333
0.510141542831873
30
0.1
0.03333333333333333
0.5057254134472735


In [None]:
# 10

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_10 = pd.DataFrame(model_results)
model_results_10.to_csv('tuning/99_lda_tuning_results.csv', index=False, encoding='utf-8')

In [None]:
model_results = pd.concat([model_results_1, model_results_2, model_results_3, model_results_4, model_results_5,
                          model_results_6, model_results_7, model_results_8, model_results_9, model_results_10])
model_results.to_csv("tuning/model_results.csv", index=False, encoding='utf-8-sig')

In [None]:
model_results = model_results.groupby(['Topics', 'Alpha'], as_index=False).mean()
model_results = model_results.sort_values(by='Coherence', ascending=False)
model_results.to_csv('2_lda_tuning_results.csv', index=False)

In [None]:
model_results

In [None]:
# priors = pd.pivot_table(model_results,index=["Topics"],columns=["Alpha"],values=['Coherence'])
# priors.columns = range(priors.shape[1])
# priors.columns = ['.01','.05','.1','.2','.5','1']
# df.head(1)
# priors = priors.reset_index()
# priors

In [None]:
# priors.to_csv("siri_lda_tuning_results.csv",index=True, encoding="utf-8")

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np
  
# # dummy data
# x1 = priors['Topics']
# A = priors['.01']
# B = priors['.05']
# C = priors['.1']
# D = priors['.2']
# E = priors['.5']
# F = priors['1']

# # creates two subplots
# # fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (24, 12))

# fig, ax = plt.subplots(2, 3, figsize = (24,12))

# # Plot without grid
# ax[0,0].plot(x1, A, label='0.01', color='tab:blue')
# ax[0,1].plot(x1, B, label='0.05', color='tab:orange')
# ax[0,2].plot(x1, C, label='0.1', color='tab:green')
# ax[1,0].plot(x1, D, label='0.2', color='tab:red')
# ax[1,1].plot(x1, E, label='0.5', color='tab:purple')
# ax[1,2].plot(x1, F, label='1', color='tab:brown')

# ax[0,0].set_xlim(xmin=9)
# ax[0,0].set_title('siri, α=.01, Beta=1/K')
# ax[0,0].set_xlabel('K')
# ax[0,0].set_ylabel('Cv')

# ax[0,1].set_xlim(xmin=9)
# ax[0,1].set_title('siri, α=.05, Beta=1/K')
# ax[0,1].set_xlabel('K')
# ax[0,1].set_ylabel('Cv')

# ax[0,2].set_xlim(xmin=9)
# ax[0,2].set_title('siri, α=.1, Beta=1/K')
# ax[0,2].set_xlabel('K')
# ax[0,2].set_ylabel('Cv')

# ax[1,0].set_xlim(xmin=9)
# ax[1,0].set_title('siri, α=.2, Beta=1/K')
# ax[1,0].set_xlabel('K')
# ax[1,0].set_ylabel('Cv')

# ax[1,1].set_xlim(xmin=9)
# ax[1,1].set_title('siri, α=.5, Beta=1/K')
# ax[1,1].set_xlabel('K')
# ax[1,1].set_ylabel('Cv')

# ax[1,2].set_xlim(xmin=9)
# ax[1,2].set_title('siri, α=1, Beta=1/K')
# ax[1,2].set_xlabel('K')
# ax[1,2].set_ylabel('Cv')

# # fig.tight_layout()
# fig.set_facecolor("w")
# plt.show()

# K = 5

In [None]:
lda_model_5 = gensim.models.LdaMulticore(corpus=corpus,
                                         id2word=id2word,
                                         num_topics=5, 
                                         random_state=100,
                                         chunksize=100,
                                         passes=40,
                                         iterations=1000,
                                         alpha=0.10,
                                         eta=1/5,
                                         eval_every=None)

In [None]:
for idx, topic in lda_model_5.show_topics(num_topics=5, num_words=30, log=False, formatted=False):
    pprint('Topic {}: {}'.format(idx, ', '.join([w[0] for w in topic])))
    print(" ")

In [None]:
# Visualize the topics
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_5, corpus, id2word, R=30, sort_topics=False) #mds='tsne' 
vis

In [None]:
pyLDAvis.save_html(vis, 'LDAvis/lda_model_5.html')

# K = 10

In [None]:
lda_model_10 = gensim.models.LdaMulticore(corpus=corpus,
                                          id2word=id2word,
                                          num_topics=10, 
                                          random_state=100,
                                          chunksize=100,
                                          passes=40,
                                          iterations=1000,
                                          alpha=0.05,
                                          eta=1/10,
                                          eval_every=None)

In [None]:
for idx, topic in lda_model_10.show_topics(num_topics=10, num_words=30, log=False, formatted=False):
    pprint('Topic {}: {}'.format(idx, ', '.join([w[0] for w in topic])))
    print(" ")

In [None]:
# Visualize the topics
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_10, corpus, id2word, R=30, sort_topics=False) #mds='tsne' 
vis

In [None]:
pyLDAvis.save_html(vis, 'LDAvis/lda_model_10.html')

# K = 20

In [None]:
lda_model_20 = gensim.models.LdaMulticore(corpus=corpus,
                                          id2word=id2word,
                                          num_topics=20, 
                                          random_state=100,
                                          chunksize=100,
                                          passes=40,
                                          iterations=1000,
                                          alpha=0.01,
                                          eta=1/20,
                                          eval_every=None)

In [None]:
for idx, topic in lda_model_20.show_topics(num_topics=20, num_words=30, log=False, formatted=False):
    pprint('Topic {}: {}'.format(idx, ', '.join([w[0] for w in topic])))
    print(" ")

In [None]:
# Visualize the topics
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_20, corpus, id2word, R=30, sort_topics=False) #mds='tsne' 
vis

In [None]:
pyLDAvis.save_html(vis, 'LDAvis/lda_model_20.html')

# K =  30

In [None]:
lda_model_30 = gensim.models.LdaMulticore(corpus=corpus,
                                          id2word=id2word,
                                          num_topics=30, 
                                          random_state=100,
                                          chunksize=100,
                                          passes=40,
                                          iterations=1000,
                                          alpha=0.1,
                                          eta=1/30,
                                          eval_every=None)

In [None]:
for idx, topic in lda_model_30.show_topics(num_topics=30, num_words=30, log=False, formatted=False):
    pprint('Topic {}: {}'.format(idx, ', '.join([w[0] for w in topic])))
    print(" ")

In [None]:
# Visualize the topics
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_30, corpus, id2word, R=30, sort_topics=False) #mds='tsne' 
vis

In [None]:
pyLDAvis.save_html(vis, 'LDAvis/lda_model_30.html')

In [None]:
from gensim.test.utils import datapath
# Save model to disk.
lda_model_5.save("model/lda_model_5")
lda_model_10.save("model/lda_model_10")
lda_model_20.save("model/lda_model_20")
lda_model_30.save("model/lda_model_30")

# Finding the dominant topic in each document

In [None]:
id2word = corpora.Dictionary.load("corpus_dict/dict")
corpus = corpora.MmCorpus("corpus_dict/corpus")
df=pd.read_csv('googlehome_merged.csv',encoding="utf-8")
lda = gensim.models.ldamodel.LdaModel.load("model/lda_model_5")

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=df['content']):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda, corpus=corpus, texts=df['content'])
# df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Content']
df_dominant_topic.head(10)

In [None]:
df_dominant_topic.to_csv("3_df_dominant_topic.csv", encoding = 'utf-8',index=False) 

In [None]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Link"]

# Show
sent_topics_sorteddf_mallet.head(10)

In [None]:
sent_topics_sorteddf_mallet.to_csv("4_sent_topics_sorteddf_mallet.csv", encoding = 'utf-8') 