In [24]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['https_www','seems_like','do','not','imgur','tkg','https','http','could','www','com','ever','doesnt_seem',
                  'xxxx','else','would','also','ea','&amp','#x200B','oh','etc','yeah','nan','however','even','dont_know','sa',
                  "looks_like",'especially','may','sounds_like'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [25]:
# LoadDataset
df=pd.read_csv('amazonecho_merged.csv')
print(df.shape)
df.head(1)

(33548, 12)


Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,4lvz5u,"I just bought an echo, but what I really want ...",For me the echo is really just an interim piec...,https://www.reddit.com/r/amazonecho/comments/4...,pencock,1,2016-06-01 00:08:22,27,/r/amazonecho/comments/4lvz5u/i_just_bought_an...,,['Not what you want =/= piece of crap. \nThat ...,"I just bought an echo, but what I really want ..."


In [26]:
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def preprocess_tweet(row):
    text = row['content']
    text = text.replace('r/','')
    text = p.clean(text)
    text = clean(text,     
                 fix_unicode=True,              # fix various unicode errors
                 to_ascii=True,                 # transliterate to closest ASCII representation
                 lower=True,                    # lowercase text
                 no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                 no_urls=True,                  # replace all URLs with a special token
                 no_emails=True,                # replace all email addresses with a special token
                 no_phone_numbers=True,         # replace all phone numbers with a special token
                 no_numbers=True,               # replace all numbers with a special token
                 no_digits=True,                # replace all digits with a special token
                 no_currency_symbols=True,      # replace all currency symbols with a special token
                 no_punct=True,                 # remove punctuations
                 lang="en",                     # set to 'de' for German special handling
                 replace_with_punct="",          # instead of removing punctuations you may replace them
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="",
                 replace_with_currency_symbol=""
                )
    text = text.replace('amp','')
    text = text.replace('nan','')
    return text

df['content'] = df.apply(preprocess_tweet, axis=1)
df

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,4lvz5u,"I just bought an echo, but what I really want ...",For me the echo is really just an interim piec...,https://www.reddit.com/r/amazonecho/comments/4...,pencock,1,2016-06-01 00:08:22,27,/r/amazonecho/comments/4lvz5u/i_just_bought_an...,,['Not what you want =/= piece of crap. \nThat ...,i just bought an echo but what i really want i...
1,4lwgw6,Order an Amazon Dot w/ out an Echo link inside,,https://www.reddit.com/r/amazonecho/comments/4...,TheSyntaxEra,1,2016-06-01 01:42:04,0,/r/amazonecho/comments/4lwgw6/order_an_amazon_...,,['nan'],order an amazon dot w out an echo link inside
2,4lwvlu,Alexa getting worse at comprehending me. Anybo...,I'm not sure if this is new or I just noticed ...,https://www.reddit.com/r/amazonecho/comments/4...,ASeriousUser,73,2016-06-01 02:59:31,25,/r/amazonecho/comments/4lwvlu/alexa_getting_wo...,,"['[deleted],I have definitely noticed the same...",alexa getting worse at comprehending me anybod...
3,4lx14a,Alexa on a browser,Alexa can now be accessed via a browser at htt...,https://www.reddit.com/r/amazonecho/comments/4...,layboy,6,2016-06-01 03:29:19,0,/r/amazonecho/comments/4lx14a/alexa_on_a_browser/,,['nan'],alexa on a browser alexa can now be accessed v...
4,4lxok6,More info on Google Home...,,https://www.reddit.com/r/amazonecho/comments/4...,TheSyntaxEra,0,2016-06-01 05:36:07,7,/r/amazonecho/comments/4lxok6/more_info_on_goo...,,['Actually makes a lot of sense and can be a h...,more info on google home actually makes a lot ...
...,...,...,...,...,...,...,...,...,...,...,...,...
33543,np1pv8,Is there a way to hide my photos from amazon p...,,https://www.reddit.com/r/amazonecho/comments/n...,maa112,1,2021-05-31 19:57:14,7,/r/amazonecho/comments/np1pv8/is_there_a_way_t...,Question,"['Yes! In the Amazon photos app, select the pi...",is there a way to hide my photos from amazon p...
33544,np206u,I can't connect to my amazon account.,"When I got my Echo, I created an Amazon accoun...",https://www.reddit.com/r/amazonecho/comments/n...,Loose_Drink,1,2021-05-31 20:13:10,0,/r/amazonecho/comments/np206u/i_cant_connect_t...,,['nan'],i cant connect to my amazon account when i got...
33545,np3lb9,Cruella soundtrack on Alexa?,Can anybody get Alexa to play the Cruela movie...,https://www.reddit.com/r/amazonecho/comments/n...,Hot_Concentrate_2255,1,2021-05-31 21:37:16,1,/r/amazonecho/comments/np3lb9/cruella_soundtra...,Question,['Which subscription level?\n\nEdit: Try askin...,cruella soundtrack on alexa can anybody get al...
33546,np4hmy,Spinning light won’t stop,The light on my dot is spinning constantly blu...,https://www.reddit.com/r/amazonecho/comments/n...,1pornstarmartini,1,2021-05-31 22:20:06,8,/r/amazonecho/comments/np4hmy/spinning_light_w...,Technical Issue,"[""Disable guard mode.,I think a constant spinn...",spinning light wont stop the light on my dot i...


In [27]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub(r'http\S+', '', sent) # remove http
        sent = re.sub(r'https\S+', '', sent) # remove https
        sent = re.sub('<[^>]+>', '', sent) # remove HTML tags
        sent = re.sub('<[^<]+?>', '', sent)
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub(r'[^\w\s]','',sent) # remove punctuations
        sent = gensim.utils.simple_preprocess(str(sent), min_len=2, deacc=True) 
        
        yield(sent)  

# # Convert to list
data = df.content.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['just', 'bought', 'an', 'echo', 'but', 'what', 'really', 'want', 'is', 'dot', 'can', 'use', 'the', 'echo', 'to', 'order', 'dot', 'and', 'then', 'return', 'the', 'echo', 'or', 'are', 'they', 'somehow', 'linked', 'and', 'amazon', 'will', 'disable', 'the', 'dot', 'for', 'me', 'the', 'echo', 'is', 'really', 'just', 'an', 'interim', 'piece', 'of', 'equipment', 'to', 'get', 'used', 'to', 'using', 'alexa', 'have', 'full', 'bluetooth', 'home', 'theater', 'system', 'and', 'of', 'course', 'the', 'echo', 'is', 'piece', 'of', 'crap', 'with', 'no', 'way', 'to', 'connect', 'external', 'speakers', 'to', 'it', 'basically', 'will', 'the', 'dot', 'be', 'linked', 'to', 'my', 'echo', 'and', 'become', 'useless', 'if', 'return', 'the', 'echo', 'not', 'what', 'you', 'want', 'piece', 'of', 'crap', 'that', 'said', 'dont', 'see', 'why', 'you', 'cant', 'do', 'as', 'you', 'suggest', 'order', 'the', 'dot', 'wait', 'for', 'the', 'email', 'confirmation', 'then', 'return', 'the', 'echo', 'or', 'sell', 'it', 'if', '

In [28]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=1) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=1)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Tag   Meaning                English Examples
# ADJ   adjective              new, good, high, special, big, local
# ADP   adposition             on, of, at, with, by, into, under
# ADV   adverb                 really, already, still, early, now
# CONJ  conjunction            and, or, but, if, while, although
# DET   determiner, article    the, a, some, most, every, no, which
# NOUN  noun                   year, home, costs, time, Africa
# NUM   numeral                twenty-four, fourth, 1991, 14:24
# PRT   particle               at, on, out, over per, that, up, with
# PRON  pronoun                he, their, her, its, my, I, us
# VERB  verb                   is, say, told, given, playing, would
# .     punctuation marks      . , ; !
# X     other                  ersatz, esprit, dunno, gr8, univeristy

# def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
def process_words(texts, stop_words=stop_words, disallowed_postags=['ADP', 'CONJ', 'DET', 'NUM', 'PRT','PRON','.','X']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ not in disallowed_postags])
#         texts_out.append([token.lemma_ for token in doc])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc), max_len=20) if word not in stop_words] for doc in texts_out] 
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!
print(data_ready[:1])

[['buy', 'echo', 'really_want', 'dot', 'use', 'echo', 'order', 'dot', 'return', 'echo', 'somehow', 'link', 'amazon', 'disable', 'dot', 'echo', 'really', 'interim', 'piece', 'equipment', 'get', 'used', 'using_alexa', 'full', 'bluetooth', 'home_theater_system', 'course', 'echo', 'piece', 'crap', 'way', 'connect', 'external_speaker', 'basically', 'dot', 'link', 'echo', 'become_useless', 'return', 'echo', 'want', 'piece', 'crap', 'say', 'dont_see', 'suggest', 'order', 'dot', 'wait', 'email', 'confirmation', 'return', 'echo', 'sell', 'easy', 'three_echoe', 'various_times', 'remove', 'never', 'cause', 'issue', 'order', 'dot', 'amazon_app', 'phone', 'using_voice', 'workaround', 'dot', 'arrive', 'echo', 'im_sure', 'ok', 'keep', 'mind', 'shipping', 'date', 'dot', 'pretty', 'far', 'order', 'march', 'order', 'dot', 'return', 'echo', 'planning', 'use', 'echodot', 'primarily', 'listen', 'musicother', 'audio', 'wrong', 'bt', 'home_theater', 'system', 'work', 'pretty_annoye', 'change', 'bt', 'input',

In [29]:
from gensim.corpora import Dictionary

# Create Dictionary
id2word = Dictionary(data_ready)
print('Number of unique words in initital documents:', len(id2word))

# Filter out words that occur less than 0.5% documents, or more than 20% of the documents.
id2word.filter_extremes(no_below = (round(((len(data_ready))*0.005))), no_above = 0.99)
print('Number of unique words after removing rare and common words:', len(id2word))

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
print('Number of documents: %d' % len(corpus))

Number of unique words in initital documents: 112052
Number of unique words after removing rare and common words: 1413
Number of documents: 33548


In [30]:
id2word.save("corpus_dict/dict")
corpora.MmCorpus.serialize("corpus_dict/corpus", corpus)

In [31]:
df['tokenz'] = [[(id2word[id]) for id, freq in cp] for cp in corpus[:]]
df.head(1)

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content,tokenz
0,4lvz5u,"I just bought an echo, but what I really want ...",For me the echo is really just an interim piec...,https://www.reddit.com/r/amazonecho/comments/4...,pencock,1,2016-06-01 00:08:22,27,/r/amazonecho/comments/4lvz5u/i_just_bought_an...,,['Not what you want =/= piece of crap. \nThat ...,i just bought an echo but what i really want i...,"[alexa, always, amazon, arrive, audio, basical..."


In [32]:
df.to_csv('1_df_content_tokenz.csv',index=False, encoding='utf-8')

In [33]:
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('alexa', 1), ('always', 1), ('amazon', 1), ('arrive', 1), ('audio', 1), ('basically', 1), ('bluetooth', 1), ('bt', 2), ('buy', 1), ('cause', 1), ('change', 1), ('connect', 1), ('course', 1), ('crap', 2), ('date', 1), ('disable', 1), ('dont_see', 1), ('dont_use', 1), ('dot', 10), ('easy', 1), ('echo', 12), ('email', 1), ('every_time', 1), ('external_speaker', 1), ('far', 1), ('full', 1), ('get', 1), ('hear', 1), ('im_sure', 1), ('input', 1), ('issue', 1), ('keep', 1), ('like', 1), ('link', 2), ('listen', 1), ('mind', 1), ('much', 1), ('never', 1), ('ok', 1), ('order', 5), ('phone', 1), ('piece', 3), ('pretty', 1), ('really', 1), ('really_want', 1), ('remove', 1), ('response', 1), ('return', 4), ('say', 1), ('sell', 1), ('somehow', 1), ('suggest', 1), ('system', 1), ('use', 2), ('using_alexa', 1), ('wait', 1), ('want', 2), ('way', 1), ('work', 1), ('workaround', 1), ('wrong', 1)]]


In [34]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)
#     break

# # print('Number of unique tokens: %d' % len(id2word))
# # print('Number of documents: %d' % len(corpus))

In [35]:
# topWords = {}
# for doc in corpus:
#     for iWord, tf_idf in doc:
#         if iWord not in topWords:
#             topWords[iWord] = 0

#         if tf_idf > topWords[iWord]:
#             topWords[iWord] = tf_idf
# sum = 0
# term = []
# for i, item in enumerate(sorted(topWords.items(), key=lambda x: x[1], reverse=True), 1):
# #     print("%2s: %-13s %s" % (i, id2word[item[0]], item[1]))
#     term.append(id2word[item[0]])
#     sum += item[1]
# #     if i == 100: break
# # print (sum)
# mean = sum/i
# print ('Mean of tf-idf score: ' + str(mean))
# # print (term)

In [36]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# low_value = 0.271734994034526
# low_value_words = []

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     low_value_words += [id for id, value in tfidf[doc] if value < low_value]

In [37]:
# id2word.filter_tokens(bad_ids=low_value_words)
# print('Number of filtered unique tokens: %d' % len(id2word))
# print('Number of documents: %d' % len(corpus))

In [38]:
# corpus = [id2word.doc2bow(doc) for doc in data_ready]
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)

In [39]:
# print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]])

In [40]:
# # Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=10)

# pprint(lda_model.print_topics())

In [41]:
# id2word = corpora.Dictionary.load("corpus_dict/dict")
# corpus = corpora.MmCorpus("corpus_dict/corpus")
# df=pd.read_csv('1_df_content_tokenz.csv',encoding="utf-8")

In [42]:
# def compute_coherence_values(corpus, dictionary, num_topics, a, b):
    
#     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                                 id2word=id2word,
#                                                 num_topics=num_topics, 
#                                                 random_state=100,
#                                                 chunksize=100,
#                                                 passes=40,
#                                                 iterations=1000,
#                                                 alpha=a,
#                                                 eta=1/num_topics,
#                                                 eval_every=None)
    
#     coherence_model_lda = CoherenceModel(model=lda_model, texts=df['tokenz'], dictionary=id2word, coherence='c_v')
    
#     return coherence_model_lda.get_coherence()

In [43]:
# 1

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_1 = pd.DataFrame(model_results)
model_results_1.to_csv('tuning/00_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [44]:
# 2

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_2 = pd.DataFrame(model_results)
model_results_2.to_csv('tuning/11_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [45]:
# 3

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_3 = pd.DataFrame(model_results)
model_results_3.to_csv('tuning/22_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [46]:
# 4

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_4 = pd.DataFrame(model_results)
model_results_4.to_csv('tuning/33_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [47]:
# 5

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_5 = pd.DataFrame(model_results)
model_results_5.to_csv('tuning/44_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [48]:
# 6

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_6 = pd.DataFrame(model_results)
model_results_6.to_csv('tuning/55_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [49]:
# 7

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_7 = pd.DataFrame(model_results)
model_results_7.to_csv('tuning/66_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [50]:
# 8

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_8 = pd.DataFrame(model_results)
model_results_8.to_csv('tuning/77_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [51]:
# 9

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_9 = pd.DataFrame(model_results)
model_results_9.to_csv('tuning/88_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [53]:
# 10

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_10 = pd.DataFrame(model_results)
model_results_10.to_csv('tuning/99_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5127485847290596
5
0.05
0.2
0.5130582265648347
5
0.1
0.2
0.5141374109721817
5
0.2
0.2
0.5319717505276262
5
0.5
0.2
0.5637860818774417
5
1
0.2
0.5584031823641136
10
0.01
0.1
0.47649098320580335
10
0.05
0.1
0.46415591816227114
10
0.1
0.1
0.46182912337108
10
0.2
0.1
0.4772920123740688
10
0.5
0.1
0.4828764015685888
10
1
0.1
0.51943043069385
20
0.01
0.05
0.48851405072202975
20
0.05
0.05
0.47959092180941243
20
0.1
0.05
0.48789884722068155
20
0.2
0.05
0.4344581627166882
20
0.5
0.05
0.48096529144727523
20
1
0.05
0.5176443685299258
30
0.01
0.03333333333333333
0.45842350247164027
30
0.05
0.03333333333333333
0.44843689774859086
30
0.1
0.03333333333333333
0.45285383533509793
30
0.2
0.03333333333333333
0.4616838054157074
30
0.5
0.03333333333333333
0.4636644084768945
30
1
0.03333333333333333
0.47586757895574405


In [54]:
model_results = pd.concat([model_results_1, model_results_2, model_results_3, model_results_4, model_results_5,
                          model_results_6, model_results_7, model_results_8, model_results_9, model_results_10])
model_results.to_csv("tuning/model_results.csv", index=False, encoding='utf-8-sig')

In [55]:
model_results = model_results.groupby(['Topics', 'Alpha'], as_index=False).mean()
model_results = model_results.sort_values(by='Coherence', ascending=False)
model_results.to_csv('2_lda_tuning_results.csv', index=False)

In [56]:
model_results

Unnamed: 0,Topics,Alpha,Beta,Coherence
4,5,0.5,0.2,0.563786
5,5,1.0,0.2,0.558403
3,5,0.2,0.2,0.531972
11,10,1.0,0.1,0.51943
17,20,1.0,0.05,0.517644
2,5,0.1,0.2,0.514137
1,5,0.05,0.2,0.513058
0,5,0.01,0.2,0.512749
12,20,0.01,0.05,0.488514
14,20,0.1,0.05,0.487899


In [None]:
# priors = pd.pivot_table(model_results,index=["Topics"],columns=["Alpha"],values=['Coherence'])
# priors.columns = range(priors.shape[1])
# priors.columns = ['.01','.05','.1','.2','.5','1']
# df.head(1)
# priors = priors.reset_index()
# priors

In [None]:
# priors.to_csv("siri_lda_tuning_results.csv",index=True, encoding="utf-8")

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np
  
# # dummy data
# x1 = priors['Topics']
# A = priors['.01']
# B = priors['.05']
# C = priors['.1']
# D = priors['.2']
# E = priors['.5']
# F = priors['1']

# # creates two subplots
# # fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (24, 12))

# fig, ax = plt.subplots(2, 3, figsize = (24,12))

# # Plot without grid
# ax[0,0].plot(x1, A, label='0.01', color='tab:blue')
# ax[0,1].plot(x1, B, label='0.05', color='tab:orange')
# ax[0,2].plot(x1, C, label='0.1', color='tab:green')
# ax[1,0].plot(x1, D, label='0.2', color='tab:red')
# ax[1,1].plot(x1, E, label='0.5', color='tab:purple')
# ax[1,2].plot(x1, F, label='1', color='tab:brown')

# ax[0,0].set_xlim(xmin=9)
# ax[0,0].set_title('siri, α=.01, Beta=1/K')
# ax[0,0].set_xlabel('K')
# ax[0,0].set_ylabel('Cv')

# ax[0,1].set_xlim(xmin=9)
# ax[0,1].set_title('siri, α=.05, Beta=1/K')
# ax[0,1].set_xlabel('K')
# ax[0,1].set_ylabel('Cv')

# ax[0,2].set_xlim(xmin=9)
# ax[0,2].set_title('siri, α=.1, Beta=1/K')
# ax[0,2].set_xlabel('K')
# ax[0,2].set_ylabel('Cv')

# ax[1,0].set_xlim(xmin=9)
# ax[1,0].set_title('siri, α=.2, Beta=1/K')
# ax[1,0].set_xlabel('K')
# ax[1,0].set_ylabel('Cv')

# ax[1,1].set_xlim(xmin=9)
# ax[1,1].set_title('siri, α=.5, Beta=1/K')
# ax[1,1].set_xlabel('K')
# ax[1,1].set_ylabel('Cv')

# ax[1,2].set_xlim(xmin=9)
# ax[1,2].set_title('siri, α=1, Beta=1/K')
# ax[1,2].set_xlabel('K')
# ax[1,2].set_ylabel('Cv')

# # fig.tight_layout()
# fig.set_facecolor("w")
# plt.show()