In [1]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['https_www','seems_like','do','not','imgur','tkg','https','http','could','www','com','ever','doesnt_seem',
                  'xxxx','else','would','also','ea','&amp','#x200B','oh','etc','yeah','nan','however','even','dont_know','sa',
                  "looks_like",'especially','may','sounds_like'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
# LoadDataset
df=pd.read_csv('googleassistant_merged.csv')
print(df.shape)
df.head(1)

(3021, 12)


Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,5447ao,What is Google Assistant and how does it work?,"Recently, Google launched a product Google Ass...",https://www.reddit.com/r/googleassistant/comme...,TricksNDeals,2,2016-09-23 19:18:25,0,/r/googleassistant/comments/5447ao/what_is_goo...,,['nan'],What is Google Assistant and how does it work?...


In [3]:
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def preprocess_tweet(row):
    text = row['content']
    text = text.replace('r/','')
    text = p.clean(text)
    text = clean(text,     
                 fix_unicode=True,              # fix various unicode errors
                 to_ascii=True,                 # transliterate to closest ASCII representation
                 lower=True,                    # lowercase text
                 no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                 no_urls=True,                  # replace all URLs with a special token
                 no_emails=True,                # replace all email addresses with a special token
                 no_phone_numbers=True,         # replace all phone numbers with a special token
                 no_numbers=True,               # replace all numbers with a special token
                 no_digits=True,                # replace all digits with a special token
                 no_currency_symbols=True,      # replace all currency symbols with a special token
                 no_punct=True,                 # remove punctuations
                 lang="en",                     # set to 'de' for German special handling
                 replace_with_punct="",          # instead of removing punctuations you may replace them
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="",
                 replace_with_currency_symbol=""
                )
    text = text.replace('amp','')
    text = text.replace('nan','')
    return text

df['content'] = df.apply(preprocess_tweet, axis=1)
df

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,5447ao,What is Google Assistant and how does it work?,"Recently, Google launched a product Google Ass...",https://www.reddit.com/r/googleassistant/comme...,TricksNDeals,2,2016-09-23 19:18:25,0,/r/googleassistant/comments/5447ao/what_is_goo...,,['nan'],what is google assistant and how does it work ...
1,5448oa,"What is Google Assistant, how does it work and...",,https://www.reddit.com/r/googleassistant/comme...,TricksNDeals,2,2016-09-23 19:30:43,0,/r/googleassistant/comments/5448oa/what_is_goo...,,['nan'],what is google assistant how does it work and ...
2,54larn,Allo Easter Egg (All your base),,https://www.reddit.com/r/googleassistant/comme...,[deleted],1,2016-09-27 00:00:31,0,/r/googleassistant/comments/54larn/allo_easter...,,['nan'],allo easter egg all your base
3,55vthv,Google Assistant in Nexus,Will google assistant(Like the one showed toda...,https://www.reddit.com/r/googleassistant/comme...,pra_van,2,2016-10-05 05:06:52,1,/r/googleassistant/comments/55vthv/google_assi...,,"['It will not, not in 7.1']",google assistant in nexus will google assistan...
4,55wfv5,Offical Site,,https://www.reddit.com/r/googleassistant/comme...,YePitch,1,2016-10-05 07:14:59,0,/r/googleassistant/comments/55wfv5/offical_site/,,['nan'],offical site
...,...,...,...,...,...,...,...,...,...,...,...,...
3016,no1u1w,What? I just really don't know what to say,,https://www.reddit.com/r/googleassistant/comme...,Debris_Ninja_Fighter,1,2021-05-30 10:41:11,8,/r/googleassistant/comments/no1u1w/what_i_just...,Bug,['https://i.imgur.com/1veAlMl.jpg This is just...,what i just really dont know what to say this ...
3017,no9xnj,Is there any way to stop Assistant from tellin...,Getting a bit narked off with Assistant now th...,https://www.reddit.com/r/googleassistant/comme...,ErTnEc,1,2021-05-30 20:13:24,5,/r/googleassistant/comments/no9xnj/is_there_an...,Tech Support,"[""Are you on iOS? Because I'm betting Apple Mu...",is there any way to stop assistant from tellin...
3018,nodi7h,Can I use Google Assistant to play music using...,"Hola people,\n\nWhenever I'm cycling, I don't ...",https://www.reddit.com/r/googleassistant/comme...,An-Onymous-Name,1,2021-05-30 23:27:17,1,/r/googleassistant/comments/nodi7h/can_i_use_g...,,['I had no success with Audify but Pulsar work...,can i use google assistant to play music using...
3019,noogbk,Does Google assistant have its own data Or if ...,,https://www.reddit.com/r/googleassistant/comme...,Sleepingtide,1,2021-05-31 08:34:33,0,/r/googleassistant/comments/noogbk/does_google...,Question,['nan'],does google assistant have its own data or if ...


In [4]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub(r'http\S+', '', sent) # remove http
        sent = re.sub(r'https\S+', '', sent) # remove https
        sent = re.sub('<[^>]+>', '', sent) # remove HTML tags
        sent = re.sub('<[^<]+?>', '', sent)
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub(r'[^\w\s]','',sent) # remove punctuations
        sent = gensim.utils.simple_preprocess(str(sent), min_len=2, deacc=True) 
        
        yield(sent)  

# # Convert to list
data = df.content.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['what', 'is', 'google', 'assistant', 'and', 'how', 'does', 'it', 'work', 'recently', 'google', 'launched', 'product', 'google', 'assistant', 'that', 'has', 'the', 'caliber', 'of', 'taking', 'over', 'the', 'tech', 'market', 'google', 'has', 'planned', 'to', 'go', 'face', 'to', 'face', 'with', 'apples', 'siri', 'and', 'amazons', 'alexa', 'due', 'to', 'the', 'fear', 'of', 'losing', 'the', 'market', 'of', 'artificial', 'intelligenceai', 'google', 'has', 'come', 'up', 'with', 'an', 'innovative', 'and', 'user', 'friendly', 'product', 'known', 'as', 'google', 'assistant', 'it', 'can', 'be', 'taken', 'as', 'an', 'upgraded', 'version', 'of', 'google', 'now', 'google', 'now', 'can', 'do', 'many', 'things', 'for', 'you', 'like', 'searching', 'and', 'the', 'most', 'important', 'feature', 'being', 'the', 'google', 'now', 'cards', 'that', 'shows', 'you', 'useful', 'information', 'about', 'almost', 'everything', 'you', 'need', 'in', 'the', 'annual', 'google', 'io', 'developer', 'conference', 'back'

In [5]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=1,delimiter='_') # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=1, delimiter='_')  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Tag   Meaning                English Examples
# ADJ   adjective              new, good, high, special, big, local
# ADP   adposition             on, of, at, with, by, into, under
# ADV   adverb                 really, already, still, early, now
# CONJ  conjunction            and, or, but, if, while, although
# DET   determiner, article    the, a, some, most, every, no, which
# NOUN  noun                   year, home, costs, time, Africa
# NUM   numeral                twenty-four, fourth, 1991, 14:24
# PRT   particle               at, on, out, over per, that, up, with
# PRON  pronoun                he, their, her, its, my, I, us
# VERB  verb                   is, say, told, given, playing, would
# .     punctuation marks      . , ; !
# X     other                  ersatz, esprit, dunno, gr8, univeristy

# def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
def process_words(texts, stop_words=stop_words, disallowed_postags=['ADP', 'CONJ', 'DET', 'NUM', 'PRT','PRON','.','X']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ not in disallowed_postags])
#         texts_out.append([token.lemma_ for token in doc])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc), max_len=20) if word not in stop_words] for doc in texts_out] 
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!
print(data_ready[:1])

[['google_assistant', 'work', 'recently', 'google', 'launch', 'product', 'google_assistant', 'caliber', 'take', 'tech', 'market', 'google', 'plan', 'go', 'face', 'face', 'apple', 'siri', 'amazon', 'alexa', 'due', 'fear', 'lose', 'market', 'artificial', 'intelligenceai', 'google', 'come', 'innovative', 'user', 'friendly', 'product', 'know', 'google_assistant', 'take', 'upgrade', 'version', 'google', 'google', 'many', 'things_like', 'search', 'important', 'feature', 'google', 'card', 'show', 'useful', 'information', 'almost', 'need', 'annual', 'google_io', 'developer', 'conference', 'google', 'talk', 'google_assistant', 'main', 'motive', 'new', 'virtual', 'assistant', 'see', 'improvement', 'google', 'way', 'conversation', 'experience', 'doubt', 'thing', 'talk', 'till', 'worry', 'discuss', 'detail', 'google_assistant', 'google_assistant', 'new', 'virtual', 'assistant', 'google', 'see', 'improvise', 'version', 'current', 'google_assistant', 'see', 'upgrade', 'extended', 'version', 'google'

In [6]:
from gensim.corpora import Dictionary

# Create Dictionary
id2word = Dictionary(data_ready)
print('Number of unique words in initital documents:', len(id2word))

# Filter out words that occur less than 0.5% documents, or more than 20% of the documents.
id2word.filter_extremes(no_below = (round(((len(data_ready))*0.005))), no_above = 0.99)
print('Number of unique words after removing rare and common words:', len(id2word))

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
print('Number of documents: %d' % len(corpus))

Number of unique words in initital documents: 10770
Number of unique words after removing rare and common words: 1087
Number of documents: 3021


In [7]:
id2word.save("corpus_dict/dict")
corpora.MmCorpus.serialize("corpus_dict/corpus", corpus)

In [8]:
df['tokenz'] = [[(id2word[id]) for id, freq in cp] for cp in corpus[:]]
df.head(1)

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content,tokenz
0,5447ao,What is Google Assistant and how does it work?,"Recently, Google launched a product Google Ass...",https://www.reddit.com/r/googleassistant/comme...,TricksNDeals,2,2016-09-23 19:18:25,0,/r/googleassistant/comments/5447ao/what_is_goo...,,['nan'],what is google assistant and how does it work ...,"[able, access, achieve, act, add, ai, alarm, a..."


In [9]:
df.to_csv('1_df_content_tokenz.csv',index=False, encoding='utf-8')

In [10]:
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('able', 2), ('access', 1), ('achieve', 2), ('act', 1), ('add', 2), ('ai', 6), ('alarm', 2), ('alexa', 4), ('almost', 3), ('already', 3), ('amazing', 1), ('amazon', 1), ('android', 6), ('announce', 1), ('answer', 4), ('app', 4), ('apple', 4), ('application', 1), ('area', 2), ('ask', 2), ('ask_google_assistant', 1), ('ask_question', 1), ('assistant', 11), ('automatically', 1), ('available', 4), ('aware', 1), ('back', 2), ('basic', 1), ('best', 1), ('better', 1), ('big', 1), ('book', 2), ('bot', 2), ('break', 1), ('bring', 1), ('buy', 1), ('call', 3), ('car', 1), ('card', 3), ('case', 3), ('certain', 1), ('chance', 1), ('check', 1), ('click', 1), ('come', 3), ('command', 1), ('common', 1), ('commute', 1), ('company', 2), ('computer', 3), ('connect', 3), ('control', 3), ('conversation', 3), ('current', 1), ('daily', 1), ('date', 1), ('definitely', 1), ('detail', 1), ('developer', 4), ('device', 2), ('different', 1), ('display', 1), ('dont_need', 1), ('door', 1), ('due', 1), ('easily', 1

In [11]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)
#     break

# # print('Number of unique tokens: %d' % len(id2word))
# # print('Number of documents: %d' % len(corpus))

In [12]:
# topWords = {}
# for doc in corpus:
#     for iWord, tf_idf in doc:
#         if iWord not in topWords:
#             topWords[iWord] = 0

#         if tf_idf > topWords[iWord]:
#             topWords[iWord] = tf_idf
# sum = 0
# term = []
# for i, item in enumerate(sorted(topWords.items(), key=lambda x: x[1], reverse=True), 1):
# #     print("%2s: %-13s %s" % (i, id2word[item[0]], item[1]))
#     term.append(id2word[item[0]])
#     sum += item[1]
# #     if i == 100: break
# # print (sum)
# mean = sum/i
# print ('Mean of tf-idf score: ' + str(mean))
# # print (term)

In [13]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# low_value = 0.271734994034526
# low_value_words = []

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     low_value_words += [id for id, value in tfidf[doc] if value < low_value]

In [14]:
# id2word.filter_tokens(bad_ids=low_value_words)
# print('Number of filtered unique tokens: %d' % len(id2word))
# print('Number of documents: %d' % len(corpus))

In [15]:
# corpus = [id2word.doc2bow(doc) for doc in data_ready]
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)

In [16]:
# print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]])

In [17]:
# # Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=10)

# pprint(lda_model.print_topics())

In [18]:
def compute_coherence_values(corpus, dictionary, num_topics, a, b):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics, 
                                                random_state=100,
                                                chunksize=100,
                                                passes=40,
                                                iterations=1000,
                                                alpha=a,
                                                eta=1/num_topics,
                                                eval_every=None)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['tokenz'], dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [19]:
# 1

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_1 = pd.DataFrame(model_results)
model_results_1.to_csv('tuning/00_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [20]:
# 2

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_2 = pd.DataFrame(model_results)
model_results_2.to_csv('tuning/11_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [21]:
# 3

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_3 = pd.DataFrame(model_results)
model_results_3.to_csv('tuning/22_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [22]:
# 4

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_4 = pd.DataFrame(model_results)
model_results_4.to_csv('tuning/33_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [23]:
# 5

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_5 = pd.DataFrame(model_results)
model_results_5.to_csv('tuning/44_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [24]:
# 6

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_6 = pd.DataFrame(model_results)
model_results_6.to_csv('tuning/55_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [25]:
# 7

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_7 = pd.DataFrame(model_results)
model_results_7.to_csv('tuning/66_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [26]:
# 8

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_8 = pd.DataFrame(model_results)
model_results_8.to_csv('tuning/77_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [27]:
# 9

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_9 = pd.DataFrame(model_results)
model_results_9.to_csv('tuning/88_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [28]:
# 10

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_10 = pd.DataFrame(model_results)
model_results_10.to_csv('tuning/99_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.5267041879488744
5
0.05
0.2
0.5255925037964797
5
0.1
0.2
0.5222324395553346
5
0.2
0.2
0.5700077763053673
5
0.5
0.2
0.6070406397399928
5
1
0.2
0.6087348690047772
10
0.01
0.1
0.46230994156252275
10
0.05
0.1
0.47100868112304184
10
0.1
0.1
0.4645652436245665
10
0.2
0.1
0.45068854320280816
10
0.5
0.1
0.5386711743273569
10
1
0.1
0.5905648087936453
20
0.01
0.05
0.37419160074902696
20
0.05
0.05
0.4036052238359666
20
0.1
0.05
0.39344677109201076
20
0.2
0.05
0.4141246498303642
20
0.5
0.05
0.44710526955742225
20
1
0.05
0.4593533697380291
30
0.01
0.03333333333333333
0.4080856618215486
30
0.05
0.03333333333333333
0.41045594125848694
30
0.1
0.03333333333333333
0.4037927010196475
30
0.2
0.03333333333333333
0.38635844119574325
30
0.5
0.03333333333333333
0.3409825088934663
30
1
0.03333333333333333
0.3825148046459521


In [29]:
model_results = pd.concat([model_results_1, model_results_2, model_results_3, model_results_4, model_results_5,
                          model_results_6, model_results_7, model_results_8, model_results_9, model_results_10])
model_results.to_csv("tuning/model_results.csv", index=False, encoding='utf-8-sig')

In [30]:
model_results = model_results.groupby(['Topics', 'Alpha'], as_index=False).mean()
model_results = model_results.sort_values(by='Coherence', ascending=False)
model_results.to_csv('2_lda_tuning_results.csv', index=False)

In [31]:
model_results

Unnamed: 0,Topics,Alpha,Beta,Coherence
5,5,1.0,0.2,0.608735
4,5,0.5,0.2,0.607041
11,10,1.0,0.1,0.590565
3,5,0.2,0.2,0.570008
10,10,0.5,0.1,0.538671
0,5,0.01,0.2,0.526704
1,5,0.05,0.2,0.525593
2,5,0.1,0.2,0.522232
7,10,0.05,0.1,0.471009
8,10,0.1,0.1,0.464565


In [32]:
# priors = pd.pivot_table(model_results,index=["Topics"],columns=["Alpha"],values=['Coherence'])
# priors.columns = range(priors.shape[1])
# priors.columns = ['.01','.05','.1','.2','.5','1']
# df.head(1)
# priors = priors.reset_index()
# priors

In [33]:
# priors.to_csv("siri_lda_tuning_results.csv",index=True, encoding="utf-8")

In [34]:
# import matplotlib.pyplot as plt
# import numpy as np
  
# # dummy data
# x1 = priors['Topics']
# A = priors['.01']
# B = priors['.05']
# C = priors['.1']
# D = priors['.2']
# E = priors['.5']
# F = priors['1']

# # creates two subplots
# # fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (24, 12))

# fig, ax = plt.subplots(2, 3, figsize = (24,12))

# # Plot without grid
# ax[0,0].plot(x1, A, label='0.01', color='tab:blue')
# ax[0,1].plot(x1, B, label='0.05', color='tab:orange')
# ax[0,2].plot(x1, C, label='0.1', color='tab:green')
# ax[1,0].plot(x1, D, label='0.2', color='tab:red')
# ax[1,1].plot(x1, E, label='0.5', color='tab:purple')
# ax[1,2].plot(x1, F, label='1', color='tab:brown')

# ax[0,0].set_xlim(xmin=9)
# ax[0,0].set_title('siri, α=.01, Beta=1/K')
# ax[0,0].set_xlabel('K')
# ax[0,0].set_ylabel('Cv')

# ax[0,1].set_xlim(xmin=9)
# ax[0,1].set_title('siri, α=.05, Beta=1/K')
# ax[0,1].set_xlabel('K')
# ax[0,1].set_ylabel('Cv')

# ax[0,2].set_xlim(xmin=9)
# ax[0,2].set_title('siri, α=.1, Beta=1/K')
# ax[0,2].set_xlabel('K')
# ax[0,2].set_ylabel('Cv')

# ax[1,0].set_xlim(xmin=9)
# ax[1,0].set_title('siri, α=.2, Beta=1/K')
# ax[1,0].set_xlabel('K')
# ax[1,0].set_ylabel('Cv')

# ax[1,1].set_xlim(xmin=9)
# ax[1,1].set_title('siri, α=.5, Beta=1/K')
# ax[1,1].set_xlabel('K')
# ax[1,1].set_ylabel('Cv')

# ax[1,2].set_xlim(xmin=9)
# ax[1,2].set_title('siri, α=1, Beta=1/K')
# ax[1,2].set_xlabel('K')
# ax[1,2].set_ylabel('Cv')

# # fig.tight_layout()
# fig.set_facecolor("w")
# plt.show()