In [1]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['https_www','seems_like','do','not','imgur','tkg','https','http','could','www','com','ever','doesnt_seem',
                  'xxxx','else','would','also','ea','&amp','#x200B','oh','etc','yeah','nan','however','even','dont_know','sa',
                  "looks_like",'especially','may','sounds_like'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
# LoadDataset
df=pd.read_csv('alexa_merged.csv')
print(df.shape)
df.head(1)

(18580, 12)


Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,4lx0dn,Alexa as an Android app. Who needs the Alexa d...,,https://www.reddit.com/r/alexa/comments/4lx0dn...,layboy,2,2016-06-01 03:25:15,0,/r/alexa/comments/4lx0dn/alexa_as_an_android_a...,,['nan'],Alexa as an Android app. Who needs the Alexa d...


In [3]:
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def preprocess_tweet(row):
    text = row['content']
    text = text.replace('r/','')
    text = p.clean(text)
    text = clean(text,     
                 fix_unicode=True,              # fix various unicode errors
                 to_ascii=True,                 # transliterate to closest ASCII representation
                 lower=True,                    # lowercase text
                 no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                 no_urls=True,                  # replace all URLs with a special token
                 no_emails=True,                # replace all email addresses with a special token
                 no_phone_numbers=True,         # replace all phone numbers with a special token
                 no_numbers=True,               # replace all numbers with a special token
                 no_digits=True,                # replace all digits with a special token
                 no_currency_symbols=True,      # replace all currency symbols with a special token
                 no_punct=True,                 # remove punctuations
                 lang="en",                     # set to 'de' for German special handling
                 replace_with_punct="",          # instead of removing punctuations you may replace them
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="",
                 replace_with_currency_symbol=""
                )
    text = text.replace('amp','')
    text = text.replace('nan','')
    return text

df['content'] = df.apply(preprocess_tweet, axis=1)
df

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,4lx0dn,Alexa as an Android app. Who needs the Alexa d...,,https://www.reddit.com/r/alexa/comments/4lx0dn...,layboy,2,2016-06-01 03:25:15,0,/r/alexa/comments/4lx0dn/alexa_as_an_android_a...,,['nan'],alexa as an android app who needs the alexa de...
1,4m8rd3,Let Alexa transform Pebble Core into the comba...,,https://www.reddit.com/r/alexa/comments/4m8rd3...,nnrR0b0t,4,2016-06-03 02:59:42,0,/r/alexa/comments/4m8rd3/let_alexa_transform_p...,,['nan'],let alexa transform pebble core into the comba...
2,4me8xk,Sample Alexa Custom Skill for BART train times...,,https://www.reddit.com/r/alexa/comments/4me8xk...,[deleted],1,2016-06-04 02:19:44,0,/r/alexa/comments/4me8xk/sample_alexa_custom_s...,,['nan'],sle alexa custom skill for bart train times in...
3,4meeyz,Sample Alexa Custom Skill for SF BART transit ...,,https://www.reddit.com/r/alexa/comments/4meeyz...,simonprickett,3,2016-06-04 02:55:29,0,/r/alexa/comments/4meeyz/sample_alexa_custom_s...,,['nan'],sle alexa custom skill for sf bart transit in ...
4,4mghpo,Alexa on pebble watches,,https://www.reddit.com/r/alexa/comments/4mghpo...,layboy,2,2016-06-04 11:01:10,0,/r/alexa/comments/4mghpo/alexa_on_pebble_watches/,,['nan'],alexa on pebble watches
...,...,...,...,...,...,...,...,...,...,...,...,...
18575,np0tuf,How is a post deleted before it ever actually ...,,https://www.reddit.com/r/alexa/comments/np0tuf...,Stamp_My_Art,1,2021-05-31 19:05:32,0,/r/alexa/comments/np0tuf/how_is_a_post_deleted...,,['nan'],how is a post deleted before it ever actually ...
18576,np0vru,Issue when turning on a lamp,,https://www.reddit.com/r/alexa/comments/np0vru...,fiorenzoalumide,1,2021-05-31 19:08:43,0,/r/alexa/comments/np0vru/issue_when_turning_on...,,['nan'],issue when turning on a l
18577,np47mn,Why does Alexa think my AC is a thermostat and...,,https://www.reddit.com/r/alexa/comments/np47mn...,stgleason,1,2021-05-31 22:06:50,9,/r/alexa/comments/np47mn/why_does_alexa_think_...,,"['So, I recently reconfigured my network and c...",why does alexa think my ac is a thermostat and...
18578,np5p0z,Music not working when using Alexa with a trav...,,https://www.reddit.com/r/alexa/comments/np5p0z...,bznelson91,1,2021-05-31 23:14:44,0,/r/alexa/comments/np5p0z/music_not_working_whe...,,['nan'],music not working when using alexa with a trav...


In [4]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub(r'http\S+', '', sent) # remove http
        sent = re.sub(r'https\S+', '', sent) # remove https
        sent = re.sub('<[^>]+>', '', sent) # remove HTML tags
        sent = re.sub('<[^<]+?>', '', sent)
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub(r'[^\w\s]','',sent) # remove punctuations
        sent = gensim.utils.simple_preprocess(str(sent), min_len=2, deacc=True) 
        
        yield(sent)  

# # Convert to list
data = df.content.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['alexa', 'as', 'an', 'android', 'app', 'who', 'needs', 'the', 'alexa', 'devices']]


In [5]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=1,delimiter='_') # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=1, delimiter='_')  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Tag   Meaning                English Examples
# ADJ   adjective              new, good, high, special, big, local
# ADP   adposition             on, of, at, with, by, into, under
# ADV   adverb                 really, already, still, early, now
# CONJ  conjunction            and, or, but, if, while, although
# DET   determiner, article    the, a, some, most, every, no, which
# NOUN  noun                   year, home, costs, time, Africa
# NUM   numeral                twenty-four, fourth, 1991, 14:24
# PRT   particle               at, on, out, over per, that, up, with
# PRON  pronoun                he, their, her, its, my, I, us
# VERB  verb                   is, say, told, given, playing, would
# .     punctuation marks      . , ; !
# X     other                  ersatz, esprit, dunno, gr8, univeristy

# def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
def process_words(texts, stop_words=stop_words, disallowed_postags=['ADP', 'CONJ', 'DET', 'NUM', 'PRT','PRON','.','X']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ not in disallowed_postags])
#         texts_out.append([token.lemma_ for token in doc])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc), max_len=20) if word not in stop_words] for doc in texts_out] 
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!
print(data_ready[:2])

[['alexa', 'android_app', 'need', 'alexa_device'], ['let_alexa', 'transform', 'pebble', 'core', 'combadge', 'dream']]


In [6]:
print(data_ready[:2])

[['alexa', 'android_app', 'need', 'alexa_device'], ['let_alexa', 'transform', 'pebble', 'core', 'combadge', 'dream']]


In [7]:
from gensim.corpora import Dictionary

# Create Dictionary
id2word = Dictionary(data_ready)
print('Number of unique words in initital documents:', len(id2word))

# Filter out words that occur less than 0.5% documents, or more than 20% of the documents.
id2word.filter_extremes(no_below = (round(((len(data_ready))*0.005))), no_above = 0.99)
print('Number of unique words after removing rare and common words:', len(id2word))

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
print('Number of documents: %d' % len(corpus))

Number of unique words in initital documents: 47597
Number of unique words after removing rare and common words: 1170
Number of documents: 18580


In [8]:
id2word.save("corpus_dict/dict")
corpora.MmCorpus.serialize("corpus_dict/corpus", corpus)

In [9]:
df['tokenz'] = [[(id2word[id]) for id, freq in cp] for cp in corpus[:]]
df.head(1)

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content,tokenz
0,4lx0dn,Alexa as an Android app. Who needs the Alexa d...,,https://www.reddit.com/r/alexa/comments/4lx0dn...,layboy,2,2016-06-01 03:25:15,0,/r/alexa/comments/4lx0dn/alexa_as_an_android_a...,,['nan'],alexa as an android app who needs the alexa de...,"[alexa, alexa_device, need]"


In [10]:
df.to_csv('1_df_content_tokenz.csv',index=False, encoding='utf-8')

In [11]:
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('alexa', 1), ('alexa_device', 1), ('need', 1)]]


In [12]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)
#     break

# # print('Number of unique tokens: %d' % len(id2word))
# # print('Number of documents: %d' % len(corpus))

In [13]:
# topWords = {}
# for doc in corpus:
#     for iWord, tf_idf in doc:
#         if iWord not in topWords:
#             topWords[iWord] = 0

#         if tf_idf > topWords[iWord]:
#             topWords[iWord] = tf_idf
# sum = 0
# term = []
# for i, item in enumerate(sorted(topWords.items(), key=lambda x: x[1], reverse=True), 1):
# #     print("%2s: %-13s %s" % (i, id2word[item[0]], item[1]))
#     term.append(id2word[item[0]])
#     sum += item[1]
# #     if i == 100: break
# # print (sum)
# mean = sum/i
# print ('Mean of tf-idf score: ' + str(mean))
# # print (term)

In [14]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# low_value = 0.271734994034526
# low_value_words = []

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     low_value_words += [id for id, value in tfidf[doc] if value < low_value]

In [15]:
# id2word.filter_tokens(bad_ids=low_value_words)
# print('Number of filtered unique tokens: %d' % len(id2word))
# print('Number of documents: %d' % len(corpus))

In [16]:
# corpus = [id2word.doc2bow(doc) for doc in data_ready]
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)

In [17]:
# print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]])

In [18]:
# # Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=10)

# pprint(lda_model.print_topics())

In [19]:
def compute_coherence_values(corpus, dictionary, num_topics, a, b):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics, 
                                                random_state=100,
                                                chunksize=100,
                                                passes=40,
                                                iterations=1000,
                                                alpha=a,
                                                eta=1/num_topics,
                                                eval_every=None)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['tokenz'], dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [20]:
# 1

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_1 = pd.DataFrame(model_results)
model_results_1.to_csv('tuning/00_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [21]:
# 2

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_2 = pd.DataFrame(model_results)
model_results_2.to_csv('tuning/11_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [22]:
# 3

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_3 = pd.DataFrame(model_results)
model_results_3.to_csv('tuning/22_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [23]:
# 4

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_4 = pd.DataFrame(model_results)
model_results_4.to_csv('tuning/33_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [24]:
# 5

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_5 = pd.DataFrame(model_results)
model_results_5.to_csv('tuning/44_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [25]:
# 6

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_6 = pd.DataFrame(model_results)
model_results_6.to_csv('tuning/55_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [26]:
# 7

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_7 = pd.DataFrame(model_results)
model_results_7.to_csv('tuning/66_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [27]:
# 8

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_8 = pd.DataFrame(model_results)
model_results_8.to_csv('tuning/77_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [28]:
# 9

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_9 = pd.DataFrame(model_results)
model_results_9.to_csv('tuning/88_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [29]:
# 10

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_10 = pd.DataFrame(model_results)
model_results_10.to_csv('tuning/99_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6332098218190001
5
0.05
0.2
0.6280225817485954
5
0.1
0.2
0.6284308781661567
5
0.2
0.2
0.6209430848367596
5
0.5
0.2
0.6569419559467187
5
1
0.2
0.644788659013701
10
0.01
0.1
0.5832443382600146
10
0.05
0.1
0.5808886033045311
10
0.1
0.1
0.5805628030106785
10
0.2
0.1
0.5944779488300023
10
0.5
0.1
0.6188388138776549
10
1
0.1
0.6533911858489322
20
0.01
0.05
0.5751609996065101
20
0.05
0.05
0.5738476396599087
20
0.1
0.05
0.5732348187383194
20
0.2
0.05
0.5613988757020796
20
0.5
0.05
0.6159233463283882
20
1
0.05
0.6348736288013815
30
0.01
0.03333333333333333
0.5223318713542239
30
0.05
0.03333333333333333
0.5372625551335076
30
0.1
0.03333333333333333
0.5319188701208576
30
0.2
0.03333333333333333
0.5387423235417802
30
0.5
0.03333333333333333
0.5459476996815646
30
1
0.03333333333333333
0.5510036885053755


In [30]:
model_results = pd.concat([model_results_1, model_results_2, model_results_3, model_results_4, model_results_5,
                          model_results_6, model_results_7, model_results_8, model_results_9, model_results_10])
model_results.to_csv("tuning/model_results.csv", index=False, encoding='utf-8-sig')

In [31]:
model_results = model_results.groupby(['Topics', 'Alpha'], as_index=False).mean()
model_results = model_results.sort_values(by='Coherence', ascending=False)
model_results.to_csv('2_lda_tuning_results.csv', index=False)

In [32]:
model_results

Unnamed: 0,Topics,Alpha,Beta,Coherence
4,5,0.5,0.2,0.656942
11,10,1.0,0.1,0.653391
5,5,1.0,0.2,0.644789
17,20,1.0,0.05,0.634874
0,5,0.01,0.2,0.63321
2,5,0.1,0.2,0.628431
1,5,0.05,0.2,0.628023
3,5,0.2,0.2,0.620943
10,10,0.5,0.1,0.618839
16,20,0.5,0.05,0.615923


In [33]:
# priors = pd.pivot_table(model_results,index=["Topics"],columns=["Alpha"],values=['Coherence'])
# priors.columns = range(priors.shape[1])
# priors.columns = ['.01','.05','.1','.2','.5','1']
# df.head(1)
# priors = priors.reset_index()
# priors

In [34]:
# priors.to_csv("siri_lda_tuning_results.csv",index=True, encoding="utf-8")

In [35]:
# import matplotlib.pyplot as plt
# import numpy as np
  
# # dummy data
# x1 = priors['Topics']
# A = priors['.01']
# B = priors['.05']
# C = priors['.1']
# D = priors['.2']
# E = priors['.5']
# F = priors['1']

# # creates two subplots
# # fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (24, 12))

# fig, ax = plt.subplots(2, 3, figsize = (24,12))

# # Plot without grid
# ax[0,0].plot(x1, A, label='0.01', color='tab:blue')
# ax[0,1].plot(x1, B, label='0.05', color='tab:orange')
# ax[0,2].plot(x1, C, label='0.1', color='tab:green')
# ax[1,0].plot(x1, D, label='0.2', color='tab:red')
# ax[1,1].plot(x1, E, label='0.5', color='tab:purple')
# ax[1,2].plot(x1, F, label='1', color='tab:brown')

# ax[0,0].set_xlim(xmin=9)
# ax[0,0].set_title('siri, α=.01, Beta=1/K')
# ax[0,0].set_xlabel('K')
# ax[0,0].set_ylabel('Cv')

# ax[0,1].set_xlim(xmin=9)
# ax[0,1].set_title('siri, α=.05, Beta=1/K')
# ax[0,1].set_xlabel('K')
# ax[0,1].set_ylabel('Cv')

# ax[0,2].set_xlim(xmin=9)
# ax[0,2].set_title('siri, α=.1, Beta=1/K')
# ax[0,2].set_xlabel('K')
# ax[0,2].set_ylabel('Cv')

# ax[1,0].set_xlim(xmin=9)
# ax[1,0].set_title('siri, α=.2, Beta=1/K')
# ax[1,0].set_xlabel('K')
# ax[1,0].set_ylabel('Cv')

# ax[1,1].set_xlim(xmin=9)
# ax[1,1].set_title('siri, α=.5, Beta=1/K')
# ax[1,1].set_xlabel('K')
# ax[1,1].set_ylabel('Cv')

# ax[1,2].set_xlim(xmin=9)
# ax[1,2].set_title('siri, α=1, Beta=1/K')
# ax[1,2].set_xlabel('K')
# ax[1,2].set_ylabel('Cv')

# # fig.tight_layout()
# fig.set_facecolor("w")
# plt.show()