In [1]:
import sys
import re, numpy as np, pandas as pd
import tqdm
import glob
from cleantext import clean
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['https_www','seems_like','do','not','imgur','tkg','https','http','could','www','com','ever','doesnt_seem',
                  'xxxx','else','would','also','ea','&amp','#x200B','oh','etc','yeah','nan','however','even','dont_know','sa',
                  "looks_like",'especially','may','sounds_like'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
# LoadDataset
df=pd.read_csv('siri_merged.csv')
print(df.shape)
df.head(1)

(2235, 12)


Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,4lzda4,Apple is working on an AI system that wipes th...,,https://www.reddit.com/r/Siri/comments/4lzda4/...,dunkin1980,6,2016-06-01 11:54:39,2,/r/Siri/comments/4lzda4/apple_is_working_on_an...,,"[""Man I hope they don't screw this up.,If so, ...",Apple is working on an AI system that wipes th...


In [3]:
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def preprocess_tweet(row):
    text = row['content']
    text = text.replace('r/','')
    text = p.clean(text)
    text = clean(text,     
                 fix_unicode=True,              # fix various unicode errors
                 to_ascii=True,                 # transliterate to closest ASCII representation
                 lower=True,                    # lowercase text
                 no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                 no_urls=True,                  # replace all URLs with a special token
                 no_emails=True,                # replace all email addresses with a special token
                 no_phone_numbers=True,         # replace all phone numbers with a special token
                 no_numbers=True,               # replace all numbers with a special token
                 no_digits=True,                # replace all digits with a special token
                 no_currency_symbols=True,      # replace all currency symbols with a special token
                 no_punct=True,                 # remove punctuations
                 lang="en",                     # set to 'de' for German special handling
                 replace_with_punct="",          # instead of removing punctuations you may replace them
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="",
                 replace_with_currency_symbol=""
                )
    text = text.replace('amp','')
    text = text.replace('nan','')
    return text

df['content'] = df.apply(preprocess_tweet, axis=1)
df

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content
0,4lzda4,Apple is working on an AI system that wipes th...,,https://www.reddit.com/r/Siri/comments/4lzda4/...,dunkin1980,6,2016-06-01 11:54:39,2,/r/Siri/comments/4lzda4/apple_is_working_on_an...,,"[""Man I hope they don't screw this up.,If so, ...",apple is working on an ai system that wipes th...
1,4lzi6i,"I asked my 5S Siri to flip a coin for me, had ...",,https://www.reddit.com/r/Siri/comments/4lzi6i/...,JangoBK,7,2016-06-01 12:32:01,1,/r/Siri/comments/4lzi6i/i_asked_my_5s_siri_to_...,,"[""http://imgur.com/tKg26ea\n\nShe doesn't have...",i asked my s siri to flip a coin for me had a ...
2,4m2z9l,If Apple's latest commercial was honest...,,https://www.reddit.com/r/Siri/comments/4m2z9l/...,[deleted],6,2016-06-02 03:15:08,0,/r/Siri/comments/4m2z9l/if_apples_latest_comme...,,['nan'],if apples latest commercial was honest
3,4m3w5b,Worldwide exclusive interview with Siri,,https://www.reddit.com/r/Siri/comments/4m3w5b/...,tinycomet,1,2016-06-02 06:13:25,0,/r/Siri/comments/4m3w5b/worldwide_exclusive_in...,,['nan'],worldwide exclusive interview with siri
4,4m5buw,"Siri responds to AT&amp;T ""Hostess with the Mo...","When Lily (in the commercial) says ""OK Siri"", ...",https://www.reddit.com/r/Siri/comments/4m5buw/...,unsubscribe__,1,2016-06-02 11:54:11,0,/r/Siri/comments/4m5buw/siri_responds_to_att_h...,,['nan'],siri responds to att hostess with the mostest ...
...,...,...,...,...,...,...,...,...,...,...,...,...
2230,nk8f2b,So I was just sitting talking about AI with my...,Anyone who can explain how this is possible? I...,https://www.reddit.com/r/Siri/comments/nk8f2b/...,ridiculus97,1,2021-05-25 05:06:42,2,/r/Siri/comments/nk8f2b/so_i_was_just_sitting_...,,"[""She's just trying to include herself in the ...",so i was just sitting talking about ai with my...
2231,nmys6b,Long time problem with Siri: While listening t...,It will not switch to the Podcasts app nor wil...,https://www.reddit.com/r/Siri/comments/nmys6b/...,ThunderHawkLives,1,2021-05-28 22:15:45,0,/r/Siri/comments/nmys6b/long_time_problem_with...,,['nan'],long time problem with siri while listening to...
2232,nns10e,My default search engine is DuckDuckGo on Safa...,"I checked Siri settings, there is no option fo...",https://www.reddit.com/r/Siri/comments/nns10e/...,hemeka,1,2021-05-30 01:20:41,0,/r/Siri/comments/nns10e/my_default_search_engi...,,['nan'],my default search engine is duckduckgo on safa...
2233,nns990,I can not set default search engine on Siri.,,https://www.reddit.com/r/Siri/comments/nns990/...,hemeka,1,2021-05-30 01:32:08,3,/r/Siri/comments/nns990/i_can_not_set_default_...,,"[""You can't change Siri's settings, Google pay...",i can not set default search engine on siri yo...


In [4]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub(r'http\S+', '', sent) # remove http
        sent = re.sub(r'https\S+', '', sent) # remove https
        sent = re.sub('<[^>]+>', '', sent) # remove HTML tags
        sent = re.sub('<[^<]+?>', '', sent)
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub(r'[^\w\s]','',sent) # remove punctuations
        sent = gensim.utils.simple_preprocess(str(sent), min_len=2, deacc=True) 
        
        yield(sent)  

# # Convert to list
data = df.content.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['apple', 'is', 'working', 'on', 'an', 'ai', 'system', 'that', 'wipes', 'the', 'floor', 'with', 'google', 'and', 'everyone', 'else', 'man', 'hope', 'they', 'dont', 'screw', 'this', 'upif', 'so', 'it', 'wasnt', 'announced', 'at', 'wwdc', 'and', 'doesnt', 'seem', 'to', 'be', 'in', 'ios', 'hopefully', 'we', 'wont', 'have', 'to', 'wait', 'until', 'ios', 'or', 'later', 'to', 'see', 'it', 'because', 'siri', 'is', 'useless', 'now', 'and', 'has', 'been', 'for', 'long', 'time']]


In [5]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=1,delimiter='_') # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=1, delimiter='_')  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Tag   Meaning                English Examples
# ADJ   adjective              new, good, high, special, big, local
# ADP   adposition             on, of, at, with, by, into, under
# ADV   adverb                 really, already, still, early, now
# CONJ  conjunction            and, or, but, if, while, although
# DET   determiner, article    the, a, some, most, every, no, which
# NOUN  noun                   year, home, costs, time, Africa
# NUM   numeral                twenty-four, fourth, 1991, 14:24
# PRT   particle               at, on, out, over per, that, up, with
# PRON  pronoun                he, their, her, its, my, I, us
# VERB  verb                   is, say, told, given, playing, would
# .     punctuation marks      . , ; !
# X     other                  ersatz, esprit, dunno, gr8, univeristy

# def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
def process_words(texts, stop_words=stop_words, disallowed_postags=['ADP', 'CONJ', 'DET', 'NUM', 'PRT','PRON','.','X']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ not in disallowed_postags])
#         texts_out.append([token.lemma_ for token in doc])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc), max_len=20) if word not in stop_words] for doc in texts_out] 
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!
print(data_ready[:1])

[['apple', 'work', 'ai', 'system', 'wipe', 'floor', 'google', 'everyone', 'man', 'hope', 'screw', 'upif', 'announce', 'wwdc', 'ios', 'hopefully', 'wait', 'later', 'see', 'siri', 'useless', 'long_time']]


In [6]:
from gensim.corpora import Dictionary

# Create Dictionary
id2word = Dictionary(data_ready)
print('Number of unique words in initital documents:', len(id2word))

# Filter out words that occur less than 0.5% documents, or more than 20% of the documents.
id2word.filter_extremes(no_below = (round(((len(data_ready))*0.005))), no_above = 0.99)
print('Number of unique words after removing rare and common words:', len(id2word))

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
print('Number of documents: %d' % len(corpus))

Number of unique words in initital documents: 7629
Number of unique words after removing rare and common words: 820
Number of documents: 2235


In [7]:
id2word.save("corpus_dict/dict")
corpora.MmCorpus.serialize("corpus_dict/corpus", corpus)

In [8]:
df['tokenz'] = [[(id2word[id]) for id, freq in cp] for cp in corpus[:]]
df.head(1)

Unnamed: 0,post_id,title,selftext,full_link,author,score,publish_date,num_of_comments,permalink,flair,comment_msg,content,tokenz
0,4lzda4,Apple is working on an AI system that wipes th...,,https://www.reddit.com/r/Siri/comments/4lzda4/...,dunkin1980,6,2016-06-01 11:54:39,2,/r/Siri/comments/4lzda4/apple_is_working_on_an...,,"[""Man I hope they don't screw this up.,If so, ...",apple is working on an ai system that wipes th...,"[ai, announce, apple, google, hope, hopefully,..."


In [9]:
df.to_csv('1_df_content_tokenz.csv',index=False, encoding='utf-8')

In [10]:
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('ai', 1), ('announce', 1), ('apple', 1), ('google', 1), ('hope', 1), ('hopefully', 1), ('ios', 1), ('later', 1), ('long_time', 1), ('man', 1), ('see', 1), ('siri', 1), ('system', 1), ('useless', 1), ('wait', 1), ('work', 1)]]


In [11]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)
#     break

# # print('Number of unique tokens: %d' % len(id2word))
# # print('Number of documents: %d' % len(corpus))

In [12]:
# topWords = {}
# for doc in corpus:
#     for iWord, tf_idf in doc:
#         if iWord not in topWords:
#             topWords[iWord] = 0

#         if tf_idf > topWords[iWord]:
#             topWords[iWord] = tf_idf
# sum = 0
# term = []
# for i, item in enumerate(sorted(topWords.items(), key=lambda x: x[1], reverse=True), 1):
# #     print("%2s: %-13s %s" % (i, id2word[item[0]], item[1]))
#     term.append(id2word[item[0]])
#     sum += item[1]
# #     if i == 100: break
# # print (sum)
# mean = sum/i
# print ('Mean of tf-idf score: ' + str(mean))
# # print (term)

In [13]:
# #tf-idf
# from gensim.models import TfidfModel

# # Create Dictionary
# from gensim import models

# low_value = 0.271734994034526
# low_value_words = []

# tfidf = models.TfidfModel(corpus, id2word=id2word)  # step 1 -- initialize a model
# corpus = tfidf[corpus]
# for doc in corpus:
#     low_value_words += [id for id, value in tfidf[doc] if value < low_value]

In [14]:
# id2word.filter_tokens(bad_ids=low_value_words)
# print('Number of filtered unique tokens: %d' % len(id2word))
# print('Number of documents: %d' % len(corpus))

In [15]:
# corpus = [id2word.doc2bow(doc) for doc in data_ready]
# corpus = tfidf[corpus]
# for doc in corpus:
#     pprint(doc)

In [16]:
# print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]])

In [17]:
# # Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=10)

# pprint(lda_model.print_topics())

In [18]:
def compute_coherence_values(corpus, dictionary, num_topics, a, b):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics, 
                                                random_state=100,
                                                chunksize=100,
                                                passes=40,
                                                iterations=1000,
                                                alpha=a,
                                                eta=1/num_topics,
                                                eval_every=None)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['tokenz'], dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [19]:
# 1

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_1 = pd.DataFrame(model_results)
model_results_1.to_csv('tuning/00_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [20]:
# 2

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_2 = pd.DataFrame(model_results)
model_results_2.to_csv('tuning/11_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [21]:
# 3

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_3 = pd.DataFrame(model_results)
model_results_3.to_csv('tuning/22_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [22]:
# 4

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_4 = pd.DataFrame(model_results)
model_results_4.to_csv('tuning/33_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [23]:
# 5

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_5 = pd.DataFrame(model_results)
model_results_5.to_csv('tuning/44_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [24]:
# 6

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_6 = pd.DataFrame(model_results)
model_results_6.to_csv('tuning/55_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [25]:
# 7

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_7 = pd.DataFrame(model_results)
model_results_7.to_csv('tuning/66_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [26]:
# 8

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_8 = pd.DataFrame(model_results)
model_results_8.to_csv('tuning/77_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [27]:
# 9

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_9 = pd.DataFrame(model_results)
model_results_9.to_csv('tuning/88_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [28]:
# 10

topics_range = [5,10,20,30]

# Alpha parameter
alpha = [.01,.05,.1,.2,.5,1]
# alpha.append('symmetric')
# alpha.append('asymmetric')

# Beta parameter
# beta = [1/num_topics]
# beta.append('symmetric')
# beta.append('auto')

model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# iterate through number of topics
for num_topics in topics_range:
    # iterate through alpha values
    for a in alpha:
        # iterare through beta values
#         for b in beta:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                      num_topics=num_topics, a=a, b=1/num_topics)
        # Save the model results
        model_results['Topics'].append(num_topics)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(1/num_topics)
        model_results['Coherence'].append(cv)
        print (num_topics)
        print (a)
        print (1/num_topics)
        print (cv)

model_results_10 = pd.DataFrame(model_results)
model_results_10.to_csv('tuning/99_lda_tuning_results.csv', index=False, encoding='utf-8')

5
0.01
0.2
0.6016461949722964
5
0.05
0.2
0.6034813900035713
5
0.1
0.2
0.5846629674154018
5
0.2
0.2
0.553893964134093
5
0.5
0.2
0.668860293968037
5
1
0.2
0.695418050515428
10
0.01
0.1
0.51671445113345
10
0.05
0.1
0.5315092553314879
10
0.1
0.1
0.5384734432878137
10
0.2
0.1
0.5461184029321892
10
0.5
0.1
0.6253460472935441
10
1
0.1
0.6559895048483556
20
0.01
0.05
0.4680231027918258
20
0.05
0.05
0.46113381470649656
20
0.1
0.05
0.4760800899091716
20
0.2
0.05
0.4632987362986919
20
0.5
0.05
0.44176104165882446
20
1
0.05
0.43589633755299617
30
0.01
0.03333333333333333
0.42312646461950004
30
0.05
0.03333333333333333
0.4137063548348686
30
0.1
0.03333333333333333
0.4242686298931834
30
0.2
0.03333333333333333
0.4009006394139174
30
0.5
0.03333333333333333
0.3601669465594338
30
1
0.03333333333333333
0.38665094275510115


In [29]:
model_results = pd.concat([model_results_1, model_results_2, model_results_3, model_results_4, model_results_5,
                          model_results_6, model_results_7, model_results_8, model_results_9, model_results_10])
model_results.to_csv("tuning/model_results.csv", index=False, encoding='utf-8-sig')

In [30]:
model_results = model_results.groupby(['Topics', 'Alpha'], as_index=False).mean()
model_results = model_results.sort_values(by='Coherence', ascending=False)
model_results.to_csv('2_lda_tuning_results.csv', index=False)

In [31]:
model_results

Unnamed: 0,Topics,Alpha,Beta,Coherence
5,5,1.0,0.2,0.695418
4,5,0.5,0.2,0.66886
11,10,1.0,0.1,0.65599
10,10,0.5,0.1,0.625346
1,5,0.05,0.2,0.603481
0,5,0.01,0.2,0.601646
2,5,0.1,0.2,0.584663
3,5,0.2,0.2,0.553894
9,10,0.2,0.1,0.546118
8,10,0.1,0.1,0.538473


In [32]:
# priors = pd.pivot_table(model_results,index=["Topics"],columns=["Alpha"],values=['Coherence'])
# priors.columns = range(priors.shape[1])
# priors.columns = ['.01','.05','.1','.2','.5','1']
# df.head(1)
# priors = priors.reset_index()
# priors

In [33]:
# priors.to_csv("siri_lda_tuning_results.csv",index=True, encoding="utf-8")

In [34]:
# import matplotlib.pyplot as plt
# import numpy as np
  
# # dummy data
# x1 = priors['Topics']
# A = priors['.01']
# B = priors['.05']
# C = priors['.1']
# D = priors['.2']
# E = priors['.5']
# F = priors['1']

# # creates two subplots
# # fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (24, 12))

# fig, ax = plt.subplots(2, 3, figsize = (24,12))

# # Plot without grid
# ax[0,0].plot(x1, A, label='0.01', color='tab:blue')
# ax[0,1].plot(x1, B, label='0.05', color='tab:orange')
# ax[0,2].plot(x1, C, label='0.1', color='tab:green')
# ax[1,0].plot(x1, D, label='0.2', color='tab:red')
# ax[1,1].plot(x1, E, label='0.5', color='tab:purple')
# ax[1,2].plot(x1, F, label='1', color='tab:brown')

# ax[0,0].set_xlim(xmin=9)
# ax[0,0].set_title('siri, α=.01, Beta=1/K')
# ax[0,0].set_xlabel('K')
# ax[0,0].set_ylabel('Cv')

# ax[0,1].set_xlim(xmin=9)
# ax[0,1].set_title('siri, α=.05, Beta=1/K')
# ax[0,1].set_xlabel('K')
# ax[0,1].set_ylabel('Cv')

# ax[0,2].set_xlim(xmin=9)
# ax[0,2].set_title('siri, α=.1, Beta=1/K')
# ax[0,2].set_xlabel('K')
# ax[0,2].set_ylabel('Cv')

# ax[1,0].set_xlim(xmin=9)
# ax[1,0].set_title('siri, α=.2, Beta=1/K')
# ax[1,0].set_xlabel('K')
# ax[1,0].set_ylabel('Cv')

# ax[1,1].set_xlim(xmin=9)
# ax[1,1].set_title('siri, α=.5, Beta=1/K')
# ax[1,1].set_xlabel('K')
# ax[1,1].set_ylabel('Cv')

# ax[1,2].set_xlim(xmin=9)
# ax[1,2].set_title('siri, α=1, Beta=1/K')
# ax[1,2].set_xlabel('K')
# ax[1,2].set_ylabel('Cv')

# # fig.tight_layout()
# fig.set_facecolor("w")
# plt.show()