In [21]:
"""author: sarah konrad, data+ 2023
create lda topic modeling clusters on our corpus!
"""
import os
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

#converts texts into a master list of lists of strings (needed for BOW)
def textstolist(folder): 
    texts = []
    for file in os.scandir(folder):
        path = os.path.join(folder,file)
        name = os.path.basename(file)
        f = open(path,'r', encoding="utf-8")
        data = f.read()
        texts.append(data)
        f.close()
    textlists = []
    for text in texts: 
        text = text.split(" ")
        textlists.append(text)
    return textlists

processed_texts = textstolist(input("What texts would you like to use? (file directory): "))
tokendict = gensim.corpora.Dictionary(processed_texts)

[nltk_data] Downloading package wordnet to
[nltk_data]     /hpc/home/sek45/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


What texts would you like to use? (file directory):  /hpc/group/datap2023ecbc/Time Series Modeling Ranges/1610-1619_texts


In [40]:
#for testing functionality of the dictionary being built for topic modeling
count = 0
for key, value in tokendict.iteritems():
    print(key, value)
    count += 1
    if count > 10:
        break

print(len(tokendict))

0 ^ll
1 alarm
2 barrel
3 bleed
4 bound
5 britain
6 broil
7 castle
8 claim
9 commodity
10 consequence
16547


In [41]:
#remove tokens that appear in less than 15 texts, more than 0.5 documents
#keep 100,000 most frequent tokens
#this step helps improve the accuracy of topic modeling by removing irrelevant terms!
tokendict.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

#transform into bag o' words
bagofwords_corpus = [tokendict.doc2bow(text) for text in processed_texts]

#now run tf-idf on the bag of words to see how relevant terms are to the entire corpus!!
#note to self: perhaps add a functionality that maps the tf-idf score of a word to the word in a dictionary 

tfidf_model = models.TfidfModel(bagofwords_corpus)
corpus_tfidf = tfidf_model[bagofwords_corpus]

#train and obtain output from the lda model
lda_model = gensim.models.LdaMulticore(bagofwords_corpus, num_topics=10, id2word=tokendict, passes=20, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.005*"affliction" + 0.002*"meditation" + 0.002*"temptation" + 0.002*"humility" + 0.002*"chap" + 0.002*"righteousness" + 0.002*"sect" + 0.002*"accord" + 0.002*"prosperity" + 0.002*"poverty"
Topic: 1 
Words: 0.006*"fortune" + 0.004*"quote" + 0.003*"amadis" + 0.003*"fame" + 0.003*"greek" + 0.003*"ere" + 0.002*"brave" + 0.002*"foe" + 0.002*"oft" + 0.002*"chance"
Topic: 2 
Words: 0.007*"hawk" + 0.006*"chap" + 0.005*"angle" + 0.004*"oil" + 0.004*"quantity" + 0.004*"leg" + 0.004*"figure" + 0.004*"medicine" + 0.003*"leaf" + 0.003*"boil"
Topic: 3 
Words: 0.016*"bishop" + 0.013*"pope" + 0.011*"catholic" + 0.007*"temporal" + 0.006*"protestant" + 0.006*"augustine" + 0.006*"doctor" + 0.006*"council" + 0.005*"bellarmine" + 0.005*"adversary"
Topic: 4 
Words: 0.019*"earl" + 0.014*"henry" + 0.012*"duke" + 0.009*"edward" + 0.007*"william" + 0.007*"britain" + 0.007*"london" + 0.006*"richard" + 0.005*"french" + 0.005*"castle"
Topic: 5 
Words: 0.018*"muscle" + 0.018*"castile" + 0.015*"vei

In [36]:
#run it with tfidf
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=tokendict, passes=20, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

Topic: 0 
Word: 0.004*"realm" + 0.003*"proclamation" + 0.003*"london" + 0.003*"highness" + 0.002*"earl" + 0.002*"tobacco" + 0.002*"muse" + 0.002*"quote" + 0.002*"fate" + 0.002*"westminster"
Topic: 1 
Word: 0.001*"andrews" + 0.000*"harper" + 0.000*"alva" + 0.000*"maxwell" + 0.000*"flushing" + 0.000*"staple" + 0.000*"marinus" + 0.000*"rog" + 0.000*"ptolomey" + 0.000*"converter"
Topic: 2 
Word: 0.002*"pope" + 0.001*"righteousness" + 0.001*"bishop" + 0.001*"sacrament" + 0.001*"chap" + 0.001*"catholic" + 0.001*"job" + 0.001*"heb" + 0.001*"affliction" + 0.001*"righteous"
Topic: 3 
Word: 0.002*"epig" + 0.001*"balsam" + 0.000*"quint" + 0.000*"te" + 0.000*"mets" + 0.000*"plane" + 0.000*"ang" + 0.000*"commoner" + 0.000*"astronomical" + 0.000*"semidiameter"
Topic: 4 
Word: 0.001*"martini" + 0.001*"thais" + 0.001*"downam" + 0.001*"cromwell" + 0.001*"felt" + 0.000*"marg" + 0.000*"eustace" + 0.000*"bavier" + 0.000*"barcelona" + 0.000*"sophronius"
Topic: 5 
Word: 0.001*"hugh" + 0.000*"cullen" + 0.000