In [1]:
# Heavily cribbed from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# and https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

# Import necessary items from the Classical Languages Toolkit and elsewhere for cleaning the data.
import gensim
from cltk.tokenize.word import nltk_tokenize_words
from cltk.tokenize.sentence import TokenizeSentence
tokenizer = TokenizeSentence('latin')
from cltk.stop.latin import CorpusStoplist
from cltk.stop.latin import STOPS_LIST
from cltk.stem.lemma import LemmaReplacer
from cltk.stem.latin.j_v import JVReplacer

jv_replacer = JVReplacer()
tokenizer = TokenizeSentence('latin')
lemmatizer = LemmaReplacer('latin')

In [2]:
# A function to prepare the raw text and turn it into tokens.

def prepare_text(text):
    # tokenize the individual words
    tokens = nltk_tokenize_words(text)
    # remove any tokens smaller than 4 characters in length
    tokens = [token for token in tokens if len(token) > 4]
    # replace j and v with i and u.
    tokens = [jv_replacer.replace(token) for token in tokens]
    # make all words lower-case
    tokens = [token.lower() for token in tokens]
    # remove punctuation
    tokens = [token for token in tokens if token not in ['.', ',','!','?','"',':', ';','[',']']]
    # remove line numbers
    tokens = [x for x in tokens if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())]
    # turn all of the tokens into their dictionary forms
    tokens = [''.join(lemmatizer.lemmatize(token)) for token in tokens]
    # remove common words
    tokens = [token for token in tokens if token not in STOPS_LIST]
    return tokens

In [3]:
# We know from experience that the data include certain words that are generic, so we remove them here.
add_stops = ['noster','nos','ille','quoque','primus']
for stop in add_stops:
    STOPS_LIST.append(stop)

In [4]:
# Open the text files and process them.
import random
import os, glob
text_data = []
folder_path = '/home/sjhuskey/Dropbox/What-is-digital-latin/texts'
for filename in glob.glob(os.path.join(folder_path, '*.txt')):
    with open(filename, 'r') as f:
        text = f.read()
        tokens = prepare_text(text)
        if random.random() > .8: # What does this do?
            text_data.append(tokens)

In [5]:
text_data

[['fortis',
  'mico',
  'seni',
  'canthus1',
  'miconis',
  'alumnus',
  'torreo',
  'patulus',
  'vito',
  'ilex',
  'solo1',
  'juvenis',
  'seni',
  'praecipio',
  'do',
  'alumnus',
  'talis',
  'verbum',
  'refero',
  'tremulus',
  'titubo',
  'labrum1',
  'erro1',
  'video',
  'dumetum',
  'capella1',
  'canus',
  'lascivus',
  'concaedes',
  'gramen',
  'mordeo',
  'canthus1',
  'grex',
  'mons',
  'removeo',
  'cerno',
  'apricus',
  'decerpo',
  'gramen',
  'campus1',
  'seni',
  'juvenis',
  'pater',
  'tueor',
  'accipio',
  'certus',
  'insudo',
  'labor2',
  'gnauam',
  'exerceo',
  'juventa',
  'ad-specio',
  'aetas',
  'mille',
  'querela',
  'affero',
  'baculus',
  'premo',
  'inclino',
  'senectus2',
  'rego',
  'amo',
  'lustro1',
  'capella1',
  'bonus',
  'pratum',
  'erro1',
  'mollis',
  'agna1',
  'percipio',
  'tinnio',
  'volucris',
  'incipio',
  'nidus',
  'revorto',
  'luto1',
  'hirundo',
  'protinus',
  'hiberno',
  'pecus2',
  'mouebis',
  'ovile',
  'e

In [6]:
# Do a basic count of the words in the texts, just to get a rough idea of the data.
from collections import Counter
text_list = []
for i in text_data:
    for word in i:
        text_list.append(word)
Counter(text_list).most_common(20)

[('carmen1', 36),
 ('canto', 22),
 ('silva', 16),
 ('venio', 16),
 ('amor', 14),
 ('video', 13),
 ('dico2', 13),
 ('grex', 12),
 ('pecus2', 12),
 ('fero', 12),
 ('omne', 12),
 ('curo', 12),
 ('tempus', 11),
 ('levo1', 11),
 ('cano', 11),
 ('herba', 10),
 ('incipio', 9),
 ('tantus', 9),
 ('sequor', 9),
 ('calamus', 9)]

In [7]:
# Make a dictionary out of the data
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [8]:
# Turn the dictionary into a "bag of words" corpus.
corpus = [dictionary.doc2bow(text) for text in text_data]

In [9]:
# Display the "bag of words" corpus.
corpus

[[(0, 2),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 3),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 2),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 2),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 2),
  (29, 1),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 2),
  (34, 1),
  (35, 1),
  (36, 2),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 3),
  (44, 1),
  (45, 3),
  (46, 1),
  (47, 3),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 2),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 2),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 2),
  (71, 1),
  (72, 2),
  (73, 1),
  (74, 2),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 2),
  (87, 2),
  (88, 1),
  (89, 1),
  (90, 2),
  (91, 1)

In [10]:
# Save the dictionary
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [11]:
# Creat TF-IDF model object
from gensim import corpora, models

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.036009385221539396),
 (1, 0.04654145523585096),
 (2, 0.04654145523585096),
 (3, 0.04654145523585096),
 (4, 0.04654145523585096),
 (5, 0.04654145523585096),
 (6, 0.04654145523585096),
 (7, 0.0856102878752438),
 (8, 0.04654145523585096),
 (9, 0.018004692610769698),
 (10, 0.028536762625081268),
 (11, 0.09308291047170192),
 (12, 0.04654145523585096),
 (13, 0.09308291047170192),
 (14, 0.028536762625081268),
 (15, 0.09308291047170192),
 (16, 0.018004692610769698),
 (18, 0.04654145523585096),
 (19, 0.018004692610769698),
 (20, 0.02106414002862313),
 (21, 0.04654145523585096),
 (22, 0.04654145523585096),
 (23, 0.04654145523585096),
 (24, 0.04654145523585096),
 (25, 0.04654145523585096),
 (26, 0.04654145523585096),
 (27, 0.04654145523585096),
 (28, 0.09308291047170192),
 (29, 0.04654145523585096),
 (30, 0.04654145523585096),
 (31, 0.09308291047170192),
 (32, 0.028536762625081268),
 (33, 0.036009385221539396),
 (34, 0.04654145523585096),
 (35, 0.04654145523585096),
 (36, 0.057073525250162

In [12]:
# Train the lda model using gensim.models.LdaMulticore and save it to ‘lda_model’
lda_model = gensim.models.LdaMulticore(corpus, num_topics=4, id2word=dictionary, passes=10, workers=2)

lda_model.save('model1.gensim')

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.009*"venio" + 0.007*"carmen1" + 0.007*"puer" + 0.005*"pecus2" + 0.005*"omne" + 0.005*"facio" + 0.005*"pecus1" + 0.005*"vitula" + 0.004*"video" + 0.004*"pasco"
Topic: 1 
Words: 0.001*"carmen1" + 0.001*"canto" + 0.001*"amor" + 0.001*"dico2" + 0.001*"venio" + 0.001*"silva" + 0.001*"grex" + 0.001*"video" + 0.001*"incipio" + 0.001*"herba"
Topic: 2 
Words: 0.018*"carmen1" + 0.012*"canto" + 0.007*"curo" + 0.007*"silva" + 0.007*"amor" + 0.007*"cano" + 0.007*"dico2" + 0.006*"levo1" + 0.005*"fero" + 0.005*"venio"
Topic: 3 
Words: 0.008*"grex" + 0.007*"pecus2" + 0.005*"tempus" + 0.005*"herba" + 0.005*"ovile" + 0.005*"tremulus" + 0.004*"video" + 0.004*"silva" + 0.004*"no1" + 0.004*"sum1"


In [13]:
# Run LDA using TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=4, id2word=dictionary, passes=10, workers=2)

lda_model_tfidf.save('model2.gensim')

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.002*"levo1" + 0.001*"meroe" + 0.001*"curo" + 0.001*"vitula" + 0.001*"perdo" + 0.001*"tandem" + 0.001*"tremulus" + 0.001*"ovile" + 0.001*"poculum" + 0.001*"phyllida"
Topic: 1 Word: 0.001*"venio" + 0.001*"facio" + 0.001*"vitula" + 0.001*"caper" + 0.001*"alter" + 0.001*"carmen1" + 0.001*"pollio" + 0.001*"pecus1" + 0.001*"pono" + 0.001*"munero"
Topic: 2 Word: 0.002*"gallus1" + 0.001*"venio" + 0.001*"cresco" + 0.001*"vester" + 0.001*"capio" + 0.001*"laboro" + 0.001*"lycoris" + 0.001*"arcades" + 0.001*"paeniteo" + 0.001*"phyllon"
Topic: 3 Word: 0.002*"meliboee" + 0.001*"pectus" + 0.001*"felix1" + 0.001*"do" + 0.001*"pinus" + 0.001*"benignus" + 0.001*"suadeo" + 0.001*"liquor2" + 0.001*"carpo" + 0.001*"patior"


In [14]:
# Test an unseen document against the lda models. This is one of Boccaccio's bucolic poems, so it should have a high score.

unseen_document1 = open('/home/sjhuskey/Dropbox/What-is-digital-latin/test_texts/boccaccio1.txt')
unseen_text1 = unseen_document1.read()
bow_vector = dictionary.doc2bow(prepare_text(unseen_text1))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("LDA Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("\nTF-IDF Score: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

LDA Score: 0.6724274754524231	 Topic: 0.018*"carmen1" + 0.012*"canto" + 0.007*"curo" + 0.007*"silva" + 0.007*"amor"
LDA Score: 0.16603590548038483	 Topic: 0.008*"grex" + 0.007*"pecus2" + 0.005*"tempus" + 0.005*"herba" + 0.005*"ovile"
LDA Score: 0.1608441174030304	 Topic: 0.009*"venio" + 0.007*"carmen1" + 0.007*"puer" + 0.005*"pecus2" + 0.005*"omne"

TF-IDF Score: 0.7392052412033081	 
Topic: 0.002*"levo1" + 0.001*"meroe" + 0.001*"curo" + 0.001*"vitula" + 0.001*"perdo"

TF-IDF Score: 0.25934189558029175	 
Topic: 0.002*"gallus1" + 0.001*"venio" + 0.001*"cresco" + 0.001*"vester" + 0.001*"capio"


In [15]:
# Test an unseen document against the lda models. This is another one of Boccaccio's bucolic poems, so it should have a high score.

unseen_document2 = open('/home/sjhuskey/Dropbox/What-is-digital-latin/test_texts/boccaccio2.txt')
unseen_text2 = unseen_document2.read()
bow_vector = dictionary.doc2bow(prepare_text(unseen_text2))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("LDA Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("\nTF-IDF Score: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

LDA Score: 0.6933873891830444	 Topic: 0.018*"carmen1" + 0.012*"canto" + 0.007*"curo" + 0.007*"silva" + 0.007*"amor"
LDA Score: 0.165932759642601	 Topic: 0.009*"venio" + 0.007*"carmen1" + 0.007*"puer" + 0.005*"pecus2" + 0.005*"omne"
LDA Score: 0.14000031352043152	 Topic: 0.008*"grex" + 0.007*"pecus2" + 0.005*"tempus" + 0.005*"herba" + 0.005*"ovile"

TF-IDF Score: 0.8080786466598511	 
Topic: 0.002*"levo1" + 0.001*"meroe" + 0.001*"curo" + 0.001*"vitula" + 0.001*"perdo"

TF-IDF Score: 0.19049717485904694	 
Topic: 0.002*"gallus1" + 0.001*"venio" + 0.001*"cresco" + 0.001*"vester" + 0.001*"capio"


In [16]:
# Test another unseen document against the lda models. This is a letter by Boccaccio, so it shouldn't have a high score.

unseen_document3 = open('/home/sjhuskey/Dropbox/What-is-digital-latin/test_texts/boccaccioEp1.txt')
unseen_text3 = unseen_document3.read()
bow_vector = dictionary.doc2bow(prepare_text(unseen_text3))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("LDA Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("\nTF-IDF Score: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

LDA Score: 0.7241349816322327	 Topic: 0.018*"carmen1" + 0.012*"canto" + 0.007*"curo" + 0.007*"silva" + 0.007*"amor"
LDA Score: 0.2666381299495697	 Topic: 0.009*"venio" + 0.007*"carmen1" + 0.007*"puer" + 0.005*"pecus2" + 0.005*"omne"

TF-IDF Score: 0.5357410311698914	 
Topic: 0.002*"gallus1" + 0.001*"venio" + 0.001*"cresco" + 0.001*"vester" + 0.001*"capio"

TF-IDF Score: 0.4554351270198822	 
Topic: 0.002*"levo1" + 0.001*"meroe" + 0.001*"curo" + 0.001*"vitula" + 0.001*"perdo"


In [17]:
# Test another unseen document against the lda models. This is a theological text.

unseen_document4 = open('/home/sjhuskey/Dropbox/What-is-digital-latin/test_texts/lactantius.txt')
unseen_text4 = unseen_document4.read()
bow_vector = dictionary.doc2bow(prepare_text(unseen_text4))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("LDA Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("\nTF-IDF Score: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

LDA Score: 0.6372178196907043	 Topic: 0.018*"carmen1" + 0.012*"canto" + 0.007*"curo" + 0.007*"silva" + 0.007*"amor"
LDA Score: 0.22623443603515625	 Topic: 0.009*"venio" + 0.007*"carmen1" + 0.007*"puer" + 0.005*"pecus2" + 0.005*"omne"
LDA Score: 0.1364828199148178	 Topic: 0.008*"grex" + 0.007*"pecus2" + 0.005*"tempus" + 0.005*"herba" + 0.005*"ovile"

TF-IDF Score: 0.6848052740097046	 
Topic: 0.002*"levo1" + 0.001*"meroe" + 0.001*"curo" + 0.001*"vitula" + 0.001*"perdo"

TF-IDF Score: 0.3150583505630493	 
Topic: 0.002*"gallus1" + 0.001*"venio" + 0.001*"cresco" + 0.001*"vester" + 0.001*"capio"


In [18]:
# Test a short passage against the lda models. This is passage from Caesar's commentary on the civil war.

unseen_text5 = '''Litteris C. Caesaris consulibus redditis aegre ab his impetratum est summa tribunorum plebis contentione, ut in senatu recitarentur; ut vero ex litteris ad senatum referretur, impetrari non potuit. Referunt consules de re publica [in civitate]. [Incitat] L. Lentulus consul senatu rei publicae se non defuturum pollicetur, si audacter ac fortiter sententias dicere velint; sin Caesarem respiciant atque eius gratiam sequantur, ut superioribus fecerint temporibus, se sibi consilium capturum neque senatus auctoritati obtemperaturum: habere se quoque ad Caesaris gratiam atque amicitiam receptum.'''
bow_vector = dictionary.doc2bow(prepare_text(unseen_text5))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("LDA Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("\nTF-IDF Score: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

LDA Score: 0.7339155673980713	 Topic: 0.018*"carmen1" + 0.012*"canto" + 0.007*"curo" + 0.007*"silva" + 0.007*"amor"
LDA Score: 0.1444583684206009	 Topic: 0.009*"venio" + 0.007*"carmen1" + 0.007*"puer" + 0.005*"pecus2" + 0.005*"omne"
LDA Score: 0.10685240477323532	 Topic: 0.008*"grex" + 0.007*"pecus2" + 0.005*"tempus" + 0.005*"herba" + 0.005*"ovile"
LDA Score: 0.014773598872125149	 Topic: 0.001*"carmen1" + 0.001*"canto" + 0.001*"amor" + 0.001*"dico2" + 0.001*"venio"

TF-IDF Score: 0.8613746166229248	 
Topic: 0.002*"levo1" + 0.001*"meroe" + 0.001*"curo" + 0.001*"vitula" + 0.001*"perdo"

TF-IDF Score: 0.10730376839637756	 
Topic: 0.002*"gallus1" + 0.001*"venio" + 0.001*"cresco" + 0.001*"vester" + 0.001*"capio"

TF-IDF Score: 0.0158771313726902	 
Topic: 0.002*"meliboee" + 0.001*"pectus" + 0.001*"felix1" + 0.001*"do" + 0.001*"pinus"

TF-IDF Score: 0.015444422140717506	 
Topic: 0.001*"venio" + 0.001*"facio" + 0.001*"vitula" + 0.001*"caper" + 0.001*"alter"


In [19]:
# Visualize topics.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model1.gensim')

import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# Saliency: a measure of how much the term tells you about the topic.

# Relevance: a weighted average of the probability of the word given the topic and the word given the topic normalized by the probability of the topic.

# The size of the bubble measures the importance of the topics, relative to the data.

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [40]:
# Visualize topics.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model2.gensim')

import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# Saliency: a measure of how much the term tells you about the topic.

# Relevance: a weighted average of the probability of the word given the topic and the word given the topic normalized by the probability of the topic.

# The size of the bubble measures the importance of the topics, relative to the data.

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
