In [4]:
# import relative packages
import numpy as np
from gensim import corpora, models, similarities
from cheaters import dctConstr
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
# loading carmilla.txt
with open('carmilla.txt', 'r') as f:
    corp = f.read()

dct = dctConstr(stop_words=["i", "you", "a"], ignore_case=True)
dct.constructor(corp)

def split_by_paragraphs(data:str) -> []:
    processed=data.lower()
    while '\n\n\n' in processed:
        processed=processed.replace('\n\n\n','\n\n')
    out = processed.split('\n\n')
    return [o.replace("\n", " ") for o in out]

pcorp = split_by_paragraphs(corp)
pbow = [dct(para) for para in pcorp]
ptfidf = [dct.tfidf(para) for para in pcorp]
pvec = [dct.bow_to_vec(p) for p in pbow]
idx_to_terms = {i:j for j, i in dct.terms.items()} # flip dictionary for reversal
print(f"The corpus consists of {len(pvec[0])} terms and {len(pcorp)} documents")

The corpus consists of 4204 terms and 676 documents


In [6]:
# data prepraring
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in pcorp
]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

len(dictionary)
print(list(dictionary.values())[:10])
print(list(dct.terms.keys())[:10])

2023-01-13 13:26:43,048 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2023-01-13 13:26:43,069 : INFO : built Dictionary(6062 unique tokens: ['carmilla', 'j.', 'lefanu', 'sheridan', '1872']...) from 676 documents (total 22887 corpus positions)
2023-01-13 13:26:43,070 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(6062 unique tokens: ['carmilla', 'j.', 'lefanu', 'sheridan', '1872']...) from 676 documents (total 22887 corpus positions)", 'datetime': '2023-01-13T13:26:43.070541', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 18:29:29) \n[Clang 12.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


['carmilla', 'j.', 'lefanu', 'sheridan', '1872', 'prologue', '_upon', 'accompanies', 'attached', 'doctor']
['⊹', '⊰', '⊱', '∃', '∀', '⊤', '⊥', '∊', '⋃', '⋼']


In [7]:
# calculate perplexity
lda = models.ldamodel.LdaModel(corpus=pbow, id2word=idx_to_terms, num_topics=11)
perplexity = lda.log_perplexity(pbow)
print("The perplexity for the txt：")
print(perplexity)

2023-01-13 13:26:44,035 : INFO : using symmetric alpha at 0.09090909090909091
2023-01-13 13:26:44,043 : INFO : using symmetric eta at 0.09090909090909091
2023-01-13 13:26:44,045 : INFO : using serial LDA version on this node
2023-01-13 13:26:44,053 : INFO : running online (single-pass) LDA training, 11 topics, 1 passes over the supplied corpus of 676 documents, updating model once every 676 documents, evaluating perplexity every 676 documents, iterating 50x with a convergence threshold of 0.001000
2023-01-13 13:26:44,382 : INFO : -11.099 per-word bound, 2194.2 perplexity estimate based on a held-out corpus of 676 documents with 26584 words
2023-01-13 13:26:44,382 : INFO : PROGRESS: pass 0, at document #676/676
2023-01-13 13:26:44,632 : INFO : topic #4 (0.091): 0.029*"and" + 0.023*"in" + 0.023*"to" + 0.022*"the" + 0.019*""" + 0.019*"my" + 0.017*"it" + 0.017*"her" + 0.016*"she" + 0.015*"was"
2023-01-13 13:26:44,633 : INFO : topic #8 (0.091): 0.071*"the" + 0.032*"of" + 0.027*"and" + 0.019

The perplexity for the txt：
-7.318121069523275


In [8]:
# 80% training and 20% test sets
import random
random.seed(11) #set random seed
# shuffle corpus
cp = list(corpus)
random.shuffle(cp)

# split into 80% training and 20% test sets
p = int(len(cp) * .8)
cp_train = pbow[0:p]
cp_test = pbow[p:]
lda = models.ldamodel.LdaModel(corpus=cp_train, id2word=idx_to_terms, num_topics=11)
perplexity = lda.log_perplexity(cp_test)
print("The perplexity for the text：")
print(perplexity)

2023-01-13 13:26:45,589 : INFO : using symmetric alpha at 0.09090909090909091
2023-01-13 13:26:45,592 : INFO : using symmetric eta at 0.09090909090909091
2023-01-13 13:26:45,594 : INFO : using serial LDA version on this node
2023-01-13 13:26:45,603 : INFO : running online (single-pass) LDA training, 11 topics, 1 passes over the supplied corpus of 540 documents, updating model once every 540 documents, evaluating perplexity every 540 documents, iterating 50x with a convergence threshold of 0.001000
2023-01-13 13:26:45,867 : INFO : -11.768 per-word bound, 3488.3 perplexity estimate based on a held-out corpus of 540 documents with 20241 words
2023-01-13 13:26:45,867 : INFO : PROGRESS: pass 0, at document #540/540
2023-01-13 13:26:46,069 : INFO : topic #2 (0.091): 0.018*"in" + 0.017*"to" + 0.016*""" + 0.012*"said" + 0.011*"that" + 0.011*"and" + 0.009*"of" + 0.008*"the" + 0.008*"have" + 0.007*"her"
2023-01-13 13:26:46,069 : INFO : topic #1 (0.091): 0.034*"and" + 0.027*"the" + 0.025*"to" + 0

The perplexity for the text：
-11.093206786386723
