In [None]:
# Following https://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html

In [6]:
import gensim
import glob
import itertools
import os

In [38]:
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import STOPWORDS

In [3]:
def head(stream, n=10):
    return list(itertools.islice(stream, n))

In [13]:
# Given a filename, extract just the book id
def get_book_id(fn):
    return os.path.split(fn)[1].split('_')[0].split('.')[0]

In [39]:
def tokenize(text):
    return [token for token in word_tokenize(text) if token not in STOPWORDS]

In [40]:
def text_stream(text_dir):
    #     yield each article as (title, tokens) tuple
    for fn in glob.glob('tm_texts/*.txt'):
        book_id = get_book_id(fn)
        with open(fn, 'r') as f:
            document = f.read()
            yield (book_id, tokenize(document))

In [41]:
text_dir = "tm_texts"

In [42]:
head(text_stream(text_dir))

[('36105213320042',
  ['florence',
   'montgomerythwartedlondon',
   'printed',
   'ily',
   'spottiswoodb',
   'new',
   'square',
   'parliament',
   'streetthwarted',
   'ducks',
   'eggs',
   'hen',
   'nest',
   'hy',
   'author',
   'misunderstood',
   'london',
   'richard',
   'bentley',
   'son',
   'new',
   'burlington',
   'street',
   'rights',
   'reservesito',
   'youngest',
   'brother',
   'alec',
   'cjjis',
   'iltlc',
   'slorg',
   'jlíbictütbcontents',
   'chapter',
   'page',
   'washerwoman',
   'ner',
   'family',
   'ii',
   'quiet',
   'little',
   'village',
   'grinfield',
   'iii',
   'brother',
   'sister',
   'walk',
   'iiome',
   'iv',
   'trying',
   'past',
   'fairy',
   'child',
   'big',
   'house',
   'vi',
   'letter',
   'bourn',
   'ii',
   'vii',
   'burden',
   'double',
   'secret',
   'viii',
   'parable',
   'talents',
   'vi',
   'contexts',
   'chattei',
   'paoe',
   'ix',
   'sister',
   'fairy',
   'child',
   'hie',
   'infant',
   

In [43]:
for book_id, tokens in itertools.islice(text_stream(text_dir), 5):
    print(book_id, tokens[:10])

36105213320042 ['florence', 'montgomerythwartedlondon', 'printed', 'ily', 'spottiswoodb', 'new', 'square', 'parliament', 'streetthwarted', 'ducks']
36105213325884 ['ÿê', 'êêê', 'rrr', 'jk', 'ws', 'êàthe', 'secret', 'mere', 'stir', 'face']
36105213324556 ['blind', 'larry', 'irish', 'idylls', 'lewis', 'macnamara', 'blind', 'larry', 'impressionist', 'series']
36105213325660 ['si', 'ules', 'samemorrison', 'gibb', 'irnters', 'edinburghhe', 'si', 'ules', 'ame', 'roger']
36105213334779 ['doctor', 'ditr', 'inrinry', 'author', 'tales', 'kirkbeck', 'alice', 'beresford', 'lives', 'fathers']


In [44]:
doc_stream = (tokens for _, tokens in text_stream(text_dir))

In [45]:
%time id2word_novels = gensim.corpora.Dictionary(doc_stream)

CPU times: user 6min 30s, sys: 3.93 s, total: 6min 34s
Wall time: 6min 38s


In [21]:
print(id2word_novels)

Dictionary(417787 unique tokens: ['aa', 'abideth', 'able', 'ably', 'about']...)


In [23]:
class NovelCorpus(object):
    def __init__(self, text_dir, dictionary):
        self.text_dir = text_dir
        self.dictionary = dictionary
        
    def __iter__(self):
        self.book_ids = []
        for book_id, tokens in text_stream(text_dir):
            self.book_ids.append(book_id)
            yield self.dictionary.doc2bow(tokens)

In [46]:
novel_corpus = NovelCorpus(text_dir, id2word_novels)

In [25]:
vector = next(iter(novel_corpus))
print(vector)

[(0, 1), (1, 1), (2, 15), (3, 2), (4, 67), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 2), (22, 12), (23, 1), (24, 1), (25, 1), (26, 5), (27, 1), (28, 5), (29, 2), (30, 1), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 13), (37, 2), (38, 2), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 3), (47, 1), (48, 2), (49, 5), (50, 1), (51, 8), (52, 1), (53, 1), (54, 1), (55, 1), (56, 4), (57, 1), (58, 1), (59, 13), (60, 37), (61, 9), (62, 2), (63, 35), (64, 5), (65, 1), (66, 3), (67, 1), (68, 5), (69, 5), (70, 1), (71, 1), (72, 7), (73, 2), (74, 1), (75, 3), (76, 8), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 3), (84, 1), (85, 1), (86, 3), (87, 140), (88, 1), (89, 1), (90, 1), (91, 3), (92, 1), (93, 2), (94, 14), (95, 9), (96, 12), (97, 1), (98, 5), (99, 5), (100, 10), (101, 19), (102, 16), (103, 1), (104, 3), (105, 3), (106, 1), (107, 1), (108, 1), (109,

In [47]:
%time gensim.corpora.MmCorpus.serialize("./novels_bow.mm", novel_corpus)

CPU times: user 6min 30s, sys: 3.72 s, total: 6min 34s
Wall time: 6min 37s


In [48]:
j_corpus = gensim.corpora.MmCorpus("./novels_bow.mm")

In [49]:
print(j_corpus)

MmCorpus(1417 documents, 417455 features, 10042903 non-zero entries)


In [51]:
%time lda_model = gensim.models.LdaModel(j_corpus, num_topics=40, id2word=id2word_novels, passes=10)

CPU times: user 43min 28s, sys: 7min 26s, total: 50min 54s
Wall time: 13min 54s


In [58]:
lda_model.save('novels_50.model')

In [53]:
lda_model.print_topics(-1)

[(0,
  '0.006*"said" + 0.004*"man" + 0.004*"great" + 0.004*"men" + 0.004*"time" + 0.004*"grace" + 0.003*"king" + 0.003*"thou" + 0.003*"shall" + 0.003*"life"'),
 (1,
  '0.006*"said" + 0.004*"time" + 0.004*"man" + 0.003*"men" + 0.003*"little" + 0.003*"like" + 0.003*"old" + 0.003*"great" + 0.003*"good" + 0.002*"way"'),
 (2,
  '0.020*"ye" + 0.006*"wi" + 0.006*"said" + 0.005*"ll" + 0.005*"man" + 0.004*"time" + 0.004*"like" + 0.004*"hae" + 0.004*"laird" + 0.003*"weel"'),
 (3,
  '0.003*"time" + 0.003*"said" + 0.003*"jim" + 0.003*"father" + 0.003*"god" + 0.003*"man" + 0.002*"great" + 0.002*"heart" + 0.002*"soon" + 0.002*"day"'),
 (4,
  '0.008*"said" + 0.005*"clara" + 0.004*"little" + 0.004*"lady" + 0.004*"italy" + 0.004*"time" + 0.003*"know" + 0.003*"exiles" + 0.003*"farnorth" + 0.003*"man"'),
 (5,
  '0.011*"vo" + 0.006*"said" + 0.006*"crown" + 0.006*"cloth" + 0.005*"edition" + 0.005*"man" + 0.004*"little" + 0.003*"life" + 0.003*"old" + 0.003*"time"'),
 (6,
  '0.012*"said" + 0.005*"man" + 0.00

In [59]:
import pyLDAvis
import pyLDAvis.gensim

In [60]:
pyLDAvis.enable_notebook()

In [62]:
pyLDAvis.gensim.prepare(lda_model, j_corpus, id2word_novels)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
# Need to clean word 'said'
# Probably need to remove proper names
# remove words like 'thy,' 'thou,' etc.