In [None]:
# Following https://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html

In [1]:
import gensim
import glob
import itertools
import os

In [2]:
import logging
# Emit log records at INFO and above, formatted as "LEVEL : message".
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# Also set the root logger's level directly
# (presumably in case basicConfig did not apply it in this kernel — TODO confirm).
logging.root.level = logging.INFO

In [3]:
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import STOPWORDS
# Next time, also use filter_extremes to keep only 100k words

In [4]:
def head(stream, n=10):
    """Return the first ``n`` items of ``stream`` as a list.

    At most ``n`` items are pulled from the iterable, so this is safe to
    use on lazy or very long streams. Fewer than ``n`` items are returned
    if the stream runs out first.
    """
    first_items = []
    for item in itertools.islice(stream, n):
        first_items.append(item)
    return first_items

In [5]:
# Given a filename, extract just the book id
def get_book_id(fn):
    """Extract the book id from a file path.

    The id is the final path component, cut off at the first underscore
    and then at the first dot.
    """
    base_name = os.path.basename(fn)
    before_underscore = base_name.partition('_')[0]
    return before_underscore.partition('.')[0]

In [6]:
def tokenize(text):
    """Split ``text`` with ``word_tokenize`` and drop stop words.

    A token is discarded when it is a member of ``STOPWORDS``; the check is
    an exact membership test on the token as produced by ``word_tokenize``.
    The remaining tokens are returned as a list in their original order.
    """
    kept_tokens = []
    for word in word_tokenize(text):
        if word in STOPWORDS:
            continue
        kept_tokens.append(word)
    return kept_tokens

In [7]:
def text_stream(text_dir):
    """Yield every ``*.txt`` file in ``text_dir`` as a ``(book_id, tokens)`` tuple.

    Parameters
    ----------
    text_dir : str
        Directory that is searched (non-recursively) for ``*.txt`` files.

    Yields
    ------
    tuple
        ``(book_id, tokens)`` where ``book_id`` comes from ``get_book_id`` on
        the file path and ``tokens`` is ``tokenize`` applied to the file's text.

    Files are visited in sorted path order so that repeated passes over the
    same directory see the documents in the same order.
    """
    # Build the pattern from the argument so the function reads the directory it was given.
    pattern = os.path.join(text_dir, '*.txt')
    # glob returns matches in arbitrary order; sort for a stable document order.
    for fn in sorted(glob.glob(pattern)):
        book_id = get_book_id(fn)
        with open(fn, 'r') as f:
            document = f.read()
        # Yield only after the file is closed, so no handle stays open while
        # the generator is suspended waiting for its consumer.
        yield (book_id, tokenize(document))

In [8]:
# Directory of novel text files; passed to text_stream() and NovelCorpus below.
text_dir = "tm_texts"

In [9]:
# Peek at the first 10 (book_id, tokens) pairs (head's default n).
# Note: this displays each book's complete token list, so the output is long.
head(text_stream(text_dir))

[('36105213320000',
  ['year',
   'france',
   'illustrations',
   'london',
   'bedford',
   'street',
   'covent',
   'garden',
   'new',
   'york',
   'scribner',
   'welford',
   'chapter',
   'ii',
   'louis',
   'chapter',
   'vi',
   'chapter',
   'vii',
   'accident',
   'ix',
   'france',
   'page',
   'chapter',
   'xi',
   'xii',
   'xiv',
   'xvi',
   'xvii',
   'xix',
   'xxiii',
   'xxiv',
   'xxv',
   'year',
   'little',
   'moment',
   'louis',
   'hour',
   'narr',
   'year',
   'dieppe',
   'tion',
   'square',
   'yards',
   'year',
   'bygone',
   'days',
   'threescore',
   'years',
   'french',
   'english',
   'years',
   'lifetime',
   'second',
   'long',
   'years',
   'second',
   'louis',
   'cupids',
   'french',
   'china',
   'year',
   'evening',
   'day',
   'morning',
   'morning',
   'morning',
   'louis',
   'day',
   'week',
   'year',
   'louis',
   'morrow',
   'yesterday',
   'thursday',
   'larousse',
   'fãªte',
   'protes',
   'day',
   'seco

In [10]:
# Compact preview: the book id and first 10 tokens of the first 5 books.
first_five_books = itertools.islice(text_stream(text_dir), 5)
for book_id, tokens in first_five_books:
    leading_tokens = tokens[:10]
    print(book_id, leading_tokens)

36105213320000 ['year', 'france', 'illustrations', 'london', 'bedford', 'street', 'covent', 'garden', 'new', 'york']
36105213320018 ['cornish', 'township', 'old', 'cornish', 'township', 'old', 'vogue', 'folk', 'pentreath', 'paternoster']
36105213320026 ['chapter', 'iv', 'chapter', 'vi', 'viii', 'golden', 'square', 'ix', 'xii', 'miss']
36105213320034 ['mohtgomeeyherbert', 'manners', 'london', 'ocbtmtrg', 'lesson', 'unselfish', 'ness', 'ball', 'lesson', 'french']
36105213320042 ['spottiswoodb', 'hy', 'london', 'richard', 'bentley', 'slorg', 'family', 'little', 'village', 'grinfield']


In [11]:
# Lazily stream just the token lists (book ids dropped).
# This is a generator expression, so it can be consumed only once:
# re-run this cell before building the dictionary again.
doc_stream = (tokens for _, tokens in text_stream(text_dir))

In [12]:
# Build the gensim Dictionary from the token stream (this consumes doc_stream);
# %time reports how long the pass takes.
%time id2word_novels = gensim.corpora.Dictionary(doc_stream)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(106782 unique tokens: ['abbey', 'academy', 'accident', 'afternoon', 'afternoons']...) from 1417 documents (total 2597945 corpus positions)


CPU times: user 12.5 s, sys: 281 ms, total: 12.8 s
Wall time: 12.8 s


In [13]:
# Show the dictionary summary (number of unique tokens and a few samples).
print(id2word_novels)

Dictionary(106782 unique tokens: ['abbey', 'academy', 'accident', 'afternoon', 'afternoons']...)


In [20]:
# Dictionary.filter_extremes() prunes the dictionary IN PLACE and returns None,
# so binding its return value would leave id2word_novels_filtered set to None.
# Call it for its side effect, then bind the name to the pruned dictionary.
# no_below=2 drops tokens seen in fewer than 2 documents; no_above=1.0 keeps
# tokens no matter how large a fraction of the documents they appear in.
id2word_novels.filter_extremes(no_below=2, no_above=1.0)
id2word_novels_filtered = id2word_novels

INFO : discarding 0 tokens: []...
INFO : keeping 25758 tokens which were in no less than 2 and no more than 1417 (=100.0%) documents
INFO : resulting dictionary: Dictionary(25758 unique tokens: ['abbey', 'academy', 'accident', 'afternoons', 'allons']...)


In [21]:
class NovelCorpus(object):
    """Streamed bag-of-words corpus over the novels in a directory.

    Every iteration re-reads the texts through ``text_stream`` and yields one
    ``dictionary.doc2bow(tokens)`` vector per book, so the whole corpus is
    never held in memory at once.

    Parameters
    ----------
    text_dir : str
        Directory handed to ``text_stream`` on each pass.
    dictionary : object
        Object whose ``doc2bow(tokens)`` converts a token list into a vector.

    ``book_ids`` is rebuilt during each iteration: after a complete pass it
    holds the book ids in the same order as the vectors that were yielded.
    """

    def __init__(self, text_dir, dictionary):
        self.text_dir = text_dir
        self.dictionary = dictionary
        # Defined here so the attribute exists even before the first iteration.
        self.book_ids = []

    def __iter__(self):
        # Start afresh on every pass so ids do not pile up across passes.
        self.book_ids = []
        # Read from this instance's own directory rather than a notebook global.
        for book_id, tokens in text_stream(self.text_dir):
            self.book_ids.append(book_id)
            yield self.dictionary.doc2bow(tokens)

In [26]:
# Wrap the text directory and the dictionary in a streamed bag-of-words corpus.
novel_corpus = NovelCorpus(text_dir, id2word_novels)

In [27]:
# Sanity check: pull only the first document's doc2bow vector and print it.
# Side effect: starting an iteration resets novel_corpus.book_ids.
vector = next(iter(novel_corpus))
print(vector)

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 14), (17, 3), (18, 3), (19, 2), (20, 1), (21, 1), (22, 2), (23, 2), (24, 1), (25, 3), (26, 1), (27, 2), (28, 2), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 4), (38, 1), (39, 1), (40, 4), (41, 1), (42, 1), (43, 1), (44, 1), (45, 51), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 1), (54, 1), (55, 1), (56, 2), (57, 27), (58, 14), (59, 1), (60, 1), (61, 1), (62, 7), (63, 1), (64, 3), (65, 54), (66, 1), (67, 1), (68, 2), (69, 1), (70, 2), (71, 1), (72, 1), (73, 2), (74, 1), (75, 1), (76, 1), (77, 4), (78, 4), (79, 1), (80, 1), (81, 8), (82, 32), (83, 8), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 2), (91, 1), (92, 1), (93, 1), (94, 2), (95, 1), (96, 1), (97, 1), (98, 1), (99, 5), (100, 2), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 5), (109, 2), (11

In [28]:
# Stream the whole corpus to disk as ./novels_bow_lg.mm via MmCorpus.serialize
# so later steps can read it back instead of re-tokenizing the texts; %time reports the cost.
%time gensim.corpora.MmCorpus.serialize("./novels_bow_lg.mm", novel_corpus)

INFO : storing corpus in Matrix Market format to ./novels_bow_lg.mm
INFO : saving sparse matrix to ./novels_bow_lg.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : saved 1417x25758 matrix, density=1.560% (569208/36499086)
INFO : saving MmCorpus index to ./novels_bow_lg.mm.index


CPU times: user 12.9 s, sys: 219 ms, total: 13.1 s
Wall time: 13.2 s


In [29]:
# Re-open the serialized corpus from disk; this is the corpus used for training below.
j_corpus = gensim.corpora.MmCorpus("./novels_bow_lg.mm")

INFO : loaded corpus index from ./novels_bow_lg.mm.index
INFO : initializing cython corpus reader from ./novels_bow_lg.mm
INFO : accepted corpus with 1417 documents, 25758 features, 569208 non-zero entries


In [30]:
# Show the corpus summary (documents, features, non-zero entries).
print(j_corpus)

MmCorpus(1417 documents, 25758 features, 569208 non-zero entries)


In [31]:
%time lda_model = gensim.models.LdaModel(j_corpus, num_topics=60, id2word=id2word_novels, passes=50)

INFO : using symmetric alpha at 0.016666666666666666
INFO : using symmetric eta at 0.016666666666666666
INFO : using serial LDA version on this node
INFO : running online (multi-pass) LDA training, 60 topics, 50 passes over the supplied corpus of 1417 documents, updating model once every 1417 documents, evaluating perplexity every 1417 documents, iterating 50x with a convergence threshold of 0.001000
INFO : -13.485 per-word bound, 11468.4 perplexity estimate based on a held-out corpus of 1417 documents with 1506783 words
INFO : PROGRESS: pass 0, at document #1417/1417
INFO : topic #28 (0.017): 0.013*"israel" + 0.010*"vo" + 0.008*"crown" + 0.008*"cambridge" + 0.007*"castle" + 0.007*"abbey" + 0.006*"edition" + 0.005*"annals" + 0.005*"amalfi" + 0.005*"rome"
INFO : topic #2 (0.017): 0.028*"vo" + 0.013*"crown" + 0.009*"italy" + 0.006*"edition" + 0.005*"captain" + 0.004*"rome" + 0.004*"grange" + 0.003*"american" + 0.003*"scotland" + 0.003*"life"
INFO : topic #9 (0.017): 0.016*"grange" + 0.00

INFO : topic #20 (0.017): 0.048*"wilton" + 0.016*"effingham" + 0.015*"louis" + 0.011*"sydney" + 0.009*"arden" + 0.009*"prince" + 0.009*"heathfield" + 0.009*"englishmen" + 0.007*"cleveland" + 0.006*"jewish"
INFO : topic #42 (0.017): 0.041*"tanat" + 0.033*"myfanwy" + 0.027*"owen" + 0.027*"snap" + 0.020*"welsh" + 0.013*"march" + 0.010*"john" + 0.008*"wales" + 0.008*"dublin" + 0.007*"knights"
INFO : topic diff=2.880218, rho=0.353553
INFO : -7.912 per-word bound, 240.8 perplexity estimate based on a held-out corpus of 1417 documents with 1506783 words
INFO : PROGRESS: pass 7, at document #1417/1417
INFO : topic #50 (0.017): 0.086*"charlotte" + 0.036*"augusta" + 0.022*"australian" + 0.021*"montmorency" + 0.017*"march" + 0.014*"kings" + 0.014*"wool" + 0.011*"spinsters" + 0.011*"val" + 0.010*"markham"
INFO : topic #35 (0.017): 0.032*"bohemia" + 0.016*"mordaunt" + 0.012*"watch" + 0.011*"sunshine" + 0.009*"marvel" + 0.009*"lancashire" + 0.009*"state" + 0.009*"grace" + 0.007*"rivers" + 0.007*"pem

INFO : topic #22 (0.017): 0.024*"temple" + 0.023*"vernon" + 0.018*"ninon" + 0.015*"sevens" + 0.015*"printer" + 0.014*"calico" + 0.013*"grace" + 0.012*"inn" + 0.012*"sackville" + 0.011*"bull"
INFO : topic #0 (0.017): 0.027*"castle" + 0.021*"march" + 0.018*"anglomaniacs" + 0.018*"axhill" + 0.014*"leixlip" + 0.010*"eveleen" + 0.009*"rivers" + 0.008*"cornish" + 0.008*"york" + 0.008*"idalia"
INFO : topic diff=0.655537, rho=0.258199
INFO : -7.759 per-word bound, 216.5 perplexity estimate based on a held-out corpus of 1417 documents with 1506783 words
INFO : PROGRESS: pass 14, at document #1417/1417
INFO : topic #32 (0.017): 0.032*"laurel" + 0.027*"wilfred" + 0.024*"oban" + 0.023*"eglantine" + 0.021*"town" + 0.018*"leicester" + 0.016*"miser" + 0.016*"tasmania" + 0.012*"bartle" + 0.012*"naples"
INFO : topic #28 (0.017): 0.067*"israel" + 0.025*"cambridge" + 0.022*"abbey" + 0.014*"balliol" + 0.014*"tudor" + 0.010*"travers" + 0.010*"depths" + 0.009*"annals" + 0.009*"fairfield" + 0.008*"oxford"
IN

INFO : topic #53 (0.017): 0.019*"bishop" + 0.019*"king" + 0.018*"lincoln" + 0.013*"moreville" + 0.012*"queen" + 0.011*"tower" + 0.010*"louis" + 0.010*"journal" + 0.009*"fair" + 0.009*"master"
INFO : topic #50 (0.017): 0.099*"charlotte" + 0.042*"augusta" + 0.022*"montmorency" + 0.021*"australian" + 0.018*"march" + 0.015*"kings" + 0.015*"wool" + 0.012*"spinsters" + 0.012*"markham" + 0.012*"val"
INFO : topic diff=0.189995, rho=0.213201
INFO : -7.719 per-word bound, 210.7 perplexity estimate based on a held-out corpus of 1417 documents with 1506783 words
INFO : PROGRESS: pass 21, at document #1417/1417
INFO : topic #14 (0.017): 0.083*"rome" + 0.048*"roman" + 0.021*"romans" + 0.020*"greek" + 0.014*"christians" + 0.012*"greeks" + 0.011*"hayward" + 0.010*"shires" + 0.010*"constantinople" + 0.010*"persian"
INFO : topic #28 (0.017): 0.067*"israel" + 0.033*"cambridge" + 0.019*"abbey" + 0.014*"balliol" + 0.014*"tudor" + 0.010*"travers" + 0.010*"fairfield" + 0.010*"depths" + 0.009*"annals" + 0.008

INFO : topic #12 (0.017): 0.051*"chinese" + 0.019*"china" + 0.016*"pekin" + 0.012*"japanese" + 0.011*"hilda" + 0.011*"seat" + 0.011*"council" + 0.010*"sybil" + 0.010*"village" + 0.010*"thirlby"
INFO : topic #48 (0.017): 0.038*"sydney" + 0.034*"australia" + 0.022*"melbourne" + 0.019*"captain" + 0.018*"australian" + 0.008*"fernleigh" + 0.008*"town" + 0.008*"sea" + 0.007*"lasses" + 0.007*"colonial"
INFO : topic #4 (0.017): 0.037*"gerty" + 0.021*"sydney" + 0.013*"op" + 0.012*"mcleod" + 0.012*"grace" + 0.011*"haddon" + 0.009*"malta" + 0.008*"captain" + 0.008*"chariot" + 0.008*"monday"
INFO : topic diff=0.080303, rho=0.185695
INFO : -7.699 per-word bound, 207.7 perplexity estimate based on a held-out corpus of 1417 documents with 1506783 words
INFO : PROGRESS: pass 28, at document #1417/1417
INFO : topic #47 (0.017): 0.041*"manor" + 0.026*"fairfax" + 0.022*"mountains" + 0.013*"agnes" + 0.010*"copley" + 0.010*"gotham" + 0.009*"socialist" + 0.008*"wilfred" + 0.008*"grace" + 0.008*"temple"
INFO

INFO : topic #4 (0.017): 0.042*"gerty" + 0.021*"sydney" + 0.014*"chichester" + 0.013*"op" + 0.012*"mcleod" + 0.012*"grace" + 0.012*"haddon" + 0.009*"malta" + 0.008*"chariot" + 0.008*"captain"
INFO : topic #52 (0.017): 0.047*"oxford" + 0.023*"mount" + 0.014*"sea" + 0.013*"carmel" + 0.012*"myrtle" + 0.011*"campion" + 0.009*"western" + 0.008*"antonio" + 0.006*"park" + 0.005*"boston"
INFO : topic #12 (0.017): 0.052*"chinese" + 0.020*"china" + 0.016*"pekin" + 0.012*"japanese" + 0.011*"hilda" + 0.011*"seat" + 0.011*"council" + 0.010*"sybil" + 0.010*"village" + 0.010*"thirlby"
INFO : topic diff=0.046183, rho=0.166667
INFO : -7.685 per-word bound, 205.8 perplexity estimate based on a held-out corpus of 1417 documents with 1506783 words
INFO : PROGRESS: pass 35, at document #1417/1417
INFO : topic #55 (0.017): 0.031*"italy" + 0.027*"mademoiselle" + 0.020*"la" + 0.020*"lã" + 0.016*"kitty" + 0.015*"le" + 0.015*"countess" + 0.015*"duchesse" + 0.012*"marmorne" + 0.011*"du"
INFO : topic #11 (0.017):

INFO : topic #25 (0.017): 0.058*"russian" + 0.038*"russia" + 0.018*"cairo" + 0.015*"petersburg" + 0.015*"russians" + 0.013*"europe" + 0.013*"moscow" + 0.011*"polish" + 0.011*"poland" + 0.010*"egypt"
INFO : topic #48 (0.017): 0.038*"sydney" + 0.034*"australia" + 0.023*"melbourne" + 0.021*"australian" + 0.019*"captain" + 0.009*"cape" + 0.009*"town" + 0.008*"fernleigh" + 0.008*"sea" + 0.007*"colonial"
INFO : topic #59 (0.017): 0.038*"highland" + 0.037*"alchemy" + 0.034*"francs" + 0.033*"lorimer" + 0.032*"sentimental" + 0.031*"sex" + 0.024*"dulcie" + 0.022*"lille" + 0.020*"trevanion" + 0.016*"gray"
INFO : topic diff=0.032055, rho=0.152499
INFO : -7.675 per-word bound, 204.4 perplexity estimate based on a held-out corpus of 1417 documents with 1506783 words
INFO : PROGRESS: pass 42, at document #1417/1417
INFO : topic #5 (0.017): 0.052*"lettice" + 0.042*"kalulu" + 0.027*"selim" + 0.027*"manners" + 0.025*"forsyth" + 0.021*"arab" + 0.018*"carlo" + 0.017*"reader" + 0.017*"arabs" + 0.016*"scrip

INFO : topic #5 (0.017): 0.053*"lettice" + 0.043*"kalulu" + 0.027*"selim" + 0.027*"manners" + 0.026*"forsyth" + 0.021*"arab" + 0.018*"carlo" + 0.017*"reader" + 0.017*"arabs" + 0.017*"scripture"
INFO : topic #37 (0.017): 0.025*"saxon" + 0.021*"merle" + 0.018*"normandy" + 0.018*"mercy" + 0.017*"dayrell" + 0.017*"saxons" + 0.015*"carewes" + 0.010*"individualist" + 0.010*"anjou" + 0.010*"normans"
INFO : topic #57 (0.017): 0.035*"delhi" + 0.016*"malay" + 0.016*"maitland" + 0.014*"op" + 0.014*"cashmere" + 0.012*"wales" + 0.012*"malays" + 0.011*"captain" + 0.010*"bismillah" + 0.009*"melancthon"
INFO : topic diff=0.024651, rho=0.141421
INFO : -7.668 per-word bound, 203.3 perplexity estimate based on a held-out corpus of 1417 documents with 1506783 words
INFO : PROGRESS: pass 49, at document #1417/1417
INFO : topic #59 (0.017): 0.038*"highland" + 0.037*"alchemy" + 0.034*"francs" + 0.034*"lorimer" + 0.032*"sentimental" + 0.032*"sex" + 0.024*"dulcie" + 0.022*"lille" + 0.021*"trevanion" + 0.016*"g

CPU times: user 2h 27min 31s, sys: 2h 56min 46s, total: 5h 24min 17s
Wall time: 14min 28s


In [32]:
# Persist the trained model to novels_60_lg.model
# (the log below shows companion files written alongside it).
lda_model.save('novels_60_lg.model')

INFO : saving LdaState object under novels_60_lg.model.state, separately None
INFO : saved novels_60_lg.model.state
INFO : saving LdaModel object under novels_60_lg.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to novels_60_lg.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute state
INFO : not storing attribute id2word
INFO : saved novels_60_lg.model


In [33]:
# List the top words of every topic (-1 requests all topics). The topics are
# both logged at INFO level and returned, hence the duplicated output below.
lda_model.print_topics(-1)

INFO : topic #0 (0.017): 0.027*"castle" + 0.024*"march" + 0.020*"anglomaniacs" + 0.019*"axhill" + 0.015*"leixlip" + 0.011*"eveleen" + 0.010*"cornish" + 0.009*"rivers" + 0.009*"york" + 0.008*"idalia"
INFO : topic #1 (0.017): 0.041*"la" + 0.019*"captain" + 0.013*"vols" + 0.012*"grace" + 0.009*"illustrated" + 0.008*"bank" + 0.007*"harcourt" + 0.007*"cassell" + 0.007*"story" + 0.007*"song"
INFO : topic #2 (0.017): 0.069*"italy" + 0.023*"rome" + 0.019*"naples" + 0.013*"florence" + 0.008*"venice" + 0.008*"italians" + 0.008*"kingdom" + 0.007*"austrian" + 0.007*"san" + 0.007*"fate"
INFO : topic #3 (0.017): 0.123*"vo" + 0.074*"crown" + 0.014*"post" + 0.013*"illustrations" + 0.012*"vols" + 0.010*"illusts" + 0.008*"novels" + 0.007*"stories" + 0.007*"notes" + 0.007*"piccadilly"
INFO : topic #4 (0.017): 0.046*"gerty" + 0.022*"sydney" + 0.017*"chichester" + 0.014*"op" + 0.013*"mcleod" + 0.012*"grace" + 0.012*"haddon" + 0.010*"malta" + 0.009*"chariot" + 0.008*"captain"
INFO : topic #5 (0.017): 0.053*

INFO : topic #42 (0.017): 0.051*"welsh" + 0.045*"tanat" + 0.037*"myfanwy" + 0.030*"owen" + 0.028*"snap" + 0.017*"wales" + 0.013*"march" + 0.011*"john" + 0.008*"knights" + 0.008*"tuesday"
INFO : topic #43 (0.017): 0.054*"leadenhall" + 0.032*"press" + 0.020*"huguenot" + 0.018*"eve" + 0.018*"morn" + 0.017*"till" + 0.016*"marchmont" + 0.015*"vane" + 0.014*"conway" + 0.013*"wheel"
INFO : topic #44 (0.017): 0.083*"club" + 0.031*"fergus" + 0.028*"mystery" + 0.015*"wood" + 0.013*"master" + 0.012*"captain" + 0.012*"mask" + 0.010*"raid" + 0.008*"cross" + 0.008*"davenport"
INFO : topic #45 (0.017): 0.047*"edition" + 0.022*"mo" + 0.021*"vo" + 0.014*"illustrations" + 0.013*"esq" + 0.011*"life" + 0.009*"morocco" + 0.008*"academy" + 0.008*"vols" + 0.007*"gd"
INFO : topic #46 (0.017): 0.044*"palliser" + 0.028*"maori" + 0.025*"alps" + 0.022*"albion" + 0.013*"spider" + 0.012*"victorine" + 0.012*"pakeha" + 0.011*"captain" + 0.011*"belmont" + 0.010*"isle"
INFO : topic #47 (0.017): 0.070*"manor" + 0.026*"f

[(0,
  '0.027*"castle" + 0.024*"march" + 0.020*"anglomaniacs" + 0.019*"axhill" + 0.015*"leixlip" + 0.011*"eveleen" + 0.010*"cornish" + 0.009*"rivers" + 0.009*"york" + 0.008*"idalia"'),
 (1,
  '0.041*"la" + 0.019*"captain" + 0.013*"vols" + 0.012*"grace" + 0.009*"illustrated" + 0.008*"bank" + 0.007*"harcourt" + 0.007*"cassell" + 0.007*"story" + 0.007*"song"'),
 (2,
  '0.069*"italy" + 0.023*"rome" + 0.019*"naples" + 0.013*"florence" + 0.008*"venice" + 0.008*"italians" + 0.008*"kingdom" + 0.007*"austrian" + 0.007*"san" + 0.007*"fate"'),
 (3,
  '0.123*"vo" + 0.074*"crown" + 0.014*"post" + 0.013*"illustrations" + 0.012*"vols" + 0.010*"illusts" + 0.008*"novels" + 0.007*"stories" + 0.007*"notes" + 0.007*"piccadilly"'),
 (4,
  '0.046*"gerty" + 0.022*"sydney" + 0.017*"chichester" + 0.014*"op" + 0.013*"mcleod" + 0.012*"grace" + 0.012*"haddon" + 0.010*"malta" + 0.009*"chariot" + 0.008*"captain"'),
 (5,
  '0.053*"lettice" + 0.043*"kalulu" + 0.027*"selim" + 0.027*"manners" + 0.026*"forsyth" + 0.021*

In [34]:
import pyLDAvis
import pyLDAvis.gensim

INFO : Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
INFO : Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt


In [35]:
# Enable pyLDAvis notebook mode (presumably so the visualization renders inline — TODO confirm).
pyLDAvis.enable_notebook()

In [36]:
# Prepare the pyLDAvis view of the trained model from the model, the serialized
# corpus and the dictionary; as the cell's last expression, the result is displayed.
pyLDAvis.gensim.prepare(lda_model, j_corpus, id2word_novels)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
# TODO: filter out the word 'said'
# TODO: probably need to remove proper names
# TODO: remove words like 'thy', 'thou', etc.