In [None]:
# Following https://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html

In [1]:
import gensim
import glob
import itertools
import os

In [2]:
import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO

In [3]:
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import STOPWORDS
#Next time use the extreme_filtering as well to only keep 100k words

In [4]:
def head(stream, n=10):
    return list(itertools.islice(stream, n))

In [5]:
# Given a filename, extract just the book id
def get_book_id(fn):
    return os.path.split(fn)[1].split('_')[0].split('.')[0]

In [6]:
def tokenize(text):
    return [token for token in word_tokenize(text) if token not in STOPWORDS]

In [7]:
def text_stream(text_dir):
    #     yield each article as (title, tokens) tuple
    for fn in glob.glob('tm_texts/*.txt'):
        book_id = get_book_id(fn)
        with open(fn, 'r') as f:
            document = f.read()
            yield (book_id, tokenize(document))

In [8]:
text_dir = "plain_full_clean"

In [32]:
head(text_stream(text_dir))

[('36105213320000',
  ['year',
   'france',
   'illustrations',
   'london',
   'bedford',
   'street',
   'covent',
   'garden',
   'new',
   'york',
   'scribner',
   'welford',
   'chapter',
   'ii',
   'louis',
   'chapter',
   'vi',
   'chapter',
   'vii',
   'accident',
   'ix',
   'france',
   'page',
   'chapter',
   'xi',
   'xii',
   'xiv',
   'xvi',
   'xvii',
   'xix',
   'xxiii',
   'xxiv',
   'xxv',
   'year',
   'little',
   'moment',
   'louis',
   'hour',
   'narr',
   'year',
   'dieppe',
   'tion',
   'square',
   'yards',
   'year',
   'bygone',
   'days',
   'threescore',
   'years',
   'french',
   'english',
   'years',
   'lifetime',
   'second',
   'long',
   'years',
   'second',
   'louis',
   'cupids',
   'french',
   'china',
   'year',
   'evening',
   'day',
   'morning',
   'morning',
   'morning',
   'louis',
   'day',
   'week',
   'year',
   'louis',
   'morrow',
   'yesterday',
   'thursday',
   'larousse',
   'fãªte',
   'protes',
   'day',
   'seco

In [31]:
for book_id, tokens in itertools.islice(text_stream(text_dir), 5):
    print(book_id, tokens[:10])

36105213320000 ['year', 'france', 'illustrations', 'london', 'bedford', 'street', 'covent', 'garden', 'new', 'york']
36105213320018 ['cornish', 'township', 'old', 'cornish', 'township', 'old', 'vogue', 'folk', 'pentreath', 'paternoster']
36105213320026 ['chapter', 'iv', 'chapter', 'vi', 'viii', 'golden', 'square', 'ix', 'xii', 'miss']
36105213320034 ['mohtgomeeyherbert', 'manners', 'london', 'ocbtmtrg', 'lesson', 'unselfish', 'ness', 'ball', 'lesson', 'french']
36105213320042 ['spottiswoodb', 'hy', 'london', 'richard', 'bentley', 'slorg', 'family', 'little', 'village', 'grinfield']


In [33]:
book_source_ids = [book_id for book_id, tokens in text_stream(text_dir)]

In [34]:
head(book_source_ids)

['36105213320000',
 '36105213320018',
 '36105213320026',
 '36105213320034',
 '36105213320042',
 '36105213320059',
 '36105213320067',
 '36105213320075',
 '36105213320083',
 '36105213320091']

In [11]:
doc_stream = (tokens for _, tokens in text_stream(text_dir))

In [12]:
%time id2word_novels = gensim.corpora.Dictionary(doc_stream)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(106782 unique tokens: ['abbey', 'academy', 'accident', 'afternoon', 'afternoons']...) from 1417 documents (total 2597945 corpus positions)


CPU times: user 12.5 s, sys: 516 ms, total: 13 s
Wall time: 13 s


In [13]:
print(id2word_novels)

Dictionary(106782 unique tokens: ['abbey', 'academy', 'accident', 'afternoon', 'afternoons']...)


In [None]:
id2word_novels_filtered = id2word_novels.filter_extremes(no_below=2, no_above=1)

In [14]:
class NovelCorpus(object):
    def __init__(self, text_dir, dictionary):
        self.text_dir = text_dir
        self.dictionary = dictionary
        
    def __iter__(self):
        self.book_ids = []
        for book_id, tokens in text_stream(text_dir):
            self.book_ids.append(book_id)
            yield self.dictionary.doc2bow(tokens)

In [15]:
novel_corpus = NovelCorpus(text_dir, id2word_novels)

In [16]:
vector = next(iter(novel_corpus))
print(vector)

[(0, 2), (1, 1), (2, 1), (3, 7), (4, 1), (5, 8), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 14), (21, 3), (22, 6), (23, 5), (24, 3), (25, 2), (26, 1), (27, 1), (28, 1), (29, 2), (30, 2), (31, 1), (32, 1), (33, 3), (34, 1), (35, 2), (36, 2), (37, 1), (38, 2), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 6), (47, 1), (48, 1), (49, 4), (50, 4), (51, 4), (52, 1), (53, 1), (54, 5), (55, 4), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 51), (62, 1), (63, 1), (64, 1), (65, 1), (66, 11), (67, 1), (68, 128), (69, 36), (70, 2), (71, 1), (72, 5), (73, 1), (74, 1), (75, 1), (76, 2), (77, 2), (78, 27), (79, 1), (80, 14), (81, 1), (82, 1), (83, 1), (84, 8), (85, 4), (86, 7), (87, 1), (88, 1), (89, 2), (90, 3), (91, 54), (92, 1), (93, 1), (94, 2), (95, 30), (96, 43), (97, 2), (98, 1), (99, 25), (100, 2), (101, 1), (102, 1), (103, 2), (104, 2), (105, 3), (106, 1), (107, 3), (108, 1), (109, 4

In [17]:
%time gensim.corpora.MmCorpus.serialize("./novels_bow_lg_full.mm", novel_corpus)

INFO : storing corpus in Matrix Market format to ./novels_bow_lg_full.mm
INFO : saving sparse matrix to ./novels_bow_lg_full.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : saved 1417x106782 matrix, density=0.509% (770379/151310094)
INFO : saving MmCorpus index to ./novels_bow_lg_full.mm.index


CPU times: user 13 s, sys: 281 ms, total: 13.2 s
Wall time: 13.2 s


In [18]:
j_corpus = gensim.corpora.MmCorpus("./novels_bow_lg_full.mm")

INFO : loaded corpus index from ./novels_bow_lg_full.mm.index
INFO : initializing cython corpus reader from ./novels_bow_lg_full.mm
INFO : accepted corpus with 1417 documents, 106782 features, 770379 non-zero entries


In [19]:
print(j_corpus)

MmCorpus(1417 documents, 106782 features, 770379 non-zero entries)


In [20]:
%time lda_model = gensim.models.LdaModel(j_corpus, num_topics=60, id2word=id2word_novels, passes=50, minimum_probability=0.0)

INFO : using symmetric alpha at 0.016666666666666666
INFO : using symmetric eta at 0.016666666666666666
INFO : using serial LDA version on this node
INFO : running online (multi-pass) LDA training, 60 topics, 50 passes over the supplied corpus of 1417 documents, updating model once every 1417 documents, evaluating perplexity every 1417 documents, iterating 50x with a convergence threshold of 0.001000
INFO : -18.661 per-word bound, 414537.8 perplexity estimate based on a held-out corpus of 1417 documents with 2597945 words
INFO : PROGRESS: pass 0, at document #1417/1417
INFO : topic #34 (0.017): 0.020*"day" + 0.016*"days" + 0.014*"years" + 0.010*"morning" + 0.010*"night" + 0.009*"london" + 0.009*"england" + 0.009*"english" + 0.007*"evening" + 0.006*"hour"
INFO : topic #28 (0.017): 0.027*"vo" + 0.025*"day" + 0.012*"years" + 0.012*"crown" + 0.010*"hour" + 0.010*"night" + 0.009*"morning" + 0.009*"days" + 0.009*"english" + 0.008*"england"
INFO : topic #45 (0.017): 0.037*"day" + 0.027*"years

INFO : topic #51 (0.017): 0.036*"idaline" + 0.028*"grace" + 0.027*"day" + 0.017*"harcourt" + 0.017*"thebes" + 0.015*"days" + 0.015*"temple" + 0.015*"sevens" + 0.015*"years" + 0.014*"ninon"
INFO : topic diff=1.527675, rho=0.353553
INFO : -7.694 per-word bound, 207.0 perplexity estimate based on a held-out corpus of 1417 documents with 2597945 words
INFO : PROGRESS: pass 7, at document #1417/1417
INFO : topic #28 (0.017): 0.024*"harley" + 0.018*"day" + 0.012*"hour" + 0.011*"night" + 0.011*"morning" + 0.011*"english" + 0.010*"days" + 0.010*"london" + 0.009*"french" + 0.009*"spelscraft"
INFO : topic #46 (0.017): 0.028*"day" + 0.024*"english" + 0.018*"rome" + 0.015*"years" + 0.015*"irish" + 0.014*"ireland" + 0.013*"england" + 0.012*"days" + 0.012*"church" + 0.011*"night"
INFO : topic #37 (0.017): 0.026*"day" + 0.018*"days" + 0.015*"english" + 0.014*"delhi" + 0.013*"years" + 0.012*"abbey" + 0.010*"night" + 0.009*"castle" + 0.009*"morning" + 0.008*"england"
INFO : topic #50 (0.017): 0.027*"da

INFO : topic diff=0.246313, rho=0.258199
INFO : -7.590 per-word bound, 192.7 perplexity estimate based on a held-out corpus of 1417 documents with 2597945 words
INFO : PROGRESS: pass 14, at document #1417/1417
INFO : topic #6 (0.017): 0.052*"palliser" + 0.034*"maori" + 0.017*"yorkshire" + 0.016*"albion" + 0.016*"spider" + 0.015*"merrion" + 0.015*"courtland" + 0.015*"pakeha" + 0.014*"day" + 0.014*"night"
INFO : topic #54 (0.017): 0.034*"day" + 0.020*"night" + 0.020*"nell" + 0.017*"years" + 0.017*"morning" + 0.017*"days" + 0.014*"hour" + 0.010*"london" + 0.010*"evening" + 0.009*"minutes"
INFO : topic #34 (0.017): 0.051*"saxon" + 0.023*"day" + 0.023*"saxons" + 0.018*"kalulu" + 0.018*"arab" + 0.018*"england" + 0.016*"moto" + 0.014*"normans" + 0.013*"days" + 0.012*"marmorne"
INFO : topic #2 (0.017): 0.019*"world" + 0.019*"old" + 0.015*"gatty" + 0.013*"day" + 0.012*"ca" + 0.011*"adelaide" + 0.011*"time" + 0.009*"church" + 0.009*"days" + 0.008*"creed"
INFO : topic #51 (0.017): 0.037*"idaline"

INFO : PROGRESS: pass 21, at document #1417/1417
INFO : topic #24 (0.017): 0.025*"day" + 0.013*"new" + 0.013*"years" + 0.013*"saxon" + 0.012*"night" + 0.012*"days" + 0.011*"london" + 0.009*"normans" + 0.009*"french" + 0.009*"morning"
INFO : topic #37 (0.017): 0.025*"day" + 0.018*"days" + 0.016*"english" + 0.015*"delhi" + 0.012*"years" + 0.012*"abbey" + 0.010*"night" + 0.009*"castle" + 0.009*"th" + 0.008*"australian"
INFO : topic #46 (0.017): 0.029*"day" + 0.025*"english" + 0.023*"rome" + 0.016*"years" + 0.013*"church" + 0.012*"roman" + 0.012*"days" + 0.012*"england" + 0.011*"night" + 0.010*"hour"
INFO : topic #10 (0.017): 0.026*"day" + 0.018*"years" + 0.015*"days" + 0.014*"fernleigh" + 0.010*"naggletons" + 0.010*"violet" + 0.010*"night" + 0.010*"esq" + 0.010*"killarney" + 0.009*"miss"
INFO : topic #55 (0.017): 0.020*"day" + 0.015*"london" + 0.012*"life" + 0.011*"married" + 0.011*"house" + 0.011*"morning" + 0.009*"english" + 0.009*"maiden" + 0.008*"days" + 0.008*"years"
INFO : topic dif

INFO : topic #50 (0.017): 0.027*"day" + 0.021*"years" + 0.015*"days" + 0.015*"hour" + 0.014*"oban" + 0.013*"morning" + 0.013*"london" + 0.012*"sandham" + 0.012*"town" + 0.011*"florence"
INFO : topic #48 (0.017): 0.144*"irish" + 0.102*"ireland" + 0.049*"dublin" + 0.023*"english" + 0.018*"monmouth" + 0.012*"england" + 0.012*"protestant" + 0.010*"night" + 0.010*"court" + 0.009*"years"
INFO : topic #24 (0.017): 0.024*"day" + 0.014*"new" + 0.012*"saxon" + 0.012*"years" + 0.012*"night" + 0.011*"days" + 0.010*"london" + 0.010*"normans" + 0.009*"york" + 0.009*"camp"
INFO : topic #40 (0.017): 0.021*"day" + 0.018*"wilton" + 0.016*"years" + 0.013*"days" + 0.012*"london" + 0.012*"english" + 0.011*"night" + 0.010*"morning" + 0.009*"french" + 0.008*"hour"
INFO : topic diff=0.019932, rho=0.182574
INFO : -7.557 per-word bound, 188.3 perplexity estimate based on a held-out corpus of 1417 documents with 2597945 words
INFO : PROGRESS: pass 29, at document #1417/1417
INFO : topic #34 (0.017): 0.056*"saxon

INFO : topic #3 (0.017): 0.054*"day" + 0.034*"years" + 0.024*"days" + 0.022*"morning" + 0.019*"hour" + 0.018*"night" + 0.018*"evening" + 0.018*"bible" + 0.017*"christian" + 0.016*"christ"
INFO : topic #20 (0.017): 0.040*"day" + 0.021*"years" + 0.019*"days" + 0.014*"scotland" + 0.013*"morning" + 0.012*"edinburgh" + 0.012*"london" + 0.011*"night" + 0.010*"hour" + 0.010*"england"
INFO : topic #50 (0.017): 0.027*"day" + 0.021*"years" + 0.015*"days" + 0.015*"hour" + 0.014*"oban" + 0.013*"morning" + 0.013*"london" + 0.012*"sandham" + 0.012*"town" + 0.011*"spinsters"
INFO : topic diff=0.012015, rho=0.164399
INFO : -7.550 per-word bound, 187.4 perplexity estimate based on a held-out corpus of 1417 documents with 2597945 words
INFO : PROGRESS: pass 36, at document #1417/1417
INFO : topic #46 (0.017): 0.028*"day" + 0.026*"rome" + 0.025*"english" + 0.015*"years" + 0.014*"roman" + 0.013*"church" + 0.012*"days" + 0.011*"england" + 0.011*"night" + 0.010*"hour"
INFO : topic #36 (0.017): 0.030*"day" +

INFO : topic #11 (0.017): 0.026*"castle" + 0.023*"day" + 0.020*"india" + 0.018*"years" + 0.017*"england" + 0.016*"italian" + 0.014*"american" + 0.013*"home" + 0.013*"english" + 0.013*"shires"
INFO : topic #23 (0.017): 0.057*"vo" + 0.031*"crown" + 0.027*"edition" + 0.011*"second" + 0.011*"day" + 0.010*"new" + 0.009*"english" + 0.007*"illustrations" + 0.006*"years" + 0.006*"scotland"
INFO : topic diff=0.008888, rho=0.150756
INFO : -7.544 per-word bound, 186.7 perplexity estimate based on a held-out corpus of 1417 documents with 2597945 words
INFO : PROGRESS: pass 43, at document #1417/1417
INFO : topic #28 (0.017): 0.037*"harley" + 0.015*"day" + 0.013*"hour" + 0.011*"yoked" + 0.011*"night" + 0.011*"spelscraft" + 0.011*"dacoit" + 0.010*"english" + 0.009*"days" + 0.009*"minutes"
INFO : topic #11 (0.017): 0.026*"castle" + 0.023*"day" + 0.020*"india" + 0.018*"years" + 0.017*"england" + 0.016*"italian" + 0.014*"american" + 0.014*"home" + 0.013*"shires" + 0.013*"english"
INFO : topic #13 (0.01

INFO : topic #8 (0.017): 0.036*"day" + 0.028*"years" + 0.019*"days" + 0.018*"morning" + 0.015*"night" + 0.014*"hayward" + 0.012*"montmorency" + 0.011*"year" + 0.010*"hour" + 0.010*"evening"
INFO : topic diff=0.007216, rho=0.140028


CPU times: user 3h 14min 58s, sys: 3h 49min 35s, total: 7h 4min 33s
Wall time: 19min 47s


In [21]:
lda_model.save('novels_60_lg_full.model')

INFO : saving LdaState object under novels_60_lg_full.model.state, separately None
INFO : saved novels_60_lg_full.model.state
INFO : saving LdaModel object under novels_60_lg_full.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to novels_60_lg_full.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved novels_60_lg_full.model


In [22]:
lda_model.print_topics(-1)

INFO : topic #0 (0.017): 0.118*"vo" + 0.062*"crown" + 0.010*"edition" + 0.009*"day" + 0.009*"illustrations" + 0.007*"english" + 0.007*"years" + 0.006*"london" + 0.006*"days" + 0.006*"night"
INFO : topic #1 (0.017): 0.035*"day" + 0.021*"london" + 0.020*"years" + 0.019*"night" + 0.019*"days" + 0.013*"morning" + 0.012*"hour" + 0.008*"second" + 0.008*"grange" + 0.008*"hours"
INFO : topic #2 (0.017): 0.022*"world" + 0.020*"old" + 0.017*"gatty" + 0.014*"ca" + 0.013*"time" + 0.012*"adelaide" + 0.009*"creed" + 0.009*"day" + 0.007*"days" + 0.006*"crayford"
INFO : topic #3 (0.017): 0.055*"day" + 0.035*"years" + 0.024*"days" + 0.023*"morning" + 0.019*"hour" + 0.019*"bible" + 0.019*"night" + 0.019*"evening" + 0.018*"christian" + 0.016*"christ"
INFO : topic #4 (0.017): 0.109*"vernon" + 0.044*"experiment" + 0.019*"galbray" + 0.019*"empiric" + 0.014*"mahomedan" + 0.012*"nawab" + 0.011*"english" + 0.010*"rangers" + 0.006*"india" + 0.006*"rownpore"
INFO : topic #5 (0.017): 0.045*"chinese" + 0.023*"engl

INFO : topic #43 (0.017): 0.020*"danish" + 0.014*"yorkshire" + 0.013*"english" + 0.013*"garriton" + 0.013*"cape" + 0.013*"day" + 0.012*"crown" + 0.011*"vo" + 0.010*"night" + 0.009*"days"
INFO : topic #44 (0.017): 0.028*"italy" + 0.023*"day" + 0.018*"years" + 0.015*"days" + 0.013*"morning" + 0.013*"night" + 0.011*"hour" + 0.009*"evening" + 0.009*"florence" + 0.008*"english"
INFO : topic #45 (0.017): 0.040*"day" + 0.033*"years" + 0.021*"days" + 0.019*"morning" + 0.016*"night" + 0.014*"year" + 0.014*"london" + 0.013*"hour" + 0.012*"second" + 0.012*"evening"
INFO : topic #46 (0.017): 0.028*"day" + 0.028*"rome" + 0.025*"english" + 0.015*"years" + 0.014*"roman" + 0.013*"church" + 0.012*"days" + 0.011*"england" + 0.010*"night" + 0.010*"hour"
INFO : topic #47 (0.017): 0.041*"jerusalem" + 0.023*"jews" + 0.020*"salome" + 0.017*"jewish" + 0.014*"gospeller" + 0.014*"faire" + 0.014*"ryder" + 0.013*"israel" + 0.013*"roman" + 0.012*"strongsoul"
INFO : topic #48 (0.017): 0.143*"irish" + 0.103*"ireland

[(0,
  '0.118*"vo" + 0.062*"crown" + 0.010*"edition" + 0.009*"day" + 0.009*"illustrations" + 0.007*"english" + 0.007*"years" + 0.006*"london" + 0.006*"days" + 0.006*"night"'),
 (1,
  '0.035*"day" + 0.021*"london" + 0.020*"years" + 0.019*"night" + 0.019*"days" + 0.013*"morning" + 0.012*"hour" + 0.008*"second" + 0.008*"grange" + 0.008*"hours"'),
 (2,
  '0.022*"world" + 0.020*"old" + 0.017*"gatty" + 0.014*"ca" + 0.013*"time" + 0.012*"adelaide" + 0.009*"creed" + 0.009*"day" + 0.007*"days" + 0.006*"crayford"'),
 (3,
  '0.055*"day" + 0.035*"years" + 0.024*"days" + 0.023*"morning" + 0.019*"hour" + 0.019*"bible" + 0.019*"night" + 0.019*"evening" + 0.018*"christian" + 0.016*"christ"'),
 (4,
  '0.109*"vernon" + 0.044*"experiment" + 0.019*"galbray" + 0.019*"empiric" + 0.014*"mahomedan" + 0.012*"nawab" + 0.011*"english" + 0.010*"rangers" + 0.006*"india" + 0.006*"rownpore"'),
 (5,
  '0.045*"chinese" + 0.023*"english" + 0.021*"day" + 0.020*"matilda" + 0.017*"china" + 0.014*"days" + 0.014*"years" + 0

In [None]:
lda_model[j_corpus[4]]

In [23]:
import pyLDAvis
import pyLDAvis.gensim

INFO : Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
INFO : Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt


In [24]:
pyLDAvis.enable_notebook()

In [25]:
pyLDAvis.gensim.prepare(lda_model, j_corpus, id2word_novels)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [26]:
#visualization following https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization#

In [27]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    accept a ldamodel, atopic number and topn vocabs of interest
    prints a formatted list of the topn terms
    """
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    
    return terms

In [29]:
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(60):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(lda_model,topic_number=i, topn=20, output=True )
#     print tmp[:5]
    topic_summaries += [tmp[:5]]
    print

term                 frequency

Topic 0 |---------------------

vo                   0.118
crown                0.062
edition              0.010
day                  0.009
illustrations        0.009
english              0.007
years                0.007
london               0.006
days                 0.006
night                0.006
post                 0.005
second               0.005
life                 0.005
vols                 0.005
new                  0.005
fcp                  0.005
notes                0.005
england              0.004
stories              0.004
morning              0.004
Topic 1 |---------------------

day                  0.035
london               0.021
years                0.020
night                0.019
days                 0.019
morning              0.013
hour                 0.012
second               0.008
grange               0.008
hours                0.008
evening              0.007
minutes              0.007
year                 0.006
virginia     

harley               0.037
day                  0.014
hour                 0.013
yoked                0.011
spelscraft           0.011
dacoit               0.011
night                0.011
nell                 0.010
english              0.010
englishmen           0.010
minutes              0.009
days                 0.009
scotland             0.009
morning              0.009
life                 0.008
treasure             0.008
danes                0.008
crayford             0.007
sea                  0.007
thane                0.007
Topic 29 |---------------------

bell                 0.147
kingdom              0.026
junction             0.026
san                  0.015
ballybeg             0.012
italian              0.012
marino               0.012
hour                 0.010
night                0.008
codlington           0.007
palace               0.007
royal                0.007
morrow               0.006
bally                0.006
morning              0.006
italy                0

danish               0.020
yorkshire            0.014
english              0.013
garriton             0.013
cape                 0.013
day                  0.013
crown                0.012
vo                   0.011
night                0.010
days                 0.009
duchesse             0.009
house                0.008
morning              0.007
catalina             0.007
scar                 0.007
england              0.007
capture              0.007
estrella             0.007
hour                 0.007
philip               0.006
Topic 44 |---------------------

italy                0.028
day                  0.023
years                0.018
days                 0.015
morning              0.013
night                0.013
hour                 0.011
evening              0.009
florence             0.009
english              0.008
hours                0.008
london               0.008
italian              0.008
england              0.007
second               0.007
cambridge            0

gerty                0.043
day                  0.031
ruby                 0.018
london               0.018
years                0.017
morning              0.017
days                 0.016
night                0.013
carewes              0.013
woodburn             0.013
hour                 0.012
evening              0.011
usk                  0.010
yalentine            0.010
minutes              0.009
home                 0.009
langley              0.008
grange               0.007
england              0.007
second               0.007
Topic 59 |---------------------

march                0.076
days                 0.030
day                  0.021
philippine           0.018
idalia               0.013
yore                 0.013
kaffir               0.012
torney               0.011
ione                 0.010
newark               0.010
kaffirs              0.009
years                0.008
dutch                0.008
mauritz              0.008
morning              0.007
rica                 0

In [144]:
topic_summaries

[['vo', 'crown', 'edition', 'day', 'illustrations'],
 ['day', 'london', 'years', 'night', 'days'],
 ['world', 'old', 'gatty', 'ca', 'time'],
 ['day', 'years', 'days', 'morning', 'hour'],
 ['vernon', 'experiment', 'galbray', 'empiric', 'mahomedan'],
 ['chinese', 'english', 'day', 'matilda', 'china'],
 ['palliser', 'maori', 'yorkshire', 'albion', 'spider'],
 ['day', 'op', 'wales', 'galway', 'mcleod'],
 ['day', 'years', 'days', 'morning', 'night'],
 ['day', 'tanat', 'years', 'myfanwy', 'mount'],
 ['day', 'years', 'fernleigh', 'days', 'violet'],
 ['castle', 'day', 'india', 'years', 'england'],
 ['tyson', 'du', 'gushetneuk', 'dawvid', 'westmoreland'],
 ['day', 'night', 'years', 'days', 'morning'],
 ['charlotte', 'english', 'england', 'day', 'russian'],
 ['derry', 'murrough', 'como', 'somerville', 'day'],
 ['cherry', 'violet', 'fardorougha', 'purple', 'colloquies'],
 ['french', 'paris', 'france', 'english', 'day'],
 ['nile', 'cairo', 'portugal', 'years', 'day'],
 ['day', 'ashlynne', 'staunto

In [35]:
import pandas as pd

In [36]:
source_ids = pd.Series(book_source_ids)

In [43]:
lda_model[j_corpus[1416]]

[(0, 1.548947e-05),
 (1, 1.548947e-05),
 (2, 1.548947e-05),
 (3, 1.548947e-05),
 (4, 1.548947e-05),
 (5, 1.548947e-05),
 (6, 1.548947e-05),
 (7, 0.015550078),
 (8, 0.026274221),
 (9, 1.548947e-05),
 (10, 1.548947e-05),
 (11, 1.548947e-05),
 (12, 1.548947e-05),
 (13, 1.548947e-05),
 (14, 1.548947e-05),
 (15, 1.548947e-05),
 (16, 1.548947e-05),
 (17, 1.548947e-05),
 (18, 1.548947e-05),
 (19, 1.548947e-05),
 (20, 1.548947e-05),
 (21, 1.548947e-05),
 (22, 1.548947e-05),
 (23, 1.548947e-05),
 (24, 0.24858516),
 (25, 1.548947e-05),
 (26, 1.548947e-05),
 (27, 1.548947e-05),
 (28, 1.548947e-05),
 (29, 1.548947e-05),
 (30, 1.548947e-05),
 (31, 1.548947e-05),
 (32, 1.548947e-05),
 (33, 1.548947e-05),
 (34, 1.548947e-05),
 (35, 0.0046094903),
 (36, 1.548947e-05),
 (37, 1.548947e-05),
 (38, 1.548947e-05),
 (39, 1.548947e-05),
 (40, 1.548947e-05),
 (41, 1.548947e-05),
 (42, 1.548947e-05),
 (43, 1.548947e-05),
 (44, 0.07169224),
 (45, 0.03618221),
 (46, 1.548947e-05),
 (47, 1.548947e-05),
 (48, 1.54

In [41]:
for i in range(1417):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [44]:
# need to create headers, especially for topics 0 through 59
headers = ["source_id"]
for i in range(60):
    headers.append("topic-{}".format(i))


In [45]:
headers

['source_id',
 'topic-0',
 'topic-1',
 'topic-2',
 'topic-3',
 'topic-4',
 'topic-5',
 'topic-6',
 'topic-7',
 'topic-8',
 'topic-9',
 'topic-10',
 'topic-11',
 'topic-12',
 'topic-13',
 'topic-14',
 'topic-15',
 'topic-16',
 'topic-17',
 'topic-18',
 'topic-19',
 'topic-20',
 'topic-21',
 'topic-22',
 'topic-23',
 'topic-24',
 'topic-25',
 'topic-26',
 'topic-27',
 'topic-28',
 'topic-29',
 'topic-30',
 'topic-31',
 'topic-32',
 'topic-33',
 'topic-34',
 'topic-35',
 'topic-36',
 'topic-37',
 'topic-38',
 'topic-39',
 'topic-40',
 'topic-41',
 'topic-42',
 'topic-43',
 'topic-44',
 'topic-45',
 'topic-46',
 'topic-47',
 'topic-48',
 'topic-49',
 'topic-50',
 'topic-51',
 'topic-52',
 'topic-53',
 'topic-54',
 'topic-55',
 'topic-56',
 'topic-57',
 'topic-58',
 'topic-59']

In [91]:
df = pd.DataFrame(columns=headers)

In [92]:
df

Unnamed: 0,source_id,topic-0,topic-1,topic-2,topic-3,topic-4,topic-5,topic-6,topic-7,topic-8,...,topic-50,topic-51,topic-52,topic-53,topic-54,topic-55,topic-56,topic-57,topic-58,topic-59


In [93]:
for i in range(1417):
    book_id = book_source_ids[i]
    print(book_id)
    new_row = [book_id]
    for k, i in lda_model[j_corpus[i]]:
        new_row.append(i)
    df.loc[book_id] = new_row

36105213320000
36105213320018
36105213320026
36105213320034
36105213320042
36105213320059
36105213320067
36105213320075
36105213320083
36105213320091
36105213320109
36105213320117
36105213320125
36105213320133
36105213320141
36105213320158
36105213320166
36105213320174
36105213320182
36105213320190
36105213320208
36105213320224
36105213320232
36105213320240
36105213320265
36105213320273
36105213320299
36105213320307
36105213320315
36105213320323
36105213320331
36105213320349
36105213320356
36105213320364
36105213320372
36105213320380
36105213320398
36105213320406
36105213320414
36105213320422
36105213320430
36105213320448
36105213320455
36105213320463
36105213320471
36105213320489
36105213320497
36105213320505
36105213320513
36105213320521
36105213320539
36105213320547
36105213320562
36105213320570
36105213320588
36105213320596
36105213320604
36105213320612
36105213320620
36105213320638
36105213320646
36105213320653
36105213320661
36105213320679
36105213320687
36105213320703
3610521332

36105213326270
36105213326288
36105213326296
36105213326304
36105213326312
36105213326320
36105213326338
36105213326346
36105213326353
36105213326361
36105213326379
36105213326387
36105213326395
36105213326403
36105213326411
36105213326429
36105213326437
36105213326445
36105213326460
36105213326478
36105213326486
36105213326494
36105213326502
36105213326510
36105213326528
36105213326536
36105213326544
36105213326551
36105213326569
36105213326577
36105213326585
36105213326593
36105213326601
36105213326619
36105213326627
36105213326635
36105213326643
36105213326650
36105213326668
36105213326676
36105213326684
36105213326692
36105213326700
36105213326718
36105213326726
36105213326734
36105213326742
36105213326759
36105213326767
36105213326775
36105213326783
36105213326791
36105213326809
36105213326817
36105213326825
36105213326833
36105213326841
36105213326858
36105213326866
36105213326874
36105213326882
36105213326890
36105213326908
36105213326916
36105213326924
36105213326932
3610521332

36105213332336
36105213332344
36105213332351
36105213332369
36105213332377
36105213332385
36105213332393
36105213332401
36105213332419
36105213332427
36105213332435
36105213332443
36105213332450
36105213332468
36105213332476
36105213332484
36105213332500
36105213332518
36105213332526
36105213332534
36105213332542
36105213332559
36105213332567
36105213332575
36105213332583
36105213332591
36105213332609
36105213332617
36105213332633
36105213332641
36105213332658
36105213332674
36105213332682
36105213332690
36105213332708
36105213332716
36105213332724
36105213332732
36105213332740
36105213332757
36105213332765
36105213332773
36105213332781
36105213332799
36105213332807
36105213332815
36105213332823
36105213332831
36105213332849
36105213332856
36105213332864
36105213332872
36105213332880
36105213332906
36105213332914
36105213332922
36105213332930
36105213332948
36105213332955
36105213332963
36105213332971
36105213332989
36105213332997
36105213333003
36105213333011
36105213333029
3610521333

In [94]:
df

Unnamed: 0,source_id,topic-0,topic-1,topic-2,topic-3,topic-4,topic-5,topic-6,topic-7,topic-8,...,topic-50,topic-51,topic-52,topic-53,topic-54,topic-55,topic-56,topic-57,topic-58,topic-59
36105213320000,36105213320000,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,...,0.000009,0.000009,0.028960,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009
36105213320018,36105213320018,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,...,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014
36105213320026,36105213320026,0.000009,0.758175,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,...,0.000009,0.000009,0.146363,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009
36105213320034,36105213320034,0.087177,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,...,0.000023,0.000023,0.306425,0.000023,0.000023,0.000023,0.005995,0.000023,0.516762,0.000023
36105213320042,36105213320042,0.000034,0.609333,0.000034,0.000034,0.000034,0.000034,0.000034,0.000034,0.000034,...,0.000034,0.000034,0.383622,0.000034,0.000034,0.000034,0.000034,0.000034,0.000034,0.000034
36105213320059,36105213320059,0.009083,0.000023,0.000023,0.120277,0.000023,0.000023,0.000023,0.000023,0.000023,...,0.000023,0.000023,0.135712,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023
36105213320067,36105213320067,0.631601,0.000012,0.000012,0.000012,0.000012,0.000012,0.002156,0.000012,0.000012,...,0.000012,0.000012,0.031705,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012
36105213320075,36105213320075,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,...,0.000010,0.000010,0.224912,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010
36105213320083,36105213320083,0.000014,0.000014,0.000014,0.000536,0.000014,0.000014,0.000014,0.000014,0.000014,...,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014
36105213320091,36105213320091,0.002929,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,...,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020


In [95]:
df.to_csv('doc_topic_probs.csv')

In [96]:
df_titles = pd.read_csv("jarndyce_ids.csv")

In [97]:
df_titles

Unnamed: 0.1,Unnamed: 0,Druid,Purl,Title,Source Id,Catkey,source_id
0,0,bb018zb8894,https://purl.stanford.edu/bb018zb8894,George Stirling's heritage : a story of cheque...,sul:36105213325215,9370404,36105213325215
1,1,bb122wh8873,https://purl.stanford.edu/bb122wh8873,"The king and the cloister, or, Legends of the ...",sul:36105213325355,9371533,36105213325355
2,2,bb403jp9042,https://purl.stanford.edu/bb403jp9042,"On the way, or, Places passed by pilgrims",sul:36105213328276,9428576,36105213328276
3,3,bb456zt5479,https://purl.stanford.edu/bb456zt5479,"Edgar Nelthorpe, or, The fair maids of Taunton...",sul:36105213325710,9374499,36105213325710
4,4,bb705cc9370,https://purl.stanford.edu/bb705cc9370,"Euthanasia, or, Turf, tent and tomb",sul:36105213330843,9522427,36105213330843
5,5,bb737zp0787,https://purl.stanford.edu/bb737zp0787,The curate of Cumberworth ; and The vicar of R...,sul:36105213335446,9616533,36105213335446
6,6,bb846gv7194,https://purl.stanford.edu/bb846gv7194,"Miss Kate, or, Confessions of a caretaker : a ...",sul:36105213336279,9626057,36105213336279
7,7,bb851sg5107,https://purl.stanford.edu/bb851sg5107,"De la More, or, Scenes in many lands!",sul:36105213321420,9276363,36105213321420
8,8,bc067cr7596,https://purl.stanford.edu/bc067cr7596,Mere stores,sul:36105213332054,9561939,36105213332054
9,9,bc647th4106,https://purl.stanford.edu/bc647th4106,Harry Fludyer at Cambridge : a series of famil...,sul:36105213330587,9519092,36105213330587


In [98]:
result = pd.merge(df, df_titles, left_on="source_id", right_on="source_id")

In [99]:
result

Unnamed: 0.1,source_id,topic-0,topic-1,topic-2,topic-3,topic-4,topic-5,topic-6,topic-7,topic-8,...,topic-56,topic-57,topic-58,topic-59,Unnamed: 0,Druid,Purl,Title,Source Id,Catkey
0,36105213320000,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,...,0.000009,0.000009,0.000009,0.000009,115,ck486xd3174,https://purl.stanford.edu/ck486xd3174,"One year, or, A story of three homes",sul:36105213320000,9262900
1,36105213320018,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,...,0.000014,0.000014,0.000014,0.000014,1621,zt090gz7149,https://purl.stanford.edu/zt090gz7149,In a Cornish township with old Vogue folk,sul:36105213320018,9262916
2,36105213320026,0.000009,0.758175,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,...,0.000009,0.000009,0.000009,0.000009,1107,sj935nf4860,https://purl.stanford.edu/sj935nf4860,"Richard Arbour, or, The family scapegrace",sul:36105213320026,9262927
3,36105213320034,0.087177,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,...,0.005995,0.000023,0.516762,0.000023,19,bg577xv8248,https://purl.stanford.edu/bg577xv8248,Herbert Manners : and other tales,sul:36105213320034,9262968
4,36105213320042,0.000034,0.609333,0.000034,0.000034,0.000034,0.000034,0.000034,0.000034,0.000034,...,0.000034,0.000034,0.000034,0.000034,477,jb262ym6309,https://purl.stanford.edu/jb262ym6309,"Thwarted, or, Ducks' eggs in a hen's nest : a ...",sul:36105213320042,9262980
5,36105213320059,0.009083,0.000023,0.000023,0.120277,0.000023,0.000023,0.000023,0.000023,0.000023,...,0.000023,0.000023,0.000023,0.000023,858,pm111tg1949,https://purl.stanford.edu/pm111tg1949,"Transformed, or, Three weeks in a life-time",sul:36105213320059,9263003
6,36105213320067,0.631601,0.000012,0.000012,0.000012,0.000012,0.000012,0.002156,0.000012,0.000012,...,0.000012,0.000012,0.000012,0.000012,1519,ym715jj9317,https://purl.stanford.edu/ym715jj9317,Wild Mike and his victim,sul:36105213320067,9263022
7,36105213320075,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,...,0.000010,0.000010,0.000010,0.000010,1574,zd144kq5650,https://purl.stanford.edu/zd144kq5650,Golden Face : a tale of the wild West,sul:36105213320075,9263103
8,36105213320083,0.000014,0.000014,0.000014,0.000536,0.000014,0.000014,0.000014,0.000014,0.000014,...,0.000014,0.000014,0.000014,0.000014,307,fs094qg2849,https://purl.stanford.edu/fs094qg2849,The lily of Lumley,sul:36105213320083,9263135
9,36105213320091,0.002929,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,...,0.000020,0.000020,0.000020,0.000020,775,nn170wc9214,https://purl.stanford.edu/nn170wc9214,"Stable secrets, or, Puffy Doddles : his saying...",sul:36105213320091,9263155


In [103]:
result = result.drop('Unnamed: 0', axis=1)

In [104]:
result

Unnamed: 0,source_id,topic-0,topic-1,topic-2,topic-3,topic-4,topic-5,topic-6,topic-7,topic-8,...,topic-55,topic-56,topic-57,topic-58,topic-59,Druid,Purl,Title,Source Id,Catkey
0,36105213320000,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,...,0.000009,0.000009,0.000009,0.000009,0.000009,ck486xd3174,https://purl.stanford.edu/ck486xd3174,"One year, or, A story of three homes",sul:36105213320000,9262900
1,36105213320018,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,0.000014,...,0.000014,0.000014,0.000014,0.000014,0.000014,zt090gz7149,https://purl.stanford.edu/zt090gz7149,In a Cornish township with old Vogue folk,sul:36105213320018,9262916
2,36105213320026,0.000009,0.758175,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,0.000009,...,0.000009,0.000009,0.000009,0.000009,0.000009,sj935nf4860,https://purl.stanford.edu/sj935nf4860,"Richard Arbour, or, The family scapegrace",sul:36105213320026,9262927
3,36105213320034,0.087177,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,0.000023,...,0.000023,0.005995,0.000023,0.516762,0.000023,bg577xv8248,https://purl.stanford.edu/bg577xv8248,Herbert Manners : and other tales,sul:36105213320034,9262968
4,36105213320042,0.000034,0.609333,0.000034,0.000034,0.000034,0.000034,0.000034,0.000034,0.000034,...,0.000034,0.000034,0.000034,0.000034,0.000034,jb262ym6309,https://purl.stanford.edu/jb262ym6309,"Thwarted, or, Ducks' eggs in a hen's nest : a ...",sul:36105213320042,9262980
5,36105213320059,0.009083,0.000023,0.000023,0.120277,0.000023,0.000023,0.000023,0.000023,0.000023,...,0.000023,0.000023,0.000023,0.000023,0.000023,pm111tg1949,https://purl.stanford.edu/pm111tg1949,"Transformed, or, Three weeks in a life-time",sul:36105213320059,9263003
6,36105213320067,0.631601,0.000012,0.000012,0.000012,0.000012,0.000012,0.002156,0.000012,0.000012,...,0.000012,0.000012,0.000012,0.000012,0.000012,ym715jj9317,https://purl.stanford.edu/ym715jj9317,Wild Mike and his victim,sul:36105213320067,9263022
7,36105213320075,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,0.000010,...,0.000010,0.000010,0.000010,0.000010,0.000010,zd144kq5650,https://purl.stanford.edu/zd144kq5650,Golden Face : a tale of the wild West,sul:36105213320075,9263103
8,36105213320083,0.000014,0.000014,0.000014,0.000536,0.000014,0.000014,0.000014,0.000014,0.000014,...,0.000014,0.000014,0.000014,0.000014,0.000014,fs094qg2849,https://purl.stanford.edu/fs094qg2849,The lily of Lumley,sul:36105213320083,9263135
9,36105213320091,0.002929,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,0.000020,...,0.000020,0.000020,0.000020,0.000020,0.000020,nn170wc9214,https://purl.stanford.edu/nn170wc9214,"Stable secrets, or, Puffy Doddles : his saying...",sul:36105213320091,9263155


In [105]:
result.to_csv("doc_topic_prob_titles.csv")

In [108]:
vis_data = result.drop(['Druid', 'Purl', "Source Id", "Catkey"], axis=1)

In [145]:
vis_data.to_csv("vis-data.csv")

In [116]:
single = df[df["source_id"] == "36105213320000"]

In [141]:
import matplotlib
import matplotlib.pyplot as plt


In [142]:
%matplotlib inline

In [143]:
single.hist()

ImportError: matplotlib is required for plotting.