# Textual Analysis of *The Hobbit*

## Tolkien Books in my Library 

![Tolkien books on my shelf](images/tolkien_shelf.JPG)

In [1]:
import pandas as pd
from tabulate import tabulate
import sqlite3
import os
from math import*

In [2]:
# Load the Calibre catalogue tables used below into DataFrames.
# NOTE: the library location is machine-specific; adjust it when running elsewhere.
conn = sqlite3.connect("/Users/ray/Calibre Library/metadata.db")
try:
    books_df = pd.read_sql_query("select * from books;", conn)
    authors_df = pd.read_sql_query("select * from authors;", conn)
    publishers_df = pd.read_sql_query("select * from publishers;", conn)
    books_authors_df = pd.read_sql_query("select * from books_authors_link;", conn)
finally:
    # Everything needed is now in memory; release the database handle even if
    # one of the queries fails.
    conn.close()

In [3]:
# Authors whose name contains "Tol", then every book linked to one of them.
name_matches = authors_df['name'].str.contains('Tol')
author_ids = list(authors_df.loc[name_matches, 'id'])
by_those_authors = books_authors_df['author'].isin(author_ids)
book_ids = list(books_authors_df.loc[by_those_authors, 'book'])

# Print title/id pairs as a plain-text table.
title_rows = books_df.loc[books_df['id'].isin(book_ids), ['title', 'id']]
print(tabulate(title_rows, showindex='never', headers=['Title', 'Id']))

Title                                                                     Id
----------------------------------------------------------------------  ----
The Hobbit: 75th Anniversary Edition                                      61
The History of the Hobbit: Mr Baggins and Return to Bag-End              123
The Legend of Sigurd and Gudrún                                          127
The Book of Lost Tales, Part Two: Part Two (History of Middle-Earth 2)   270
The Book of Lost Tales, Part One (History of Middle-Earth 1)             290
Unfinished Tales of Numenor and Middle-Earth                             392
The Lord of the Rings: One Volume                                        393
The Hobbit: Illustrated by Alan Lee                                      394
Beowulf: A Translation and Commentary, Together With Sellic Spell        395
The Silmarillion                                                         396
The Children of Húrin                                                    397

In [4]:
import glob
from pathlib import Path

In [5]:
hobbit_path = list(books_df[books_df['id']==61]['path'])[0]

In [6]:
for p in Path("/Users/ray/Calibre Library/" + hobbit_path).glob('*.txt'):
    print(p)

/Users/ray/Calibre Library/J. R. R. Tolkien/The Hobbit_ 75th Anniversary Edition (61)/The Hobbit_ 75th Anniversary Edition - J. R. R. Tolkien.txt


In [7]:
# Locate the exported text file explicitly instead of relying on the loop
# variable `p` left behind by the listing cell above: after a kernel restart,
# or when that listing finds nothing, `p` is undefined or stale.
hobbit_txt_files = sorted(Path("/Users/ray/Calibre Library/" + hobbit_path).glob('*.txt'))
if not hobbit_txt_files:
    raise FileNotFoundError('No .txt export found in Calibre folder: ' + hobbit_path)
p = hobbit_txt_files[-1]
text = p.read_text()

## Breaking the Text into Parts

![The Hobbit_ 75th Anniversary Edition](images/hobbit_cover.jpg)

In [18]:
import re
from src.models import normalize
import io

from segtok.segmenter import split_single, split_multi

In [19]:
# from segtok.segmenter import split_single, split_multi
# from segtok.tokenizer import symbol_tokenizer, word_tokenizer, web_tokenizer
# from segtok.tokenizer import split_possessive_markers, split_contractions

In [20]:
# Print every line that begins with "Chapter", prefixed by its 0-based line number.
for line_no, candidate in enumerate(text.split('\n')):
    if candidate.startswith('Chapter'):
        print(line_no, candidate)

56 Chapter I - An Unexpected Party
74 Chapter II - Roast Mutton
86 Chapter III - A Short Rest
92 Chapter IV - Over Hill and under Hill
98 Chapter V - Riddles in the Dark
110 Chapter VI - Out of the Frying-Pan into the Fire
116 Chapter VII - Queer Lodgings
122 Chapter VIII - Flies and Spiders
128 Chapter IX - Barrels Out of Bond
134 Chapter X - A Warm Welcome
140 Chapter XI - On the Doorstep
146 Chapter XII - Inside Information
152 Chapter XIII - Not at Home
158 Chapter XIV - Fire and Water
164 Chapter XV - The Gathering of the Clouds
170 Chapter XVI - A Thief in the Night
176 Chapter XVII - The Clouds Burst
182 Chapter XVIII - The Return Journey
188 Chapter XIX - The Last Stage
612 Chapter I
1102 Chapter II
1420 Chapter III
1612 Chapter IV
1774 Chapter V
2192 Chapter VI
2466 Chapter VII
2878 Chapter VIII
3224 Chapter IX
3452 Chapter X
3624 Chapter XI
3726 Chapter XII
3954 Chapter XIII
4112 Chapter XIV
4212 Chapter XV
4414 Chapter XVI
4524 Chapter XVII
4670 Chapter XVIII
4790 Chapter XI

In [21]:
text_lines = text.split('\n')
print(len(text_lines))

5726


In [22]:
# Line offsets (into `text_lines`) of the in-body chapter headings, taken from
# the heading listing printed above, together with the chapter titles.
_chapter_titles = [
    'An Unexpected Party', 'Roast Mutton', 'A Short Rest',
    'Over Hill and under Hill', 'Riddles in the Dark',
    'Out of the Frying-Pan into the Fire', 'Queer Lodgings',
    'Flies and Spiders', 'Barrels Out of Bond', 'A Warm Welcome',
    'On the Doorstep', 'Inside Information', 'Not at Home', 'Fire and Water',
    'The Gathering of the Clouds', 'A Thief in the Night', 'The Clouds Burst',
    'The Return Journey', 'The Last Stage',
]
_chapter_starts = [612, 1102, 1420, 1612, 1774, 2192, 2466, 2878, 3224, 3452,
                   3624, 3726, 3954, 4112, 4212, 4414, 4524, 4670, 4790]

# Last 50 lines of Chapter actually are back matter.
# Back matter includes 1st chapter of Lord of the Rings.
_story_end = 5096 - 50

# Each chapter ends where the next one starts; the last ends at `_story_end`.
_chapter_ends = _chapter_starts[1:] + [_story_end]

chapters = {
    number: {'start': first, 'end': last, 'title': title}
    for number, (first, last, title)
    in enumerate(zip(_chapter_starts, _chapter_ends, _chapter_titles), start=1)
}
chapters['front_matter'] = {'start': 0, 'end': _chapter_starts[0], 'title': 'Front Matter'}
chapters['back_matter'] = {'start': _story_end, 'end': 5726, 'title': 'Back Matter'}

In [23]:
def SimpleTokenizer(text):
    """Split ``text`` into sentences and pair each with its normalized form.

    Sentences are produced by ``split_single``. A sentence is dropped when any
    of its whitespace-delimited tokens is one of the boilerplate markers
    listed below. Every remaining sentence is passed through
    ``normalize.tokenize`` (lang='en', tokenizer='elasticsearch').

    Returns:
        A list of ``(sentence, normalized)`` tuples, in the order the
        sentences were produced.
    """
    boilerplate = {'Tolkien', 'Caption', 'J.R.R.', 'J.R.R.Tolkien', 'Image', 'audio'}
    pairs = []
    for raw in split_single(text):
        if not boilerplate.isdisjoint(raw.split()):
            continue
        normalized = normalize.tokenize(s=raw, lang='en', tokenizer='elasticsearch')
        pairs.append((raw, normalized))
    return pairs

In [24]:
# chapter_text = '\n'.join(chapter.split('\n')[3:])

In [25]:
start = chapters[1]['start']
end = chapters[19]['end']
all_text = text_lines[start: end]

In [27]:
# Put in separate data prep script, only needs to be run when regenerating data.

# Write one sentence per line: the original sentence to hobbit_flat.txt and
# its normalized counterpart to the same line of hobbit_flat_clean.txt.
with open('../data/raw/hobbit_flat.txt', 'w') as flat, \
        open('../data/raw/hobbit_flat_clean.txt', 'w') as clean:
    for paragraph in all_text:
        if paragraph == '':
            continue
        for original, cleaned in SimpleTokenizer(paragraph):
            flat.write(original + '\n')
            clean.write(cleaned + '\n')

Jun-28-2019 12:01:42 GET http://localhost:9200/en/_analyze [status:404 request:0.129s]


NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index', en, index_expression)

In [None]:
!grep Tolkien ../data/raw/hobbit_flat.txt
!grep Caption ../data/raw/hobbit_flat.txt
!grep 'J.R.R.' ../data/raw/hobbit_flat.txt
!grep Image ../data/raw/hobbit_flat.txt
!grep audio ../data/raw/hobbit_flat.txt

## Word Level Statistics

In [None]:
from src.models import WordLevelStatistics

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# chapt_no = 5
# chapter = '\n'.join(text_lines[chapters[chapt_no]['start']: chapters[chapt_no]['end']])

In [None]:
# start = chapters[1]['start']
# end = chapters[19]['end']
# all_text = '\n'.join(text_lines[start: end])

In [None]:
# all_text = normalize.tokenize(s=all_text, lang='en', tokenizer='elasticsearch')

In [None]:
with open('../data/raw/hobbit_flat_clean.txt') as fp:
    all_text = fp.read()

In [None]:
sentences = all_text.split('\n')

In [None]:
# Break hyphenated compounds into separate words before phrase detection.
# The replacement is idempotent, so a single pass is enough.
sentences = [s.replace('-', ' ') for s in sentences]
# Learn two-word collocations from the tokenized sentences, then freeze the
# model into a lighter Phraser for applying it.
phrases = Phrases([s.split() for s in sentences], min_count=3, threshold=10)
bigram = Phraser(phrases)

In [None]:
sentences_bigrams = [' '.join(s) for s in bigram[[s.split() for s in sentences]]]

In [None]:
with open('hobbit_flat_clean_bigrams.txt', 'w') as fp:
    for s in sentences_bigrams:
        fp.write(s + '\n')

In [None]:
with open('hobbit_flat_clean_bigrams.txt') as fp:
    all_text = fp.read()

In [None]:
fp = io.StringIO(all_text)
word_level_statistics = WordLevelStatistics(corpus_file=fp, percentile_C=95)
word_level_statistics.compute_spectra()

In [None]:
lvls_df = pd.DataFrame(word_level_statistics.level_stat_thresholded)
significant_terms = word_level_statistics.significant_terms
print('With threshold = {}, ({} percentile) find {} significant terms.'.format(
    word_level_statistics.threshold, word_level_statistics.percentile_C, len(significant_terms)))

In [None]:
lvls_df = lvls_df.sort_values(by='sigma_nor', ascending=False)
lvls_df.head()

In [None]:
# lvls_df = lvls_df[lvls_df['count'] < 300]

In [None]:
# import matplotlib
# import matplotlib.pyplot as plot
import plotly_express as px

In [None]:
# plot.style.available

In [None]:
# plot.rcParams["figure.figsize"] = [10, 10]
# plot.style.use('fivethirtyeight')

In [None]:
# Rank/frequency table: one row per term with its number of occurrences.
vocab = [{'term':term, 'count':len(word_level_statistics.word_pos[term])} for term in word_level_statistics.word_pos]
vocab = pd.DataFrame(vocab)
vocab = vocab.sort_values(by='count', ascending=False)
# Ranks start at 1: a rank of 0 has no position on a logarithmic axis, so the
# most frequent term would silently drop out of the log-log plot.
vocab['index'] = list(range(1, len(vocab) + 1))
# vocab.plot(kind='scatter', x='index', y='count', loglog=True, xlim=(1,10000));
px.scatter(vocab, x='index', y='count', log_x=True, log_y=True, hover_name='term')

In [None]:
# Raw string: '\w' inside a plain string literal is an invalid escape sequence
# and triggers a warning on current Python versions.
tokens = re.findall(r'\w+', all_text)
# Token offsets of the word "chapter" where it opens a chapter heading.
# Occurrences followed by 'you' or 'beginning' are excluded, and the bounds
# check keeps a trailing "chapter" token from raising IndexError.
# NOTE(review): assumes word_pos offsets line up with this \w+ tokenization — confirm.
chapter_boundaries = [(n, tokens[n], tokens[n+1]) for n in word_level_statistics.word_pos['chapter']
                      if n + 1 < len(tokens) and tokens[n+1] not in ['you', 'beginning']]
# Tick labels are the chapter titles (alternative: [str(n) for n in range(1, 20)]).
# This needs `chapters` to still be the dict of chapter metadata.
chapter_labels = [chapters[n]['title'] for n in range(1, 20)]

In [None]:
# Each chapter starts at its boundary offset and ends where the next one
# starts; the final chapter runs to the total word count.
start_markers = [offset for offset, _, _ in chapter_boundaries]
end_markers = start_markers[1:] + [word_level_statistics.tot_words]

# NOTE: this rebinds `chapters` from the chapter-metadata dict to a list of
# chapter texts; cells that index chapters[n]['title'] must run before this one.
chapters = [' '.join(tokens[lo:hi]) for lo, hi in zip(start_markers, end_markers)]

In [None]:
# plot.rcParams["figure.figsize"] = [15,10]

In [None]:
# no_terms = 30
# word_list = list(lvls_df['word'].head(no_terms))
# positions = [word_level_statistics.word_pos[word] for word in word_list]
# fig, ax = plot.subplots()
# ax.eventplot(positions, linelengths=[0.5]*len(word_list))
# plot.title('Word Distributions for Top {} Significant Terms'.format(no_terms));
# plot.xticks([c[0] for c in chapter_boundaries], chapter_labels, rotation=45, ha='right')
# plot.yticks(range(0, len(word_list)), word_list);
# plot.gca().invert_yaxis()

### Using Plotly for word usage trends.

In [None]:
import plotly.graph_objs as go

In [None]:
no_terms = 30
word_list = list(lvls_df['word'].head(no_terms))
positions = [word_level_statistics.word_pos[word] for word in word_list]
all_tokens = word_level_statistics.tokens
# Five-token window around every token, used as hover text. The start is
# clamped at 0: a negative slice start wraps around and yields an empty
# context for the first two tokens.
keywords_in_context = [' '.join(all_tokens[max(n - 2, 0):n + 3]) for n, _ in enumerate(all_tokens)]

# Reverse so the highest-ranked term is added last.
word_list.reverse()
positions.reverse()

fig1 = go.FigureWidget()
fig1.layout.hovermode = 'closest'
for w, p in zip(word_list, positions):
    # Trace properties are passed directly: what add_scatter returns (the new
    # trace or the figure) depends on the plotly version, so configuring the
    # returned object afterwards is not portable.
    fig1.add_scatter(
        x=p,
        y=[w] * len(p),
        mode='markers',
        marker=dict(symbol='line-ns-open', color='grey'),
        name=w,
        hovertext=[keywords_in_context[n] for n in p],
        hoverinfo='text',
    )

In [None]:
# Tick-label font shared by both axes.
tick_font = dict(family='Old Standard TT, serif', size=14, color='black')

# Neither axis shows a title (title=None), so no axis title font is set; the
# `titlefont` attribute is deprecated in favour of `title.font` and had no
# visible effect here.
layout = go.Layout(
    title='Word Distributions for Top {} Significant Terms'.format(no_terms),
    showlegend=False,
    autosize=False,
    width=1100,
    height=700,
    margin=go.layout.Margin(l=50, r=50, b=100, t=100, pad=4),
    # x axis: one tick per chapter boundary, labelled with the chapter label.
    xaxis=dict(
        title=None,
        showticklabels=True,
        tickangle=45,
        tickfont=tick_font,
        tickvals=[c[0] for c in chapter_boundaries],
        ticktext=chapter_labels,
        automargin=True,
        showgrid=True,
        zeroline=False,
        showline=False,
    ),
    # y axis: one tick per plotted term.
    yaxis=dict(
        title=None,
        showticklabels=True,
        automargin=True,
        tickangle=0,
        tickfont=tick_font,
        tickvals=word_list,
        showgrid=True,
        zeroline=False,
        showline=False,
    )
)

fig1.layout = layout

In [None]:
fig1

## Word Clusters

In [None]:
import pymagnitude
import hdbscan
import numpy as np
from collections import Counter
try:
    import umap
    print("Using: umap")
except ImportError:
    import bhtsne

In [None]:
from src.models import enrich_significant_terms, topic_exemplars, display_topics, topic_order_index, hdbscan_parameter_search, enumerate_exemplars
from IPython.core.display import display, HTML

In [None]:
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s %(message)s',"%b-%d-%Y %H:%M:%S")
logger.handlers[0].setFormatter(formatter)
logging.getLogger('joblib').setLevel(logging.ERROR)

In [None]:
# background_model = '../data/external/wiki-news-300d-1M.magnitude'
background_model = '../data/external/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude'
background_vectors = pymagnitude.Magnitude(background_model)

In [None]:
local_vectors = '../models/hobbit/wordvectors_rare15_spl_window5_bag_hash0_dim200_sqrt_cca_pseudo0_ce0P75_se0.magnitude'
local_vectors = pymagnitude.Magnitude(local_vectors)

In [None]:
# vectors = pymagnitude.Magnitude(local_vectors, background_vectors)
# vectors = local_vectors
vectors = background_vectors

In [None]:
significant_terms = list(lvls_df['word'])
significant_vectors = vectors.query(significant_terms)

In [None]:
try:
    # Two-stage UMAP: a 10-d embedding first, then 2-d coordinates derived from it.
    fit = umap.UMAP(n_neighbors=15, n_components=10, metric='euclidean')
    vec_10d = fit.fit_transform(significant_vectors)
    fit = umap.UMAP(n_neighbors=15, n_components=2, metric='euclidean')
    vec_2d = fit.fit_transform(vec_10d)
except Exception as ex:
    logging.error("Trying bhtsne. Got exception {}".format(ex))
    # bhtsne is otherwise imported only when `import umap` itself failed, so
    # import it here to keep the fallback usable when umap imported but raised.
    import bhtsne
    # np.asfarray was removed in NumPy 2.0; np.asarray with an explicit float
    # dtype performs the same conversion.
    vec_2d = bhtsne.tsne(np.asarray(significant_vectors, dtype='float64'), dimensions=2)
    # NOTE(review): vec_10d is not computed on this path although later cells read it.

In [None]:
significant_terms_enriched = enrich_significant_terms(lvls_df, vec_10d, vec_2d, 'leaf')
exemplar_scores, hovers = topic_exemplars(significant_terms_enriched)
summary = pd.DataFrame([h.split(':') for h in hovers], columns=['topic', 'terms'])

In [None]:
len(significant_terms_enriched[significant_terms_enriched['topic']==-1])

In [None]:
topics, top_columns = display_topics(significant_terms_enriched, n_rows=20, n_cols=35)
topics = topics.fillna('')
# Topic ids appear to be 0-based, with -1 marking unassigned terms, so the
# number of topics is the largest id plus one.
print('{} topics'.format(significant_terms_enriched['topic'].max() + 1))
display(HTML(topics.to_html(index=False)))

In [None]:
sentence  = vectors.query(["play", "some", "music", "on", "the", "living", "room", "speakers", "."])
# Returns: an array of size (9 (number of words) x 768 (3 ELMo components concatenated))
unrolled = vectors.unroll(sentence)
# Returns: an array of size (3 (each ELMo component) x 9 x 256 (the number of dimensions for each ELMo component))

In [None]:
unrolled.shape

In [None]:
with open('hobbit_flat_clean_bigrams.txt') as fp:
    sents = fp.readlines()
    sents = [s.strip() for s in sents]
sent_ids = range(len(sents))

In [None]:
len(significant_terms)

In [None]:
# Build the membership set once; rebuilding it for every token made this loop
# needlessly quadratic.
significant_term_set = set(significant_terms)

# One vector per occurrence of a significant term, keyed "<word>_<sentence id>_<position>".
word_vectors = {}
for sent_id, sent in enumerate(sents):
    filtered_sent = [token for token in sent.split() if token in significant_term_set]
    if not filtered_sent:
        continue
    sentence  = vectors.query(filtered_sent)
    unrolled = vectors.unroll(sentence)
    # NOTE(review): the example cell above documents unroll() as returning
    # (components x words x dims); indexing the first axis by word position
    # may not select the word's vector — confirm.
    for word_pos, word in enumerate(filtered_sent):
        key = word + '_' + str(sent_id) + '_' + str(word_pos)
        word_vectors[key] = unrolled[word_pos]

In [None]:
significant_vectors = [word_vectors[key] for key in word_vectors]
significant_words = [key for key in word_vectors]

In [None]:
try:
    # Two-stage UMAP: a 10-d embedding first, then 2-d coordinates derived from it.
    fit = umap.UMAP(n_neighbors=15, n_components=10, metric='euclidean')
    vec_10d = fit.fit_transform(significant_vectors)
    fit = umap.UMAP(n_neighbors=15, n_components=2, metric='euclidean')
    vec_2d = fit.fit_transform(vec_10d)
except Exception as ex:
    logging.error("Trying bhtsne. Got exception {}".format(ex))
    # bhtsne is otherwise imported only when `import umap` itself failed, so
    # import it here to keep the fallback usable when umap imported but raised.
    import bhtsne
    # np.asfarray was removed in NumPy 2.0; np.asarray with an explicit float
    # dtype performs the same conversion.
    vec_2d = bhtsne.tsne(np.asarray(significant_vectors, dtype='float64'), dimensions=2)
    # NOTE(review): vec_10d keeps its previous value on this path, so it will
    # not match these vectors when the clustering cell reads it.

In [None]:
lvls_df.head()

In [None]:
# Take word and weight from the same frame so the pairing cannot go wrong if
# the two frames ever differ in row order or length.
word_to_weight = dict(zip(lvls_df['word'], lvls_df['sigma_nor']))

In [None]:
tmp = []
for word_ in significant_words:
    word = '_'.join(word_.split('_')[0:-2])
    tmp.append((word_, word, word_to_weight[word]))

In [None]:
lvls_extended_df = pd.DataFrame(tmp, columns=['word_sent_pos', 'word', 'weight'])

In [None]:
def extract_sentences_JMC(parameters=None, text_data=None,
        text_no=0, text_id=None, weights=None):
    '''
    Select summary sentences for one text by running the external OCCAMS
    executable, and write the chosen sentences to a summary file.

    OCCAMS needs the names of the data files, the dimensions (`MxN`) of the
    term-sentence matrix, and the size of the summary.

    The sparse term-sentence matrix is input as a list of sentence-id, term
    pairs (`sets.txt`, with one pair per line). Internally, the data is stored
    in an array such that the i-th element points to an array of integers
    holding the i-th sentence.

    The `sets_lengths` array describes the costs of sets or lengths of
    sentences. Populated by the `read_lengths` routine that parses the
    `lengths.txt` file -- one length for each sentence.

    Parameters passed to OCCAMS:
    * non-zeros
    * number of terms
    * number of sets
    * summary length
    * weights
    * lengths
    * lower bound

    Args:
        parameters: mapping with keys 'occams_path', 'lower_bound',
            'summary_length' and 'non_zeros'.
        text_data: object exposing `tmp_dir`, `sentences_nouns` and
            `text_sentences`, the latter two indexable by `text_id`.
        text_no: unused.
        text_id: identifier of the text; also the stem of the files read from
            and written to `text_data.tmp_dir`.
        weights: term -> weight lookup; terms without a weight (or
            `weights=None`) are written with weight 0.

    Returns:
        `(sids, summary_nouns)`: the 0-based ids of the chosen sentences and
        the set of words occurring in those sentences' noun strings. Both are
        empty when the OCCAMS output has no 'Chosen sentences' line.
    '''
    # Imported locally so the function does not depend on cell execution order.
    import shlex
    import subprocess
    from gensim import corpora
    # NOTE(review): assumed to be the LineSentence intended here — confirm.
    from gensim.models.word2vec import LineSentence

    occams_path = parameters['occams_path']
    lower_bound = parameters['lower_bound']
    summary_length = parameters['summary_length']
    non_zeros = parameters['non_zeros']

    ## Assumption that the text_id array will be enumerated then both the
    ## position in the array and the text_id will be passed into this routine.
    target_data = text_data.tmp_dir + '/' + text_id + '.txt'
    logging.info('Extract sentences from %d sentences: %s',
            len(text_data.sentences_nouns[text_id]), target_data)
    dictionary = corpora.Dictionary(LineSentence(target_data))
    logging.info('Dictionary details for text %s: %d, %s',
            target_data, len(dictionary), dictionary)

    sentences_nouns = [s.split() for s in text_data.sentences_nouns[text_id]]

    # lengths file: one line per sentence holding its token count.
    lengths_data = text_data.tmp_dir + '/' + text_id + '.lengths.txt'
    with open(lengths_data, 'w', encoding='utf-8') as fp:
        for sentence in sentences_nouns:
            fp.write(str(len(sentence)) + '\n')

    # sets file: "<sentence id> <term id>" pairs, both written 1-based.
    sets_data = text_data.tmp_dir + '/' + text_id + '.sets.txt'
    with open(sets_data, 'w', encoding='utf-8') as fp:
        for sentence_id, sentence in enumerate(sentences_nouns):
            for term_id, _count in dictionary.doc2bow(sentence):
                fp.write(str(sentence_id+1) + ' ' + str(term_id+1) + '\n')

    # vocab file: one weight per dictionary id, in id order.
    vocab_data = text_data.tmp_dir + '/' + text_id + '.vocab.txt'
    with open(vocab_data, 'w', encoding='utf-8') as fp:
        for n in range(len(dictionary)):
            w = dictionary.get(n)
            try:
                weight = weights[w]
            except (KeyError, IndexError, TypeError):
                # Missing weight, or no weights supplied at all: fall back to
                # 0 without hiding unrelated errors such as a failed write.
                weight = 0
            fp.write(str(weight) + '\n')

    # The data-file paths are quoted so a tmp_dir or text_id containing spaces
    # or shell metacharacters cannot break or alter the command line.
    cmd = occams_path + ' -z {} -m {} -n {} -s {} -D {} -W {} -L {} -b {}'.format(
        non_zeros, len(dictionary), len(sentences_nouns), summary_length,
        shlex.quote(sets_data), shlex.quote(vocab_data),
        shlex.quote(lengths_data), lower_bound)
    logging.info('OCCAMS cmd %s: ', cmd)
    ret_val = subprocess.getoutput(cmd) #, stderr=subprocess.STDOUT)

    # Default to "nothing chosen" so a failed run yields an empty summary
    # instead of an UnboundLocalError further down.
    sentence_ids = []
    for line in ret_val.split('\n'):
        if 'Chosen sentences' in line:
            logging.info(line)
            sentence_ids = line.split(':')[1].split()
    if not sentence_ids:
        logging.warning('OCCAMS output contained no chosen sentences for %s', text_id)

    # OCCAMS reports 1-based sentence ids; convert to 0-based list indexes.
    sids = [int(sid)-1 for sid in sentence_ids]

    summary_nouns = set()
    for sid in sids:
        summary_nouns.update(text_data.sentences_nouns[text_id][sid].split())

    summary_data = text_data.tmp_dir + '/' + text_id + '.summary.txt'
    with open(summary_data, 'w', encoding='utf-8') as fp:
        for sid in sids:
            fp.write(text_data.text_sentences[text_id][sid] + '\n')

    return sids, summary_nouns

In [None]:
def display_topics(df, n_rows=10, n_cols=12):
    """Lay out topic members as a table with one column per topic.

    Note: this definition replaces the `display_topics` imported from
    `src.models` earlier in the notebook.

    Args:
        df: frame with columns 'pos' (row within the topic), 'topic' (column)
            and 'word_sent_pos' (cell value); each (pos, topic) pair must be
            unique.
        n_rows: maximum number of rows to keep.
        n_cols: topics 0 .. n_cols-1 are shown; ids absent from `df` are skipped.

    Returns:
        `(topics_display, top_columns)`: the truncated table and the list of
        topic ids that make up its columns.
    """
    topics = df.pivot(index='pos', columns='topic', values='word_sent_pos')

    # Keep only topic ids that exist. This avoids a KeyError when fewer than
    # n_cols topics were found, and leaves out the -1 (unassigned) column.
    top_columns = [c for c in range(n_cols) if c in topics.columns]

    topics_display = topics[top_columns].head(n_rows)

    return topics_display, top_columns

In [None]:
def topic_order_index(topic_list):
    '''
    Number each entry of ``topic_list`` by how many times its topic has been
    seen so far, counting from 1.

    The list typically repeats topics and is already sorted in a meaningful
    way (e.g. by a word importance score). For example
    [1, 1, 2, 1, 3, 2, 2, 3] => [1, 2, 1, 3, 1, 2, 3, 2].
    '''
    seen = {}
    order = []
    for topic in topic_list:
        seen[topic] = seen.get(topic, 0) + 1
        order.append(seen[topic])
    return order

In [None]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=10,
                            min_samples=10,
                            approx_min_span_tree=False,
                            cluster_selection_method='leaf')
labels = clusterer.fit_predict(np.array(vec_10d))
lvls_extended_df['topic'] = labels

topic_list = list(lvls_extended_df['topic'])
lvls_extended_df['pos'] = topic_order_index(topic_list)

lvls_extended_df['x2D'] = [v[0] for v in vec_2d]
lvls_extended_df['y2D'] = [v[1] for v in vec_2d]

In [None]:
lvls_extended_df.head()

In [None]:
center_no = 1446
for n in range(center_no-5, center_no+5):
    print(n, sents[n])

In [None]:
lvls_extended_df.topic.max()

In [None]:
# Cluster labels are 0-based, with -1 for unclustered points, so the number of
# topics is the largest label plus one. Derive the column count from the data
# instead of hard-coding it.
n_topics = lvls_extended_df['topic'].max() + 1
topics, top_columns = display_topics(lvls_extended_df, n_rows=100, n_cols=n_topics)
topics = topics.fillna('')
print('{} topics'.format(n_topics))
display(HTML(topics.to_html(index=False)))

In [None]:
cluster_fn = '../models/hobbit/agglomerative_rare3_spl_window5_list_hash0_dim100_sqrt_cca_pseudo0_ce0P75_se0'
brown_model_df = pd.read_csv(cluster_fn, delimiter=' ', names=['cluster', 'word', 'count'])

In [None]:
# brown_model_df[brown_model_df.term.isin(significant_terms)]

In [None]:
significant_terms_brown = pd.merge(significant_terms_enriched, brown_model_df, on='word', how='inner')
significant_terms_brown = significant_terms_brown.rename(index=str, columns={"topic": "cluster", "cluster": "topic"})

In [None]:
max(list(map(lambda x: len(str(x)), significant_terms_brown.topic)))

In [None]:
def short_str(s):
    """Return at most the first 20 characters of ``str(s)``."""
    return str(s)[:20]

In [None]:
significant_terms_brown['topic_s'] = list(map(short_str, significant_terms_brown['topic']))

In [None]:
significant_terms_brown.head()

In [None]:
topic_list = list(significant_terms_brown['topic'])
significant_terms_brown['pos'] = topic_order_index(topic_list)

In [None]:
brown_clusters = significant_terms_brown.pivot(index='pos', columns='topic', values='word').fillna('')
display(HTML(brown_clusters.to_html(index=False)))

In [None]:
brown_clusters_list = []
for g in significant_terms_brown.groupby('topic'):
    brown_clusters_list.append(set(g[1]['word']))

In [None]:
clusters_list = []
for g in significant_terms_enriched.groupby('topic'):
    clusters_list.append(set(g[1]['word']))

In [None]:
def jaccard_similarity(x, y):
    """Return the Jaccard similarity |x ∩ y| / |x ∪ y| of two collections.

    Both arguments are converted to sets, so duplicates and order are ignored.
    When both are empty the ratio is undefined; 0.0 is returned (no shared
    elements) rather than raising ZeroDivisionError.
    """
    first, second = set(x), set(y)
    union = first | second
    if not union:
        return 0.0
    return len(first & second) / float(len(union))

print(jaccard_similarity([0,1,2,5,6],[0,2,3,5,7,9]))

In [None]:
clusters_list[2]

In [None]:
for word_cluster in clusters_list:
    for brown_cluster in brown_clusters_list:
        js = jaccard_similarity(word_cluster, brown_cluster)
        if js > 0.2:
            print(js)
            print(word_cluster)
            print(brown_cluster)
            print('='*40)

## Topic Distributions in Sentences

In [None]:
import base64
from collections import Counter
import numpy as np

In [None]:
with open('hobbit_flat_clean_bigrams.txt') as fp:
    sents = fp.readlines()
    sents = [s.strip() for s in sents]
sent_ids = range(len(sents))

In [None]:
significant_terms_enriched['weight'] = significant_terms_enriched['sigma_nor']

In [None]:
# Topic ids appear to be 0-based, with -1 marking unassigned terms, so the
# number of topics is the largest id plus one.
K = significant_terms_enriched.topic.max() + 1
word_to_topic = dict(zip(significant_terms_enriched.word, significant_terms_enriched.topic))
word_to_weight = dict(zip(significant_terms_enriched.word, significant_terms_enriched.weight))

def message_topics(sentence):
    ''' Count, per topic, the significant terms occurring in a sentence.

        Uses the notebook globals `significant_terms`, `word_to_topic` and `K`.

        sentence: whitespace-tokenized sentence string.

        Returns a length-K integer array whose i-th entry is the number of
        tokens in the sentence assigned to topic i. Tokens that are not
        significant terms, have no topic entry, or are unassigned (-1) are
        ignored. The length is always K, whichever topics occur.
    '''
    significant = set(significant_terms)  # built once per call, not once per token
    counts = np.zeros(K, dtype=int)
    for token in sentence.split():
        if token not in significant:
            continue
        topic = word_to_topic.get(token, -1)
        if topic > -1:
            counts[topic] += 1
    return counts

In [None]:
# Turn off the max column width so the HTML
# image tags don't get truncated.
# None means "no limit"; the former -1 sentinel is rejected by current pandas.
pd.set_option('display.max_colwidth', None)

# Turning off the max column will display all the data in
# our arrays so limit the number of element to display
pd.set_option('display.max_seq_items', 2)

In [None]:
def sparkline(data, figsize=(5, 0.25), **kwags):
    """
    Render ``data`` as a sparkline and describe its non-zero positions.

    Uses the notebook globals `summary`, `io`, `base64` and `plot`.
    NOTE(review): `plot` must be matplotlib.pyplot; its import is commented
    out in this notebook, so make sure it is bound before calling.

    Args:
        data: 1-D numpy array (``.nonzero()`` is called on it).
        figsize: figure size passed to ``plot.subplots``.
        **kwags: extra keyword arguments passed to ``plot.subplots``.

    Returns:
        ``(short_summary, img_tag)``: one "<index>:<terms>" line (spaces
        removed) per `summary` row whose index is a non-zero position of
        ``data``, and an HTML ``<img>`` tag holding the base64-encoded PNG.
    """
    active = data.nonzero()[0]
    legend = ' '.join(str(n) for n in active)

    fig, ax = plot.subplots(1, 1, figsize=figsize, **kwags)
    ax.plot(data)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xticks([])
    ax.set_yticks([])
    # The x range is doubled so the legend fits to the right of the curve.
    ax.text(len(data), 0.1, legend)
    ax.set_xlim(0, 2*len(data))

    ax.fill_between(range(len(data)), data, len(data)*[min(data)], alpha=0.1)

    # Save and close this specific figure rather than whichever is current.
    img = io.BytesIO()
    fig.savefig(img)
    img.seek(0)
    plot.close(fig)

    # `summary` has an integer index, so select rows by the integer positions;
    # comparing against the legend's strings never matched and always produced
    # an empty summary.
    # NOTE(review): assumes row i of `summary` describes topic i — confirm.
    matched = summary[summary.index.isin(active.tolist())]['terms'].to_dict()
    short_summary = '\n'.join(str(key) + ':' + terms for key, terms in matched.items()).replace(' ','')

    return short_summary, '<img src="data:image/png;base64,{}"/>'.format(base64.b64encode(img.read()).decode())

In [None]:
plot.style.use('classic')

In [None]:
word_to_topic = dict(zip(significant_terms_enriched['word'], significant_terms_enriched['topic']))
word_to_weight = dict(zip(significant_terms_enriched['word'], significant_terms_enriched['weight']))

In [None]:
sent = sents[278]
c = message_topics(sentence=sent)
print(sent)
filtered_sent = list(filter(lambda x: x in set(significant_terms), sent.split()))
print(filtered_sent)
print([word_to_topic[word] for word in filtered_sent])
print(sparkline(c)[0])
HTML(sparkline(c)[1])

In [None]:
columns = set([word_to_topic[word] for word in filtered_sent])
# discard() instead of remove(): the sentence may contain no unassigned (-1) term.
columns.discard(-1)
# pandas requires a list-like indexer here, not a set; sorting also makes the
# column order deterministic.
topics[sorted(columns)]

## Sentence Embeddings

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [None]:
if False:
    import sentences
    from sentences.types import Sentences
    import numpy as np
    
    with open('hobbit_flat_clean_bigrams.txt') as fp:
        text = fp.read()
        
    english_sentences = Sentences(text)
    en_vectors = english_sentences.embed
    
    np.save('hobbit_clean_bigrams.npy', en_vectors)

In [None]:
en_sent_vectors = np.load('hobbit_clean_bigrams.npy')

In [None]:
en_sent_vectors.shape

In [None]:
with open('hobbit_flat_clean_bigrams.txt') as fp:
    hobbit_sentences = fp.readlines()

# The original-sentence file is written under ../data/raw/ by the data
# preparation cell, so read it from there rather than the working directory.
with open('../data/raw/hobbit_flat.txt') as fp:
    hobbit_sentences_orig = fp.readlines()

In [None]:
len(hobbit_sentences)

In [None]:
estimator = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=15)
nbrs = estimator.fit(en_sent_vectors)
distances, indices = nbrs.kneighbors(en_sent_vectors)

In [None]:
hobbit_sentences_df = pd.DataFrame(hobbit_sentences, columns=['en'])

In [None]:
cluster_no = 9
significant_terms_enriched[significant_terms_enriched['topic'] == cluster_no]

In [None]:
search_results = hobbit_sentences_df[hobbit_sentences_df.en.str.contains('mirkwood')]
print(len(search_results), set(search_results.index))

In [None]:
word_clusters = {}
for cluster_no in set(significant_terms_enriched['topic']):
    terms = set(significant_terms_enriched[significant_terms_enriched['topic'] == cluster_no]['word'])
    word_clusters[cluster_no] = terms

In [None]:
index_no = 1122
# Build the membership set once instead of once per token.
significant_term_set = set(significant_terms)

print(indices[index_no])
print(distances[index_no])
seed_sent = [w for w in hobbit_sentences[index_no].split() if w in significant_term_set]
seed_sent_clusters = set([word_to_topic[word] for word in seed_sent])
matches = set(search_results.index).intersection(set(indices[index_no]))
print("Matches: {} {}".format(len(matches), matches) )
print('='*50)
for n, sent_num in enumerate(indices[index_no]):
    # Only report neighbours closer than 0.5 in cosine distance.
    if distances[index_no][n] < 0.5:
        print(sent_num, hobbit_sentences_orig[sent_num])
        print(sent_num, hobbit_sentences[sent_num])
        filtered_sent = [w for w in hobbit_sentences[sent_num].split() if w in significant_term_set]
        c = message_topics(hobbit_sentences[sent_num])
        print(filtered_sent)
        print([word_to_topic[word] for word in filtered_sent])
        filtered_sent_clusters = set([word_to_topic[word] for word in filtered_sent])
        print("Jaccard similarity (seed vs. current): {}, {}".format(jaccard_similarity(seed_sent_clusters, filtered_sent_clusters),
                                                                     seed_sent_clusters.intersection(filtered_sent_clusters)))

        # Render the sparkline once and reuse both parts of the result.
        topic_summary, spark_img = sparkline(c)
        print(topic_summary)
        display(HTML(spark_img))
        print('='*50)

## Inverted Index

### lucene

In [None]:
import lucene
from org.apache.lucene import analysis, document, index, queryparser, search, store, misc

import os

In [None]:
lucene.initVM()

In [None]:
from org.apache.lucene import codecs

In [None]:
analyzer = analysis.standard.StandardAnalyzer()

In [None]:
# Store the index in memory:
# directory = store.RAMDirectory()

In [None]:
from java.nio.file import Paths

In [None]:
rm -rf ../testindex/

In [None]:
storeDir = "../testindex"
if not os.path.exists(storeDir):
    os.mkdir(storeDir)

In [None]:
directory = store.SimpleFSDirectory(Paths.get(storeDir))

In [None]:
t = document.FieldType()

In [None]:
def indexDocs(sentences, writer):
    """Add one Lucene document per sentence to ``writer``.

    Each document gets a stored, untokenized "sent_id" field holding the
    sentence's position in ``sentences`` and, for non-empty sentences, a
    stored, tokenized "contents" field indexed with positions. An exception
    raised while building or adding a document is printed and the loop moves
    on to the next sentence.
    """
    def make_field_type(tokenized, options):
        # Stored field type with the given tokenization and index options.
        field_type = document.FieldType()
        field_type.setStored(True)
        field_type.setTokenized(tokenized)
        field_type.setIndexOptions(options)
        return field_type

    id_type = make_field_type(False, index.IndexOptions.DOCS_AND_FREQS)
    contents_type = make_field_type(True, index.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for sent_id, sent in enumerate(sentences):
        try:
            doc = document.Document()
            doc.add(document.Field("sent_id", str(sent_id), id_type))
            if len(sent) > 0:
                doc.add(document.Field("contents", sent, contents_type))
            writer.addDocument(doc)
        except Exception as ex:
            print('Exception: {}'.format(ex))

In [None]:
# The sentence file is written under ../data/raw/ by the data preparation
# cell, so read it from there rather than the working directory.
with open('../data/raw/hobbit_flat.txt') as fp:
    sents = fp.readlines()
    sents = [s.strip() for s in sents]
sent_ids = range(len(sents))

In [None]:
config = index.IndexWriterConfig(analyzer)
# Set the similarity before the writer is constructed: non-live settings
# changed on the config afterwards do not affect an existing IndexWriter.
config.setSimilarity(search.similarities.ClassicSimilarity())
iwriter = index.IndexWriter(directory, config)

In [None]:
config.setSimilarity(search.similarities.ClassicSimilarity())

In [None]:
config.setUseCompoundFile(False)

In [None]:
iwriter.getConfig()

In [None]:
indexDocs(sents, iwriter)

In [None]:
# for sent_id, sent in enumerate(sents):
#     doc = document.Document()
#     doc.add(document.Field('contents', sent, document.TextField.TYPE_STORED))
#     doc.add(document.Field('sent_id', str(sent_id), document.TextField.TYPE_STORED))
#     iwriter.addDocument(doc)

In [None]:
iwriter.close()

In [None]:
# Now search the index:
ireader = index.DirectoryReader.open(directory)
isearcher = search.IndexSearcher(ireader)

In [None]:
isearcher.setSimilarity(search.similarities.ClassicSimilarity())

In [None]:
# Parse a simple query that searches for "text":
parser = queryparser.classic.QueryParser('contents', analyzer)
query = parser.parse('contents:fierce')
hits = isearcher.search(query, 1000).scoreDocs

In [None]:
for hit in hits:
    hitDoc = isearcher.doc(hit.doc)
    print(hit.score, hitDoc)
    print(isearcher.explain(query, hit.doc))

In [None]:
ireader.close()

In [None]:
directory.close()

### lupyne

In [None]:
import lucene
import os

In [None]:
lucene.initVM()

In [None]:
from lupyne import engine

In [None]:
rm -rf ../testindex/

In [None]:
storeDir = "../testindex"
if not os.path.exists(storeDir):
    os.mkdir(storeDir)

In [None]:
indexer = engine.Indexer(directory=storeDir) 

In [None]:
indexer.set('contents', engine.Field.Text, stored=True)

In [None]:
# The sentence file is written under ../data/raw/ by the data preparation
# cell, so read it from there rather than the working directory.
with open('../data/raw/hobbit_flat.txt') as fp:
    sents = fp.readlines()
    sents = [s.strip() for s in sents]
sent_ids = range(len(sents))

In [None]:
for sent in sents:
    indexer.add(contents=sent) 

In [None]:
indexer.commit() 

In [None]:
hits = indexer.search('dragon', field='contents')

In [None]:
for hit in hits:
    print(hit)

## LDA Topic Models

In [None]:
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim

In [None]:
texts = [chapter.split() for chapter in chapters]

In [None]:
dataset = [word for chapter in texts for word in chapter]

In [None]:
dictionary = corpora.Dictionary(texts)
dictionary.save(os.path.join('../models/hobbit/', 'hobbit.dict'))  # store the dictionary, for future reference
print(dictionary)

In [None]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize(os.path.join('../models/hobbit', 'hobbit.mm'), corpus) 

In [None]:
dictionary = corpora.Dictionary.load('../models/hobbit/hobbit.dict')
corpus = corpora.MmCorpus('../models/hobbit/hobbit.mm')

In [None]:
# Fix the seed so the fitted topics are reproducible from run to run.
model = models.LdaModel(corpus, id2word=dictionary, num_topics=30, random_state=42)

In [None]:
prep = pyLDAvis.gensim.prepare(model, corpus, dictionary, mds='tsne')

In [None]:
pyLDAvis.display(prep)