In [None]:
from collections import Counter 
import math
import re
from functools import reduce
import pandas as pd
# import matplotlib
# import matplotlib.pyplot as plot
import io

import plotly_express as px
import plotly.graph_objs as go

In [None]:
from src.models import WordLevelStatistics
from src.visualization import word_distributions

In [None]:
def count(file, encoding=None):
    '''Counts the words contained in a file.

    The whole file is read, lower-cased and split into maximal runs of
    word characters (letters, digits and underscore).

    Args:
        file: Path of the text file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default, exactly as before this parameter
            existed.

    Returns:
        A ``collections.Counter`` mapping each lower-cased word to the
        number of times it occurs in the file.
    '''
    with open(file, encoding=encoding) as f:
        text = f.read().lower()
    # Raw string: a plain '\w' is an invalid escape sequence in a normal
    # string literal. Line breaks are never word characters, so no separate
    # newline substitution is needed before tokenising.
    return Counter(re.findall(r'\w+', text))

In [None]:
# Word-frequency tables for the three corpora compared in this notebook.
hobbit, pryftan, simple = map(count, [
    '../data/raw/hobbit_flat_clean.txt',
    '../data/raw/pryftan.txt',
    '../data/processed/simplewiki.txt',
])

In [None]:
def llr(k):
    '''Computes an LLR score for a list of Count objects.

    The score is twice the sum of the scaled entropies of the two sets of
    marginal totals minus the scaled entropy of the individual cell counts.
    '''
    cell_counts = flatten([counter.values() for counter in k])
    row_totals = rowSums(k)
    col_totals = colSums(k)
    marginal_entropy = denormEntropy(row_totals) + denormEntropy(col_totals)
    return 2 * (marginal_entropy - denormEntropy(cell_counts))

In [None]:
def llr_compare(k1, k2):
    '''Compute root-LLR values for all the things in k1 and k2.

    Every key found in either counter is scored with a 2x2 table holding
    its count and the remaining count in k1 against the same pair in k2.
    Returns a dict mapping each key to its signed root-LLR score.
    '''
    total1 = sum(k1.values())
    total2 = sum(k2.values())

    def score(term):
        # Count of the term versus everything else, per corpus.
        in_first = k1[term]
        in_second = k2[term]
        return llr_root(in_first, total1 - in_first, in_second, total2 - in_second)

    vocabulary = set(k1.keys()) | set(k2.keys())
    return {term: score(term) for term in vocabulary}

In [None]:
def llr_2x2(k11, k12, k21, k22):
    '''Special case of llr with a 2x2 table.

    k11/k12 form the first row of the table and k21/k22 the second.
    '''
    row_entropy = denormEntropy([k11 + k12, k21 + k22])
    col_entropy = denormEntropy([k11 + k21, k12 + k22])
    cell_entropy = denormEntropy([k11, k12, k21, k22])
    return 2 * (row_entropy + col_entropy - cell_entropy)

In [None]:
def llr_root(k11, k12, k21, k22):
    '''Computes a signed root-LLR score for a 2x2 contingency table.

    The sign tells whether k11 is larger (result is positive) or smaller
    (result is negative) than might be expected: the share of k11 within
    its row is compared with the share of the first column within the whole
    table. When the two shares are equal the result is non-negative. The
    magnitude of the result can be roughly interpreted on a scale similar
    to standard deviations.

    Args:
        k11, k12: Counts in the first row of the table.
        k21, k22: Counts in the second row of the table.

    Returns:
        The signed square root of the 2x2 LLR score as a float. 0.0 is
        returned when the first row (or the whole table) is empty, because
        such a table carries no evidence and the proportions are undefined.
    '''
    row_total = k11 + k12
    total = k11 + k12 + k21 + k22
    # Guard the two divisions below; an empty first row has an LLR of zero.
    if row_total == 0 or total == 0:
        return 0.0
    first_col_total = k11 + k21
    sign = cmp(float(k11) / row_total, float(first_col_total) / total)
    llr22 = llr_2x2(k11, k12, k21, k22)
    # Round-off can push a score that should be zero slightly negative.
    if llr22 < 0:
        return 0.0
    return math.copysign(math.sqrt(llr22), sign)

In [None]:
def cmp(a, b):
    '''Three-way comparison: 1 if a > b, -1 if a < b, otherwise 0.'''
    if a > b:
        return 1
    if a < b:
        return -1
    return 0


def flatten(list_of_lists):
    '''Lazily yields every element of every inner iterable, in order'''
    for inner in list_of_lists:
        yield from inner

def rowSums(k):
    '''Adds a list of counters together and returns the summed counts
    (the values view of the combined counter)'''
    merged = Counter()
    for counter in k:
        # Counter addition keeps only strictly positive totals.
        merged = merged + counter
    return merged.values()

def colSums(k):
    '''Returns, for each Count object in the list, the total of its counts'''
    totals = []
    for counter in k:
        totals.append(sum(counter.values()))
    return totals

In [None]:
def denormEntropy(counts):
    '''Computes the entropy of a list of counts scaled by the sum of the counts.
    If the inputs sum to one, this is just the normal definition of entropy.

    Args:
        counts: Any iterable of numeric counts (it is consumed once).

    Returns:
        ``-sum(k * log(k / total))`` over the non-zero counts, where total
        is the sum of all counts. Empty or all-zero input gives 0.
    '''
    counts = list(counts)
    total = float(sum(counts))
    # Zero counts contribute nothing (0*log(0) is taken as 0). Skipping them
    # also avoids dividing 0 by a zero total when every count is zero.
    return -sum(k * math.log(k / total) for k in counts if k != 0)

In [None]:
# Score every word of either corpus, then order the (word, score) pairs
# from the lowest score to the highest.
diff = llr_compare(hobbit, simple)
ranked = list(diff.items())
ranked.sort(key=lambda word_score: word_score[1])

In [None]:
# The 20 lowest scores sit at the front of the ascending ranking.
print("\nMore in Simple")
for term, score in ranked[:20]:
    print(term, score)

In [None]:
# The 20 highest scores sit at the end of the ascending ranking.
print("\nMore in Hobbit")
for term, score in ranked[-20:]:
    print(term, score)

In [None]:
# Same comparison as for the Hobbit text, now with the Pryftan corpus;
# `diff` and `ranked` are overwritten.
diff = llr_compare(pryftan, simple)
ranked = list(diff.items())
ranked.sort(key=lambda word_score: word_score[1])

In [None]:
# The 20 highest scores sit at the end of the ascending ranking.
print("\nMore in Pryftan")
for term, score in ranked[-20:]:
    print(term, score)

In [None]:
# plot.rcParams["figure.figsize"] = [10, 10]
# plot.style.use('fivethirtyeight')

In [None]:
# Rank-frequency plot of the Hobbit vocabulary on log-log axes.
text = hobbit
vocab = pd.DataFrame(
    [{'term': term, 'count': occurrences} for term, occurrences in text.items()],
    columns=['term', 'count'],
)
vocab = vocab.sort_values(by='count', ascending=False)
# Ranks start at 1: a logarithmic axis has no position for x=0, so a
# 0-based rank would leave the most frequent word off the plot.
vocab['index'] = list(range(1, len(vocab) + 1))
px.scatter(vocab, x='index', y='count', log_x=True, log_y=True, hover_name='term')

In [None]:
# Read the complete Hobbit text and fold it to lower case.
with open('../data/raw/hobbit_flat_clean.txt') as fp:
    all_text = fp.read()
all_text = all_text.lower()

In [None]:
# Split the lower-cased text on newline characters. Despite the name, each
# element is one line of the file, not a linguistically segmented sentence.
all_sents = all_text.split('\n')

In [None]:
# Wrap the in-memory, lower-cased text in a file-like object so it can be
# handed over as `corpus_file`.
fp = io.StringIO(all_text)
# NOTE(review): percentile_C=90 presumably selects the significance cut-off
# that is reported later as `threshold` — confirm in src.models.
word_level_statistics = WordLevelStatistics(corpus_file=fp, percentile_C=90)
# Must run before the result attributes (level_stat_thresholded,
# significant_terms, word_pos) are read in the cells below — TODO confirm
# that these are only populated by this call.
word_level_statistics.compute_spectra()

In [None]:
# Tabulate the thresholded level statistics and report how many terms
# passed the threshold.
lvls_df = pd.DataFrame(word_level_statistics.level_stat_thresholded)
significant_terms = word_level_statistics.significant_terms
summary = 'With threshold = {}, ({} percentile) find {} significant terms.'.format(
    word_level_statistics.threshold,
    word_level_statistics.percentile_C,
    len(significant_terms),
)
print(summary)

In [None]:
# Order the table from the highest to the lowest sigma_nor and preview the
# first 15 rows.
lvls_df = lvls_df.sort_values('sigma_nor', ascending=False)
lvls_df.head(n=15)

In [None]:
# Hand-maintained table of the sections of the Hobbit text.
# Keys are the chapter numbers 1-19 plus 'front_matter' and 'back_matter';
# each entry holds a 'start', an 'end' and a 'title'. Every chapter starts
# where the previous one ends.
# NOTE(review): 'start'/'end' are presumably line offsets into the raw text
# file (the comment on chapter 19 speaks of lines) — confirm against the data.
chapters = {1:{'start':612,  'end':1102, 'title':'An Unexpected Party'},
            2:{'start':1102, 'end':1420, 'title':'Roast Mutton'},
            3:{'start':1420, 'end':1612, 'title':'A Short Rest'},
            4:{'start':1612, 'end':1774, 'title':'Over Hill and under Hill'},
            5:{'start':1774, 'end':2192, 'title':'Riddles in the Dark'},
            6:{'start':2192, 'end':2466, 'title':'Out of the Frying-Pan into the Fire'},
            7:{'start':2466, 'end':2878, 'title':'Queer Lodgings'},
            8:{'start':2878, 'end':3224, 'title':'Flies and Spiders'},
            9:{'start':3224, 'end':3452, 'title':'Barrels Out of Bond'},
           10:{'start':3452, 'end':3624, 'title':'A Warm Welcome'},
           11:{'start':3624, 'end':3726, 'title':'On the Doorstep'},
           12:{'start':3726, 'end':3954, 'title':'Inside Information'},
           13:{'start':3954, 'end':4112, 'title':'Not at Home'},
           14:{'start':4112, 'end':4212, 'title':'Fire and Water'},
           15:{'start':4212, 'end':4414, 'title':'The Gathering of the Clouds'},
           16:{'start':4414, 'end':4524, 'title':'A Thief in the Night'},
           17:{'start':4524, 'end':4670, 'title':'The Clouds Burst'},
           18:{'start':4670, 'end':4790, 'title':'The Return Journey'},
           19:{'start':4790, 'end':5096-50, 'title':'The Last Stage'},  
                                              # The last 50 lines of what would otherwise be chapter 19
                                              # are actually back matter, hence the "- 50".
                                              # The back matter includes the first chapter of
                                              # The Lord of the Rings.
           'front_matter':{'start':0, 'end':612, 'title':'Front Matter'},
           'back_matter':{'start':5096-50, 'end':5726, 'title':'Back Matter'}}

In [None]:
# Tokenise the lower-cased text into runs of word characters. The raw string
# avoids the invalid '\w' escape of a plain string literal.
tokens = re.findall(r'\w+', all_text)
# Keep each recorded position of the word 'chapter' together with that token
# and the one after it, skipping occurrences followed by 'you' or 'beginning'.
# NOTE(review): presumably those two are uses of the word inside the running
# text rather than chapter headings, and word_pos holds indices into the same
# tokenisation as `tokens` — confirm.
# The bounds check keeps a trailing 'chapter' token from raising IndexError.
chapter_boundaries = [(n, tokens[n], tokens[n+1]) for n in word_level_statistics.word_pos['chapter']
                      if n + 1 < len(tokens) and tokens[n+1] not in ['you', 'beginning']]
# Tick labels: the titles of chapters 1 to 19, in order.
chapter_labels = [chapters[n]['title'] for n in range(1, 20)]

In [None]:
# `diff` and `ranked` were last computed for the Pryftan corpus, so rebuild
# them for the Hobbit text before picking the 20 highest-scoring words.
diff = llr_compare(hobbit, simple)
ranked = sorted(diff.items(), key=lambda pair: pair[1])
word_list = [term for term, _score in ranked[-20:]]
fig = word_distributions(word_list=word_list, word_level_statistics=word_level_statistics)
# Put one x-axis tick at each chapter boundary, labelled with the chapter title.
fig.layout.xaxis.tickvals = [boundary[0] for boundary in chapter_boundaries]
fig.layout.xaxis.ticktext = chapter_labels
fig

In [None]:
# Rows of the level-statistics table whose word is one of the top-LLR words.
lvls_df.loc[lvls_df['word'].isin(word_list)]

In [None]:
# Plot only those top-LLR words that also appear in the level-statistics table.
diff = llr_compare(hobbit, simple)
ranked = sorted(diff.items(), key=lambda x: x[1])
# Derive the candidate words from this cell's own ranking rather than from a
# `word_list` left behind by an earlier cell, so the cell gives the same
# result regardless of what ran before it.
top_terms = [k for k, v in ranked[-20:]]
mask = lvls_df.word.isin(top_terms)
word_list = list(lvls_df[mask]['word'])
# Top-LLR words that are missing from the level-statistics table.
print(set(top_terms).difference(word_list))
fig = word_distributions(word_list=word_list, word_level_statistics=word_level_statistics)
# Put one x-axis tick at each chapter boundary, labelled with the chapter title.
fig.layout.xaxis.tickvals=[c[0] for c in chapter_boundaries]
fig.layout.xaxis.ticktext=chapter_labels
fig