# A vs. the - next-word log-likelihood ratios

- Count up which words come after "a" and "the" in the first and last 5% of novels. This gives 4 sets of counts - `a_beginning`, `a_end`, `the_beginning`, `the_end`.
- Then, using Dunning's log-likelihood ratio, we can get words that follow a/the distinctively in one of these contexts. E.g., words that follow "a" distinctively, relative to "the," in the first 5%.

In [1]:
import attr
import os
import ujson
import bz2
import random
import math

import pandas as pd

from glob import glob
from tqdm import tqdm
from multiprocessing import Pool
from itertools import islice
from functools import partial
from boltons.iterutils import pairwise
from collections import Counter
from pprint import pprint
from scipy import stats

from IPython.display import display, Markdown

In [2]:
def zip_offset(seq, skip=0):
    """Yield (item, offset) pairs for an indexable sequence.

    The offset is the item's index divided by the last index, so it runs
    from 0 to 1; a single-item sequence gets offset 0.

    Iteration starts 10 items before index ``int(len(seq) * skip)``,
    clamped at 0, so callers can skip most of an uninteresting prefix.
    """
    count = len(seq)
    last_index = count - 1
    first_index = max(0, int(count * skip) - 10)

    for index in range(first_index, count):
        # Guard the single-item case, where the last index is 0.
        position = index / last_index if last_index else 0
        yield seq[index], position

In [3]:
def map_segment(func, path):
    """Apply `func` to every JSON record in one bz2 segment file.

    Each line of the file at `path` is parsed with `ujson.loads` and the
    parsed value is passed to `func`. Returns the list of `func` results,
    in file order.
    """
    with bz2.open(path) as segment:
        return [func(ujson.loads(raw_line)) for raw_line in segment]

In [4]:
@attr.s
class Corpus:
    """A directory of bz2-compressed segment files."""

    # Directory that is globbed for `*.bz2` segment files.
    root = attr.ib()

    def paths(self):
        """Return the paths of the `*.bz2` files directly under `root`."""
        pattern = os.path.join(self.root, '*.bz2')
        return glob(pattern)

    def map_novels(self, func, shuffle=True):
        """Apply a worker to segment files in parallel.

        Yields the individual `func` results one at a time. Segments are
        handed to a process pool via `imap_unordered`, so results arrive in
        whatever order the segments finish. When `shuffle` is true the
        segment paths are shuffled in place first.
        """
        segment_paths = self.paths()

        if shuffle:
            random.shuffle(segment_paths)

        run_segment = partial(map_segment, func)

        with Pool() as pool:
            for segment_results in pool.imap_unordered(run_segment, segment_paths):
                for result in segment_results:
                    yield result

def load_vocab(path):
    """Read a vocabulary file into a set.

    Each line of the file at `path` contributes one entry, with leading
    and trailing whitespace stripped.
    """
    with open(path) as vocab_file:
        return {line.strip() for line in vocab_file}

def save_vocab(words):
    """Write `words`, one per line, to `dunnings-vocab.txt` under `data_dir`.

    Reads the module-level `data_dir`; any existing file is overwritten.
    """
    target = os.path.join(data_dir, 'dunnings-vocab.txt')
    with open(target, 'w') as vocab_file:
        vocab_file.writelines('{}\n'.format(word) for word in words)

In [5]:
# Shared data directory, relative to the notebook's working directory.
data_dir = '../../../data'

# Directory that Corpus globs for *.bz2 segment files.
corpus = Corpus(root=os.path.join(data_dir, 'chicago-bins-tokens.json'))

# Set of words used to filter the counted next-words.
vocab = load_vocab(path=os.path.join(data_dir, 'dunnings-vocab.txt'))

In [6]:
def suffixes_worker(q, o1, o2, vocab, n):
    """Collect next-word keys from one novel within an offset window.

    Walks consecutive token pairs of `n['tokens']`. For each pair whose
    offset lies strictly between `o1` and `o2` and whose second token
    (lower-cased) is in `vocab`, emits:

    - the lower-cased second token itself, and
    - `'<q>_<token>'` when the lower-cased first token equals `q`.

    Scanning stops at the first pair whose offset reaches `o2`.
    Returns the emitted keys as a list.
    """
    emitted = []
    bigrams = pairwise(n['tokens'])

    for (first, second), position in zip_offset(bigrams, skip=o1):
        # Past the window: nothing further can match.
        if position >= o2:
            break

        # Still in the lead-in before the window opens.
        if not o1 < position:
            continue

        follower = second.lower()
        if follower not in vocab:
            continue

        emitted.append(follower)
        if first.lower() == q:
            emitted.append('{}_{}'.format(q, follower))

    return emitted

In [7]:
def suffixes(q, o1, o2, num_novels=None):
    """Tally the keys emitted by `suffixes_worker` across the corpus.

    Runs the worker for query word `q` and offset window (`o1`, `o2`) over
    the module-level `corpus`, filtering by the module-level `vocab`.
    `num_novels` caps how many novels are consumed (None means all).
    Returns a Counter over both plain next-word keys and `'<q>_<word>'` keys.
    """
    count_novel = partial(suffixes_worker, q, o1, o2, set(vocab))
    per_novel = islice(corpus.map_novels(count_novel), num_novels)

    tally = Counter()
    for keys in tqdm(per_novel):
        tally.update(keys)

    return tally

In [8]:
# Next-word counts for "a" in the opening slice (offsets between 0 and 0.05).
a0_combined = suffixes(q='a', o1=0, o2=0.05)

6638it [02:42, 40.84it/s]


In [9]:
# Next-word counts for "a" in the closing slice (offsets between 0.95 and 1).
a1_combined = suffixes(q='a', o1=0.95, o2=1)

6638it [02:33, 43.18it/s]


In [10]:
# Next-word counts for "the" in the opening slice (offsets between 0 and 0.05).
the0_combined = suffixes(q='the', o1=0, o2=0.05)

6638it [02:42, 40.75it/s]


In [11]:
# Next-word counts for "the" in the closing slice (offsets between 0.95 and 1).
the1_combined = suffixes(q='the', o1=0.95, o2=1)

6638it [02:35, 42.70it/s]


In [12]:
# Split the combined counters into per-word tables keyed by vocab word.
# Keys of the form 'a_<w>' / 'the_<w>' count how often <w> directly
# followed the article inside the slice.
a0_ = {w: a0_combined['a_' + w] for w in vocab}
a1_ = {w: a1_combined['a_' + w] for w in vocab}
the0_ = {w: the0_combined['the_' + w] for w in vocab}
the1_ = {w: the1_combined['the_' + w] for w in vocab}
# Plain '<w>' keys count every in-window occurrence of <w> as a next word.
# The worker emits them regardless of the query word, so the "a" counters
# are used here as the source of the slice totals.
total0_ = {w: a0_combined[w] for w in vocab}
total1_ = {w: a1_combined[w] for w in vocab}

In [13]:
def mdw(fg, bg, min_count=100, n=50):
    """Score tokens that are over-represented in `fg` relative to `bg`.

    Only tokens whose count exceeds `min_count` in both mappings are
    considered. For each such token the observed (fg, bg) counts are
    compared with the counts expected if the token were spread across the
    two samples in proportion to their totals; tokens whose `fg` count is
    above expectation are scored with the log-likelihood (G) statistic
    from `scipy.stats.power_divergence`.

    Args:
        fg: mapping of token -> count for the foreground sample.
        bg: mapping of token -> count for the background sample.
        min_count: a token must exceed this count in both mappings.
        n: accepted for interface compatibility; not used by this function.

    Returns:
        DataFrame with columns `token` and `dll`, one row per scored token,
        in no particular order.
    """
    frequent_fg = {tok for tok, count in fg.items() if count > min_count}
    frequent_bg = {tok for tok, count in bg.items() if count > min_count}
    shared = frequent_fg & frequent_bg

    fg_total = sum(fg[tok] for tok in shared)
    bg_total = sum(bg[tok] for tok in shared)
    grand_total = fg_total + bg_total

    scored = []
    for tok in shared:
        observed = [fg[tok], bg[tok]]

        # Pooled rate of this token across both samples.
        pooled_rate = (fg[tok] + bg[tok]) / grand_total
        expected = [fg_total * pooled_rate, bg_total * pooled_rate]

        # Keep only tokens that lean towards the foreground.
        if observed[0] > expected[0]:
            g_stat, _pvalue = stats.power_divergence(
                observed,
                expected,
                lambda_='log-likelihood',
            )
            scored.append((tok, g_stat))

    return pd.DataFrame(scored, columns=('token', 'dll'))

# def weighted_pmi()

In [35]:
# Words seen as a next-word in both slices. A word with a zero total in
# either slice has no defined p(a|w) there and would raise
# ZeroDivisionError, so it is left out of every table below.
_observed = [w for w in a0_ if total0_[w] > 0 and total1_[w] > 0]


def _neg_kl_term(p, q):
    """Return p * -log(p / q), treating the p == 0 case as 0."""
    # math.log(0) raises ValueError, while p * log(p / q) tends to 0 as
    # p -> 0, so a word that never follows "a" in a slice scores 0.
    return p * -math.log(p / q) if p > 0 else 0.0


# p(a | w, slice): share of w's in-window occurrences that directly follow "a".
p_a_w_0_ = {w: a0_[w] / total0_[w] for w in _observed}
p_a_w_1_ = {w: a1_[w] / total1_[w] for w in _observed}

# Pooled share across the opening and closing slices.
p_a_w = {w: (a0_[w] + a1_[w]) / (total0_[w] + total1_[w]) for w in _observed}

# Per-slice score: strongly negative when the slice's share is well above
# the pooled share, so an ascending sort surfaces the most distinctive words.
ll_a_w_0 = {w: _neg_kl_term(p_a_w_0_[w], p_a_w[w]) for w in _observed}
ll_a_w_1 = {w: _neg_kl_term(p_a_w_1_[w], p_a_w[w]) for w in _observed}


# Words with falling "a" probability:

In [49]:
# Fifty lowest ll_a_w_0 scores: words whose share of "a" is higher in the
# opening slice than in the pooled data.
print('word                          p(a|w,0)       p(a|w,1)')
print('----                          --------       --------')
for w in islice(sorted(ll_a_w_0, key=lambda word: ll_a_w_0[word]), 50):
    print('{:30}{:1.5f}        {:1.5f}'.format(w, p_a_w_0_[w], p_a_w_1_[w]))

word                          p(a|w,0)       p(a|w,1)
----                          --------       --------
gun                           0.21117        0.13216
witch                         0.29346        0.19022
penny                         0.25561        0.14901
pistol                        0.19159        0.12097
clue                          0.49312        0.39568
cave                          0.17525        0.10055
sword                         0.16243        0.09765
flood                         0.22403        0.15691
hammer                        0.19140        0.12687
dragon                        0.16003        0.09927
murder                        0.11549        0.06371
flurry                        0.61236        0.54106
snake                         0.29062        0.22254
former                        0.18315        0.11275
hurry                         0.27707        0.21004
knife                         0.21426        0.16021
pity                          0.24027       


# Words with rising "a" probability:

In [50]:
# Fifty lowest ll_a_w_1 scores: words whose share of "a" is higher in the
# closing slice than in the pooled data.
print('word                          p(a|w,0)       p(a|w,1)')
print('----                          --------       --------')
for w in islice(sorted(ll_a_w_1, key=lambda word: ll_a_w_1[word]), 50):
    print('{:30}{:05.5f}        {:5.5f}'.format(w, p_a_w_0_[w], p_a_w_1_[w]))

word                          p(a|w,0)       p(a|w,1)
----                          --------       --------
puff                          0.32515        0.46875
mouthful                      0.66805        0.77564
stranger                      0.35350        0.45261
rag                           0.26096        0.36695
headache                      0.34152        0.44970
thief                         0.28850        0.38734
football                      0.17208        0.25285
fake                          0.21020        0.32037
nap                           0.38692        0.48101
hero                          0.27343        0.38383
wry                           0.51071        0.60294
divorce                       0.20537        0.28660
fleeting                      0.38667        0.46484
victim                        0.09919        0.16714
pencil                        0.25796        0.32143
tangle                        0.30072        0.37568
vacation                      0.20597       

# a > the (beginning)

In [15]:
# Top 100 next-words favouring "a" over "the" in the opening slice.
' '.join(
    mdw(fg=a0_, bg=the0_)
    .sort_values(by='dll', ascending=False)
    .head(n=100)['token']
)

'few little lot good bit couple moment long while hundred small dozen minute single large week pair half thousand nice month year piece different quick smile fine short pretty very hand bad drink deep series chance fool quarter slight look sudden brief man cigarette step strange special child full great finger rather strong mistake private beautiful pleasant wonderful joke new woman loud person real note hint sense wave copy big cup mere thin simple - sharp lovely low thing handsome huge terrible faint number row part better fresh damn complete tight curious tall vague breath visit professional happy flash personal'

# the > a (beginning)

In [16]:
# Top 100 next-words favouring "the" over "a" in the opening slice.
' '.join(
    mdw(fg=the0_, bg=a0_)
    .sort_values(by='dll', ascending=False)
    .head(n=100)['token']
)

'first door last back world house rest front room two way water street most kitchen road one top people window sky morning wind table city middle right night river wall phone side time car bed sea fact rain town whole king land police beach hell war building village crowd doorway desk three sound smell bar blood church light hill thought far body head bridge lake country work ship fire sight train corner yard name company power boy store hospital future pain forest scene hotel queen radio boat dark summer bank law garden mirror family path winter telephone spring shoulder coffee'

# a > the (end)

In [17]:
# Top 100 next-words favouring "a" over "the" in the closing slice.
' '.join(
    mdw(fg=a1_, bg=the1_)
    .sort_values(by='dll', ascending=False)
    .head(n=100)['token']
)

'few little lot moment good long while couple minute hundred small single week deep friend large half pair hand chance piece sudden drink smile step man very great quick short month year fine fool pretty new bad look strange mistake brief woman cigarette slight finger lie quarter wonderful terrible low part child better loud special sense dream beautiful strong huge big private damn - copy thing joke full breath cup kind simple stranger hero tight wave flash person faint hard sharp word slow thin lovely fresh miracle fair shot note break kiss happy number visit curious mere perfect clear gesture'

# the > a (end)

In [18]:
# Top 100 next-words favouring "the" over "a" in the closing slice.
' '.join(
    mdw(fg=the1_, bg=a1_)
    .sort_values(by='dll', ascending=False)
    .head(n=100)['token']
)

'door first room house back last two rest world one front water road way sky people kitchen night top street window morning wind most police bed middle side table right city wall car river phone rain whole sea crowd pain doorway three fact hospital beach fire blood church hell head king hill power bridge building land desk light time future body sound dark town sight law gun country lake judge bar couch village thought far ship garden hotel work yard truck queen war four forest smell boat radio face boy scene name blade bank living stage company doctor telephone corner'

# Notes

- "A" is clearly used in the context of description - many of the distinctive a>the words are adjectives, whereas in the the>a lists, nouns.
- Of the a>the adjectives, interesting that many of the most distinctive ones are quantifiers, markers of _degree_ - few, little, lot, bit, hundred, single, small, large.
- "A" also associated with time, questions of when / how long - while, minute, week, year.
- "The," meanwhile, is clearly marking what might be thought of as "physical rendering" - descriptions of physical settings and spatial relationships. "Sides" of things - front, back, middle, side; and literal objects and locations - door, floor, house, room, street, kitchen, road, etc.
- So basically, my gloss - "a" is description (and temporality?), "the" is physicality. At the beginning, both are common - things are being introduced for the first time, and the physical setting is being established. At the end, by contrast, there's a return to the physical (away from the psychological / dialogic middle?), with "the" going back up; but there's less need for "a," since the world has already been described.

---
Less useful - "a" (beginning) compared to "a" (end) and vice versa, and likewise "the" (beginning) vs. "the" (end). This basically just reproduces the overall frequency differences, though. (Murder at the end, etc.)

In [19]:
# Top 100 next-words favouring "a" in the opening slice over "a" in the closing slice.
' '.join(
    mdw(fg=a0_, bg=a1_)
    .sort_values(by='dll', ascending=False)
    .head(n=100)['token']
)

'year tall young large boy girl small well pair half handsome month student “ week two high narrow thin big town face lady white three city pleasant brown broad widow job name fat house four - slender six black decade blue thick private school beauty five habit natural day row gold summer square dollar first female wide successful country dark husband dozen constant century street teacher fine block pale local certain particularly slim genius frown model detective twenty slight famous delicate friendly slightly kid variety particular bar social shade middle cow quarter full vague land writer rather living rare desk'

In [20]:
# Top 100 next-words favouring "a" in the closing slice over "a" in the opening slice.
' '.join(
    mdw(fg=a1_, bg=a0_)
    .sort_values(by='dll', ascending=False)
    .head(n=100)['token']
)

'moment while long gun murderer minute chance step shot lie bullet terrible deal way hero last deep message lot hand letter little witness roar trap bitch voice dream final weapon sudden time nurse second kiss pistol fool part plan sound cry killer chair whisper scream great trial promise noise horrible choice new rifle fist traitor hug monster fake wonderful decision tear breath blanket flash shotgun guard silence will criminal tremendous word suicide life liar thing move sitting mistake grave split question death desperate few deadly crazy trick blessing prayer plane shadow court miracle saint brave search difference threat light fucking'

In [21]:
# Top 100 next-words favouring "the" in the opening slice over "the" in the closing slice.
' '.join(
    mdw(fg=the0_, bg=the1_)
    .sort_values(by='dll', ascending=False)
    .head(n=100)['token']
)

'town girl young family boy year city summer school stranger man local most village “ younger name street big woman store wide high teacher victim sort tall more class bar - small thick kind narrow gentleman war land large rich middle late broad job thin winter few guy daughter smell third wagon male habit fact bus spring mirror week lady fine business century home french great company low table five long day hot half country waiter british club social farm famous planet heavy national dusty ten wife kid kitchen fourth roman word view foreign problem widow modern visitor smaller sign'

In [22]:
# Top 100 next-words favouring "the" in the closing slice over "the" in the opening slice.
' '.join(
    mdw(fg=the1_, bg=the0_)
    .sort_values(by='dll', ascending=False)
    .head(n=100)['token']
)

'gun pistol police knife door murder hospital rifle bullet murderer pain sword shotgun bed dragon cave fire room shot blade baby judge story grave tunnel one night weapon future car battle final nurse light sky key rope will couch blow bomb hammer trial flood force power wedding circle whole way funeral truck rest fight sound love cell cliff case stone death people demon ring side blood doctor crowd shock note mountain witness doorway hell hole statue chain broken witch chance wall bridge last plan letter flashlight roar box distance burning suitcase storm court machine real rock monster spear terrible spot'