Run the [setup notebook](./00-setup.ipynb) first.

## Bi-gram extraction

In [1]:
from helpers import get_book
# Load book 244 through the project helper; the book dropdown later in this
# notebook labels this id 'A Study in Scarlet (en)'.
book = get_book(244)

Naive approach: create bi-grams, count their frequencies in the text, then pick the top 20.

In [2]:
import nltk
from nltk.collocations import (
    BigramCollocationFinder, BigramAssocMeasures,  
    TrigramCollocationFinder, TrigramAssocMeasures,  
    QuadgramCollocationFinder, QuadgramAssocMeasures
)
from nltk.corpus import stopwords

# English stop words, stored as a set: the filter below is evaluated for every
# token, and set membership is a hash lookup instead of a scan of the whole list.
ignored_words = set(stopwords.words('english'))

def stopword_filter(w):
    """Return True when token `w` should be excluded from n-gram candidates.

    A token is excluded when it is shorter than three characters or when its
    lower-cased form is contained in `ignored_words`.
    """
    return len(w) < 3 or w.lower() in ignored_words

# Number of top-scoring n-grams kept by `ngrams`.
nResults = 20

def ngram_measures(n: int, metric: str):
    """Select the association-measure function for an n-gram size and metric.

    n: 3 uses TrigramAssocMeasures, 4 uses QuadgramAssocMeasures; every other
       value uses BigramAssocMeasures.
    metric: 'freq' selects raw_freq, 'pmi' selects pmi; every other value
       (for example 'lhr') selects likelihood_ratio.
    """
    if n == 4:
        family = QuadgramAssocMeasures
    elif n == 3:
        family = TrigramAssocMeasures
    else:
        family = BigramAssocMeasures

    if metric == 'freq':
        attribute = 'raw_freq'
    elif metric == 'pmi':
        attribute = 'pmi'
    else:
        attribute = 'likelihood_ratio'
    return getattr(family, attribute)

def ngrams_from_words(n: int, tokens: list[str]):
    """Build a collocation finder over `tokens` for n-grams of size `n`.

    n == 3 yields a trigram finder, n == 4 a quadgram finder; any other value
    yields a bigram finder.
    """
    if n == 3:
        finder_cls = TrigramCollocationFinder
    elif n == 4:
        finder_cls = QuadgramCollocationFinder
    else:
        finder_cls = BigramCollocationFinder
    return finder_cls.from_words(tokens)

def ngrams_result(n: int, finder, scores, metric: str):
    """Turn scored n-grams into a ``(headers, rows)`` table.

    All n-gram sizes share one layout: the joined n-gram, the term frequency of
    each of its words, the frequency of the n-gram itself, and the score. The
    score column is always labelled with the metric name, so a PMI or
    likelihood-ratio score is never presented as a plain frequency.

    Args:
        n: n-gram size. 3 and 4 give trigram and quadgram tables; any other
            value is treated as a bigram table.
        finder: object exposing ``word_fd`` (word -> count) and ``ngram_fd``
            (n-gram tuple -> count) lookups.
        scores: iterable of ``(ngram_tuple, score)`` pairs.
        metric: metric name shown in the score column header.

    Returns:
        Tuple ``(headers, rows)``: a list of column titles and one list per
        scored n-gram.
    """
    size = n if n in (3, 4) else 2
    label = {2: 'bigram', 3: 'trigram', 4: 'quadgram'}[size]

    headers = [
        label,
        *(f'tf(w{position})' for position in range(1, size + 1)),
        f'tf({label})',
        f'score ({metric})',
    ]
    rows = [
        [
            ' '.join(gram),
            *(finder.word_fd[word] for word in gram),
            finder.ngram_fd[tuple(gram)],  # n-gram counts are keyed by tuple
            score,
        ]
        for gram, score in scores
    ]
    return (headers, rows)

def ngrams(tokens: list[str], n: int, metric: str, filters: list[str]) -> tuple[list[str],list[list[str]]]:
    """Score the n-grams of `tokens` and return the top `nResults` as a table.

    tokens:  token sequence to analyse.
    n:       n-gram size, forwarded to the finder/measure selection helpers.
    metric:  metric name, forwarded to `ngram_measures`.
    filters: filter names to apply; 'freq3' and 'freq5' apply a frequency
             filter with 3 and 5 respectively, 'stopwords' applies
             `stopword_filter`. Entries matching none of these are ignored.

    Returns the (headers, rows) pair produced by `ngrams_result`.
    """
    measure = ngram_measures(n, metric)
    finder = ngrams_from_words(n, tokens)

    for name in filters:
        if name == 'stopwords':
            finder.apply_word_filter(stopword_filter)
        elif name == 'freq5':
            finder.apply_freq_filter(5)
        elif name == 'freq3':
            finder.apply_freq_filter(3)

    ranked = finder.score_ngrams(measure)
    return ngrams_result(n, finder, ranked[:nResults], metric)

In [3]:
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output
from tabulate import tabulate

# Cache of the most recently tokenized book: changing only n, the metric or a
# filter reuses `tokens` instead of calling get_book and the tokenizer again.
last_book = 0
tokens = []

def show_ngrams(book, n, metric, stopwords, freq3, freq5):
    """Display the top n-grams of a book as a Markdown table.

    Used as the callback of the interactive widget; the parameter names match
    the keyword arguments given to `widgets.interactive`.

    book:      book id passed to `get_book`.
    n:         n-gram size.
    metric:    metric name passed to `ngrams`.
    stopwords: truthy to enable the 'stopwords' filter (inside this function
               the name shadows the `stopwords` corpus import).
    freq3:     truthy to enable the 'freq3' filter.
    freq5:     truthy to enable the 'freq5' filter.
    """
    global last_book, tokens
    if last_book != book:
        text = get_book(book).page_content
        # Keep purely alphabetic tokens only (drops punctuation and numbers).
        tokens = [token for token in nltk.word_tokenize(text) if token.isalpha()]
        last_book = book

    # Pass only the names of the enabled filters, so `ngrams` receives a real
    # list of strings rather than a list mixed with False placeholders.
    flags = {'stopwords': stopwords, 'freq3': freq3, 'freq5': freq5}
    active_filters = [name for name, enabled in flags.items() if enabled]

    headers, rows = ngrams(tokens, n, metric, active_filters)
    display(Markdown(tabulate(rows, headers, "pipe")))

# Controls for the interactive n-gram explorer.
opt_book = widgets.Dropdown(description='book', options=[
    ('A Study in Scarlet (en)', 244),
    ('Buddenbrooks: Verfall einer Familie (de)', 34811),
    ('Les trois mousquetaires (fr)', 13951),
    ('Bajki (pl)', 27729),
])
opt_metric = widgets.Dropdown(description='metric', options=['freq', 'pmi', 'lhr'])
opt_n = widgets.BoundedIntText(description='n', value=2, min=2, max=4)
opt_stopword = widgets.Checkbox(description='no stopwords')
# apply_freq_filter(k) removes n-grams occurring fewer than k times, so the
# surviving n-grams have frequency >= k; the labels state exactly that.
opt_freq3 = widgets.Checkbox(description='freq >= 3')
opt_freq5 = widgets.Checkbox(description='freq >= 5')

display(widgets.interactive(
    show_ngrams,
    book=opt_book, n=opt_n, metric=opt_metric,
    stopwords=opt_stopword, freq3=opt_freq3, freq5=opt_freq5,
))


interactive(children=(Dropdown(description='book', options=(('A Study in Scarlet (en)', 244), ('Buddenbrooks: …

### Manual calculation of PMI

In [4]:
import math
# N: number of alphabetic tokens in book 244 ('A Study in Scarlet'). It is
# counted explicitly here rather than read from the `tokens` global, because
# that global holds whichever book is currently selected in the interactive
# widget, which would make this result depend on UI state.
N = sum(1 for token in nltk.word_tokenize(get_book(244).page_content) if token.isalpha())
# Example counts tf(w1), tf(w2), tf(w1 w2): two words that each occur five
# times and always together.
tf1, tf2, tf12 = 5, 5, 5
p1, p2, p12 = tf1/N, tf2/N, tf12/N
# PMI = log2( P(w1 w2) / (P(w1) * P(w2)) )
math.log(p12/p1/p2, 2)

13.070154495134162

### Manual calculation of LHR

In [5]:
# Counts of the first word, the second word and the bigram; these are the
# values listed for 'Sherlock Holmes' in the summary output of this notebook.
c1, c2, c12 = 48, 94, 48
# Single probability of seeing w2, used for both positions (independence).
p = c2 / N
# Probability of w2 directly after w1 ...
p1 = c12 / c1
# ... and probability of w2 after any other word.
p2 = (c2 - c12) / (N - c1)
def L(k,n,p):
    """Return p**k * (1-p)**(n-k): the binomial likelihood of k hits in n
    trials with hit probability p, without the binomial coefficient."""
    hits = p ** k
    misses = (1 - p) ** (n - k)
    return hits * misses
# -2 * log(likelihood ratio): log-likelihood under the single probability p
# minus the log-likelihood under the two separate probabilities p1 and p2.
log_shared = math.log(L(c12, c1, p)) + math.log(L(c2 - c12, N - c1, p))
log_after_w1 = math.log(L(c12, c1, p1))
log_after_other = math.log(L(c2 - c12, N - c1, p2))
-2 * (log_shared - log_after_w1 - log_after_other)

618.2639397345096

### Simple summary of bi-gram calculation

In [6]:
from nltk.collocations import (
    BigramCollocationFinder, BigramAssocMeasures,  
    TrigramCollocationFinder, TrigramAssocMeasures,  
    QuadgramCollocationFinder, QuadgramAssocMeasures
)
from nltk.corpus import stopwords
# Alphabetic tokens of book 244.
# NOTE: this rebinds the notebook-level `tokens` that the interactive cell
# also uses as its cache.
tokens = [token for token in nltk.word_tokenize(get_book(244).page_content) if token.isalpha()]

# choose bi-grams, tri-grams, quad-grams
# Only the finder that is actually used gets built; the alternatives stay
# commented out so no work is spent on finders that are thrown away.
# finder = QuadgramCollocationFinder.from_words(tokens)
# finder = TrigramCollocationFinder.from_words(tokens)
finder = BigramCollocationFinder.from_words(tokens)

# choose a measure (must match with the finder, here for bi-grams)
# measure = BigramAssocMeasures.raw_freq
# measure = BigramAssocMeasures.pmi
measure = BigramAssocMeasures.likelihood_ratio


# apply frequency filter (removes n-grams that occur fewer than 3 times)
finder.apply_freq_filter(3)

# apply stop word filter; a set makes the per-token membership test a hash lookup
ignored_words = set(stopwords.words('english'))
stopword_filter = lambda w: len(w) < 3 or w.lower() in ignored_words
finder.apply_word_filter(stopword_filter)

# obtain results (top-k)
k = 20
scores = finder.score_ngrams(measure)[:k]

# output term 1, term 2, freq of term 1, freq of term 2, freq of bigram, score
for ((t1,t2),score) in scores:
    print(f'{t1} {t2} {finder.word_fd[t1]} {finder.word_fd[t2]} {finder.ngram_fd[(t1,t2)]} {score}')


Sherlock Holmes 48 94 48 618.2639397345179
Jefferson Hope 37 42 34 491.94785591179664
John Ferrier 31 58 26 330.1825164533987
Brixton Road 15 13 13 224.92059705760528
Salt Lake 9 9 9 170.48968950333003
Lake City 9 13 8 129.82912069112837
Enoch Drebber 9 62 9 119.12601098633465
Scotland Yard 8 6 6 109.52843062923058
Baker Street 6 11 6 103.36758969662593
Private Hotel 5 5 5 100.59482597348887
Lucy Ferrier 29 58 10 96.68084062615573
Lauriston Gardens 4 4 4 82.26110221688567
Joseph Stangerson 13 43 7 79.9799404000117
Never mind 5 37 5 71.28837734133977
little girl 80 27 8 68.66609037830294
young hunter 40 14 6 65.60029780468453
Audley Court 3 3 3 63.42198886696671
could see 96 56 9 61.56793780013763
young man 40 154 9 59.46754617578933
CHAPTER III 28 4 4 59.29458839273477


---