# Chapter 10: Word Sequences
$N$-grams and collocations

Programs from the book: [_Python for Natural Language Processing_](https://link.springer.com/book/9783031575488)

__Author__: Pierre Nugues

## Modules

In [1]:
import math
import regex as re
from collections import Counter

## Reading a corpus

In [2]:
# Selects the text(s) to load in the next cell: 'ILIAD' reads iliad.txt only,
# 'HOMER' reads iliad.txt followed by odyssey.txt.
CORPUS = 'HOMER'  # 'ILIAD' # 'HOMER'

In [3]:
PATH = '../datasets/classics/'

# Files that make up the selected corpus, in reading order.
if CORPUS == 'ILIAD':
    filenames = ['iliad.txt']
elif CORPUS == 'HOMER':
    filenames = ['iliad.txt', 'odyssey.txt']
else:
    # Fail here with a clear message instead of a NameError on `text` later.
    raise ValueError(
        f"Unknown CORPUS {CORPUS!r}: expected 'ILIAD' or 'HOMER'")

parts = []
for filename in filenames:
    # The context manager closes each file as soon as it has been read;
    # the explicit encoding makes the result independent of the locale.
    with open(PATH + filename, encoding='utf-8') as f:
        parts.append(f.read().strip())

# The texts are stripped individually, then joined with a single space.
text = ' '.join(parts)

text[:50]

'BOOK I\n\nSing, O goddess, the anger of Achilles son'

## The tokenizer

In [4]:
def tokenize(text):
    """Return the words of `text`, in order of appearance.

    A word is a maximal run of letters (Unicode property L). Every other
    character acts as a separator and is discarded.
    """
    letter_run = r'\p{L}+'
    return re.findall(letter_run, text)

## Unigrams

We tokenize the lowercased text and count the occurrences of each word

In [5]:
# Lowercase before tokenizing so that case variants of a word are counted together
words = tokenize(text.lower())
# Map each distinct word to its number of occurrences
word_freqs = Counter(words)
# Show the five most frequent words with their counts
word_freqs.most_common(5)

[('the', 15794), ('and', 11662), ('of', 8664), ('to', 6528), ('he', 4728)]

## Bigrams

We can extend the counts to pairs of words

In [6]:
words = tokenize(text.lower())
# Pair every word with its successor; the last word starts no bigram
bigrams = list(zip(words, words[1:]))
bigram_freqs = Counter(bigrams)

In [7]:
# Show the ten most frequent bigrams with their counts
bigram_freqs.most_common(10)

[(('of', 'the'), 1873),
 (('to', 'the'), 1011),
 (('son', 'of'), 953),
 (('in', 'the'), 929),
 (('and', 'the'), 776),
 (('on', 'the'), 549),
 (('the', 'achaeans'), 519),
 (('as', 'he'), 488),
 (('he', 'was'), 485),
 (('the', 'trojans'), 481)]

In [8]:
# Print the 15 most frequent bigrams by decreasing count. The sort is stable,
# so bigrams with equal counts stay in the order in which the counter holds them.
for bigram in sorted(bigram_freqs, key=lambda pair: -bigram_freqs[pair])[:15]:
    print(bigram, '\t', bigram_freqs[bigram])

('of', 'the') 	 1873
('to', 'the') 	 1011
('son', 'of') 	 953
('in', 'the') 	 929
('and', 'the') 	 776
('on', 'the') 	 549
('the', 'achaeans') 	 519
('as', 'he') 	 488
('he', 'was') 	 485
('the', 'trojans') 	 481
('from', 'the') 	 434
('of', 'his') 	 412
('the', 'son') 	 397
('for', 'the') 	 391
('by', 'the') 	 388


## Trigrams

In [9]:
# Align the word list with itself shifted by one and by two positions
trigrams = list(zip(words, words[1:], words[2:]))
trigram_freqs = Counter(trigrams)

In [10]:
# Show the ten most frequent trigrams with their counts
trigram_freqs.most_common(10)

[(('the', 'son', 'of'), 389),
 (('of', 'the', 'achaeans'), 210),
 (('the', 'house', 'of'), 131),
 (('son', 'of', 'atreus'), 129),
 (('son', 'of', 'peleus'), 106),
 (('out', 'of', 'the'), 103),
 (('as', 'he', 'spoke'), 95),
 (('the', 'trojans', 'and'), 95),
 (('as', 'soon', 'as'), 93),
 (('as', 'he', 'was'), 92)]

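`most_common()` leaves the items that have the same count in the order in which they were first encountered. To obtain a ranking that does not depend on the position of the items in the text, we can sort by decreasing frequency and then by lexical order, using a `lambda` function to define the sorting key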

In [11]:
# Rank by decreasing count, breaking ties by the lexical order of the trigram
ranked_trigrams = sorted(trigram_freqs.items(),
                         key=lambda item: (-item[1], item[0]))
for trigram, count in ranked_trigrams[:30]:
    print(trigram, '\t', count)

('the', 'son', 'of') 	 389
('of', 'the', 'achaeans') 	 210
('the', 'house', 'of') 	 131
('son', 'of', 'atreus') 	 129
('son', 'of', 'peleus') 	 106
('out', 'of', 'the') 	 103
('as', 'he', 'spoke') 	 95
('the', 'trojans', 'and') 	 95
('as', 'soon', 'as') 	 93
('as', 'he', 'was') 	 92
('even', 'so', 'did') 	 91
('son', 'of', 'tydeus') 	 85
('for', 'he', 'was') 	 83
('to', 'the', 'ships') 	 79
('of', 'the', 'trojans') 	 76
('on', 'to', 'the') 	 74
('in', 'the', 'house') 	 72
('son', 'of', 'saturn') 	 72
('to', 'the', 'ground') 	 70
('when', 'they', 'had') 	 70
('to', 'the', 'house') 	 67
('when', 'he', 'had') 	 67
('ships', 'of', 'the') 	 64
('in', 'front', 'of') 	 63
('the', 'city', 'of') 	 63
('the', 'ships', 'of') 	 63
('thus', 'did', 'he') 	 62
('him', 'in', 'the') 	 61
('the', 'body', 'of') 	 60
('the', 'hands', 'of') 	 60


## N-grams

In [12]:
# Length of the word sequences extracted in the next cells
n = 3

In [13]:
# One window per start position that leaves room for n words
n_windows = len(words) - n + 1
ngrams = [tuple(words[start:start + n]) for start in range(n_windows)]

In [14]:
# Map each distinct n-gram to its number of occurrences
ngram_freqs = Counter(ngrams)
# Show the five most frequent n-grams with their counts
ngram_freqs.most_common(5)

[(('the', 'son', 'of'), 389),
 (('of', 'the', 'achaeans'), 210),
 (('the', 'house', 'of'), 131),
 (('son', 'of', 'atreus'), 129),
 (('son', 'of', 'peleus'), 106)]

In [15]:
# Rank by decreasing count, breaking ties by the lexical order of the n-gram
ranked_ngrams = sorted(ngram_freqs.items(),
                       key=lambda item: (-item[1], item[0]))
for ngram, count in ranked_ngrams[:15]:
    print(ngram, '\t', count)

('the', 'son', 'of') 	 389
('of', 'the', 'achaeans') 	 210
('the', 'house', 'of') 	 131
('son', 'of', 'atreus') 	 129
('son', 'of', 'peleus') 	 106
('out', 'of', 'the') 	 103
('as', 'he', 'spoke') 	 95
('the', 'trojans', 'and') 	 95
('as', 'soon', 'as') 	 93
('as', 'he', 'was') 	 92
('even', 'so', 'did') 	 91
('son', 'of', 'tydeus') 	 85
('for', 'he', 'was') 	 83
('to', 'the', 'ships') 	 79
('of', 'the', 'trojans') 	 76


## Cooccurrence measures

All the cooccurrence measures below use the unigram and bigram counts, so we compute them first

In [16]:
# Unigram counts
word_freqs = Counter(words)

# Bigram counts: every word paired with its successor
bigrams = list(zip(words, words[1:]))
bigram_freqs = Counter(bigrams)

### Mutual information

In [17]:
def mutual_info(words, freq_unigrams, freq_bigrams):
    """Compute the mutual information of each bigram, in bits.

    For a bigram (w1, w2), the score is

        log2(N * N / (N - 1) * count(w1, w2) / (count(w1) * count(w2)))

    where N is the number of tokens. This is
    log2(P(w1, w2) / (P(w1) * P(w2))) with P(w) = count(w) / N and
    P(w1, w2) = count(w1, w2) / (N - 1).

    Args:
        words: The token sequence; only its length is used.
        freq_unigrams: Mapping from a word to its count.
        freq_bigrams: Mapping from a (w1, w2) tuple to its count.

    Returns:
        A dict mapping each bigram of `freq_bigrams` to its score.
        The dict is empty when `freq_bigrams` is empty.
    """
    if not freq_bigrams:
        # Nothing to score. Returning early also avoids dividing by
        # len(words) - 1 == 0 when the corpus holds a single token.
        return {}

    n_words = len(words)
    factor = n_words * n_words / (n_words - 1)
    mi = {}
    for bigram, count in freq_bigrams.items():
        mi[bigram] = math.log(
            factor * count /
            (freq_unigrams[bigram[0]] * freq_unigrams[bigram[1]]), 2)
    return mi

In [18]:
# Score every bigram of the corpus
mi = mutual_info(words, word_freqs, bigram_freqs)

In [19]:
# Rank by decreasing mutual information, breaking ties by lexical order.
# Columns: bigram, count of first word, count of second word, bigram count, score
ranked_mi = sorted(mi.items(), key=lambda item: (-item[1], item[0]))
for bigram, score in ranked_mi[:15]:
    print(bigram, '\t',
          word_freqs[bigram[0]], '\t',
          word_freqs[bigram[1]], '\t',
          bigram_freqs[bigram], '\t',
          score)

('abians', 'justest') 	 1 	 1 	 1 	 18.05062986856374
('acroneos', 'ocyalus') 	 1 	 1 	 1 	 18.05062986856374
('aesymnus', 'orus') 	 1 	 1 	 1 	 18.05062986856374
('agathon', 'pammon') 	 1 	 1 	 1 	 18.05062986856374
('agave', 'doto') 	 1 	 1 	 1 	 18.05062986856374
('aipy', 'cyparisseis') 	 1 	 1 	 1 	 18.05062986856374
('allotment', 'unplundered') 	 1 	 1 	 1 	 18.05062986856374
('alos', 'alope') 	 1 	 1 	 1 	 18.05062986856374
('amphoterus', 'epaltes') 	 1 	 1 	 1 	 18.05062986856374
('aretaon', 'ablerus') 	 1 	 1 	 1 	 18.05062986856374
('astypylus', 'mnesus') 	 1 	 1 	 1 	 18.05062986856374
('bessa', 'scarphe') 	 1 	 1 	 1 	 18.05062986856374
('blacksmith', 'plunges') 	 1 	 1 	 1 	 18.05062986856374
('bo', 'ies') 	 1 	 1 	 1 	 18.05062986856374
('boebe', 'glaphyrae') 	 1 	 1 	 1 	 18.05062986856374


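Mutual information is highly biased toward low-frequency words: all the top-ranked pairs above occur only once. We therefore discard the bigrams that occur fewer than `cutoff` times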

In [20]:
# Minimum number of occurrences a bigram needs to be kept
cutoff = 15
filtered_mi = {bigram: score
               for bigram, score in mi.items()
               if bigram_freqs[bigram] >= cutoff}

In [21]:
# Same ranking as before, restricted to the bigrams that passed the cutoff
ranked_filtered_mi = sorted(filtered_mi.items(),
                            key=lambda item: (-item[1], item[0]))
for bigram, score in ranked_filtered_mi[:15]:
    print(bigram, '\t',
          word_freqs[bigram[0]], '\t',
          word_freqs[bigram[1]], '\t',
          bigram_freqs[bigram], '\t',
          score)

('rosy', 'fingered') 	 25 	 26 	 25 	 13.350190150422646
('mixing', 'bowl') 	 33 	 29 	 17 	 12.235717595328053
('barley', 'meal') 	 29 	 35 	 18 	 12.233290857933513
('fingered', 'dawn') 	 26 	 48 	 22 	 12.224659268338788
('dawn', 'appeared') 	 48 	 29 	 21 	 12.000003795493768
('thigh', 'bones') 	 37 	 47 	 21 	 11.67890507403591
('outer', 'court') 	 36 	 56 	 24 	 11.658312445784977
('aegis', 'bearing') 	 45 	 64 	 32 	 11.558776772234063
('morning', 'rosy') 	 85 	 25 	 20 	 11.319310837538673
('ox', 'hide') 	 32 	 53 	 15 	 11.229600009609058
('single', 'handed') 	 57 	 37 	 15 	 10.915177084378564
('store', 'room') 	 42 	 95 	 20 	 10.410384932341392
('phoebus', 'apollo') 	 39 	 170 	 33 	 10.400230832922242
('drink', 'offering') 	 141 	 41 	 24 	 10.138489012268018
('drink', 'offerings') 	 141 	 45 	 25 	 10.063081609609995


### Likelihood ratio

In [22]:
def likelihood_ratio(words, word_freqs, bigram_freqs):
    """Compute the log-likelihood ratio of each bigram.

    For a bigram (w1, w2), with N = len(words), the function estimates

        p  = count(w2) / N
        p1 = count(w1, w2) / count(w1)
        p2 = (count(w2) - count(w1, w2)) / (N - count(w1))

    and returns

        2 * (log_f(c12, c1, p1) + log_f(c2 - c12, N - c1, p2)
             - log_f(c12, c1, p) - log_f(c2 - c12, N - c1, p))

    Bigrams with p1 == 1 (w1 always followed by w2) or p2 == 0 (w2 only
    found after w1) are scored too: `log_f` counts a term with a zero
    exponent as 0 instead of evaluating log(0).

    Args:
        words: The token sequence; only its length is used.
        word_freqs: Mapping from a word to its count.
        bigram_freqs: Mapping from a (w1, w2) tuple to its count.

    Returns:
        A dict mapping bigrams to their score. A bigram is left out when
        the counts give no valid p2: no token other than w1, or more
        occurrences of w2 outside the bigram than tokens other than w1.
        This can only happen on tiny, degenerate corpora.
    """
    n_words = len(words)
    lr = {}
    for bigram, c12 in bigram_freqs.items():
        c1 = word_freqs[bigram[0]]
        c2 = word_freqs[bigram[1]]
        k2 = c2 - c12       # occurrences of w2 outside the bigram
        n2 = n_words - c1   # tokens other than w1
        if n2 <= 0 or k2 > n2:
            # p2 would be undefined or greater than 1
            continue
        p = c2 / n_words
        p1 = c12 / c1
        p2 = k2 / n2
        lr[bigram] = 2.0 * (
            log_f(c12, c1, p1) +
            log_f(k2, n2, p2) -
            log_f(c12, c1, p) -
            log_f(k2, n2, p))
    return lr


def log_f(k, N, p):
    """Return the logarithm of p**k * (1 - p)**(N - k).

    A factor with a zero exponent contributes 0, so that p == 0 with
    k == 0 and p == 1 with k == N are valid arguments.
    """
    return _xlogy(k, p) + _xlogy(N - k, 1 - p)


def _xlogy(count, prob):
    """Return count * log(prob), with the convention 0 * log(0) == 0."""
    return count * math.log(prob) if count else 0.0

In [23]:
lr = likelihood_ratio(words, word_freqs, bigram_freqs)

# Rank by decreasing likelihood ratio, breaking ties by lexical order.
# Columns: bigram, count of first word, count of second word, bigram count, score
ranked_lr = sorted(lr.items(), key=lambda item: (-item[1], item[0]))
for bigram, score in ranked_lr[:15]:
    print(bigram, "\t", word_freqs[bigram[0]], "\t", word_freqs[bigram[1]], "\t",
          bigram_freqs[bigram], '\t', score)

('son', 'of') 	 1290 	 8664 	 953 	 5209.0273578509805
('of', 'the') 	 8664 	 15794 	 1873 	 2557.1296105680667
('the', 'achaeans') 	 15794 	 601 	 519 	 2499.2751029700266
('i', 'am') 	 3195 	 342 	 294 	 2363.33694579609
('the', 'trojans') 	 15794 	 573 	 481 	 2255.997249339922
('the', 'the') 	 15794 	 15794 	 4 	 1899.7875260145374
('he', 'was') 	 4728 	 2166 	 485 	 1729.0250078236386
('at', 'once') 	 1207 	 233 	 176 	 1674.4429057758052
('as', 'he') 	 2578 	 4728 	 488 	 1569.0800730942938
('i', 'will') 	 3195 	 1505 	 340 	 1474.440123164859
('you', 'are') 	 3716 	 1117 	 321 	 1463.38557310121
('one', 'another') 	 993 	 287 	 161 	 1441.477222814483
('in', 'the') 	 3870 	 15794 	 929 	 1405.7896939456114
('even', 'so') 	 469 	 1409 	 175 	 1247.0054785120046
('let', 'us') 	 536 	 610 	 147 	 1203.0767601206207


### T-scores

In [24]:
def t_scores(words, word_freqs, bigram_freqs):
    """Compute the t-score of each bigram.

    For a bigram (w1, w2), the score is

        (count(w1, w2) - count(w1) * count(w2) / N) / sqrt(count(w1, w2))

    where N is the number of tokens.

    Args:
        words: The token sequence; only its length is used.
        word_freqs: Mapping from a word to its count.
        bigram_freqs: Mapping from a (w1, w2) tuple to its count.

    Returns:
        A dict mapping each bigram of `bigram_freqs` to its t-score.
    """
    n_words = len(words)
    ts = {}
    for bigram, observed in bigram_freqs.items():
        # Count expected if the two words were independent
        expected = word_freqs[bigram[0]] * word_freqs[bigram[1]] / n_words
        ts[bigram] = (observed - expected) / math.sqrt(observed)
    return ts

In [25]:
ts = t_scores(words, word_freqs, bigram_freqs)

# Rank by decreasing t-score, breaking ties by lexical order.
# Columns: bigram, count of first word, count of second word, bigram count, score
ranked_ts = sorted(ts.items(), key=lambda item: (-item[1], item[0]))
for bigram, score in ranked_ts[:15]:
    print(bigram, "\t", word_freqs[bigram[0]], "\t", word_freqs[bigram[1]], "\t",
          bigram_freqs[bigram], '\t', score)

('of', 'the') 	 8664 	 15794 	 1873 	 31.632559609220863
('son', 'of') 	 1290 	 8664 	 953 	 29.537231081902494
('in', 'the') 	 3870 	 15794 	 929 	 23.09339064531879
('the', 'achaeans') 	 15794 	 601 	 519 	 21.246942166107083
('the', 'trojans') 	 15794 	 573 	 481 	 20.41188299332636
('he', 'was') 	 4728 	 2166 	 485 	 20.309998502446987
('as', 'he') 	 2578 	 4728 	 488 	 20.05850054978281
('to', 'the') 	 6528 	 15794 	 1011 	 19.853120651062955
('on', 'the') 	 1941 	 15794 	 549 	 18.611802965260487
('i', 'will') 	 3195 	 1505 	 340 	 17.478608590799595
('he', 'had') 	 4728 	 1731 	 357 	 17.299075188669896
('you', 'are') 	 3716 	 1117 	 321 	 17.06318232738217
('i', 'am') 	 3195 	 342 	 294 	 16.911711563645227
('from', 'the') 	 1536 	 15794 	 434 	 16.54363696048748
('the', 'son') 	 15794 	 1290 	 397 	 16.158626064246796
