# Word Sequences
Author: Pierre Nugues

# Imports

In [1]:
import math
import regex as re
import sys

## Reading a corpus

In [2]:
file_name = '../../corpus/Selma.txt'
# Read through a context manager so the file handle is closed right away,
# and state the encoding explicitly: the platform default is not UTF-8
# everywhere, which would garble the Swedish letters (å, ä, ö).
# NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm.
with open(file_name, encoding='utf-8') as corpus_file:
    text = corpus_file.read().strip()
text[:50]

'Nils Holgerssons underbara resa genom Sverige\nSelm'

## The tokenizer

In [3]:
def tokenize(text):
    """Split `text` into words.

    A word is a maximal run of Unicode letters; every other character
    (digits, punctuation, whitespace) acts as a separator and is dropped.
    The letter class syntax requires the `regex` module, imported as `re`.

    Returns the list of words in their order of appearance.
    """
    return re.findall(r'\p{L}+', text)

## Unigrams

A function to count the words

In [4]:
def count_unigrams(words):
    """Count how many times each word occurs in `words`.

    Returns a dict mapping each distinct word to its number of
    occurrences; keys appear in order of first occurrence.
    """
    frequency = {}
    for token in words:
        frequency[token] = frequency.get(token, 0) + 1
    return frequency

We analyze Selma Lagerlöf

In [5]:
# Lowercase the corpus, tokenize it, and list the 15 most frequent words
words = tokenize(text.lower())
frequency = count_unigrams(words)
ranked_words = sorted(frequency.items(), key=lambda item: item[1], reverse=True)
for word, count in ranked_words[:15]:
    print(word, '\t', count)

och 	 37799
att 	 28914
han 	 22743
det 	 22087
i 	 17072
som 	 16790
hade 	 14955
på 	 14634
hon 	 14093
en 	 13921
inte 	 13826
var 	 12852
de 	 12599
den 	 11773
för 	 9811


## Bigrams

We can extend the counts to pairs of words

In [6]:
def count_bigrams(words):
    """Count the pairs of adjacent words in `words`.

    Returns a dict mapping each `(word, next_word)` tuple to its number
    of occurrences; it is empty when `words` has fewer than two items.
    """
    frequencies = {}
    # Pair every word with its right neighbor
    for pair in zip(words, words[1:]):
        frequencies[pair] = frequencies.get(pair, 0) + 1
    return frequencies

In [7]:
# Tokenize the lowercased corpus and list the 15 most frequent bigrams
words = tokenize(text.lower())
frequency_bigrams = count_bigrams(words)
ranked_bigrams = sorted(frequency_bigrams.items(), key=lambda item: item[1], reverse=True)
for bigram, count in ranked_bigrams[:15]:
    print(bigram, '\t', count)

('det', 'var') 	 4024
('att', 'han') 	 3064
('för', 'att') 	 3007
('han', 'hade') 	 2352
('att', 'det') 	 2152
('det', 'är') 	 2114
('att', 'hon') 	 1854
('hon', 'hade') 	 1469
('att', 'de') 	 1365
('så', 'att') 	 1313
('att', 'jag') 	 1187
('han', 'var') 	 1061
('han', 'inte') 	 1023
('var', 'det') 	 1021
('som', 'hade') 	 1002


## Trigrams

In [8]:
def count_trigrams(words):
    """Count the triples of consecutive words in `words`.

    Returns a dict mapping each 3-tuple of adjacent words to its number
    of occurrences; it is empty when `words` has fewer than three items.
    """
    frequencies = {}
    # Align each word with its next two neighbors
    for triple in zip(words, words[1:], words[2:]):
        frequencies[triple] = frequencies.get(triple, 0) + 1
    return frequencies

In [9]:
# Tokenize the lowercased corpus and list the 30 most frequent trigrams
words = tokenize(text.lower())
frequency_trigrams = count_trigrams(words)
ranked_trigrams = sorted(frequency_trigrams.items(), key=lambda item: item[1], reverse=True)
for trigram, count in ranked_trigrams[:30]:
    print(trigram, '\t', count)

('att', 'det', 'var') 	 555
('att', 'han', 'hade') 	 422
('att', 'han', 'inte') 	 410
('det', 'var', 'en') 	 357
('att', 'han', 'skulle') 	 333
('det', 'var', 'inte') 	 298
('att', 'hon', 'inte') 	 278
('i', 'alla', 'fall') 	 272
('men', 'det', 'var') 	 269
('och', 'det', 'var') 	 253
('så', 'att', 'han') 	 239
('att', 'det', 'inte') 	 231
('det', 'var', 'som') 	 231
('som', 'han', 'hade') 	 226
('att', 'hon', 'hade') 	 224
('att', 'hon', 'skulle') 	 223
('att', 'han', 'var') 	 222
('för', 'att', 'få') 	 221
('att', 'det', 'är') 	 220
('som', 'om', 'han') 	 205
('det', 'är', 'inte') 	 170
('om', 'han', 'hade') 	 165
('därför', 'att', 'han') 	 164
('att', 'de', 'inte') 	 152
('så', 'att', 'det') 	 151
('för', 'att', 'se') 	 151
('han', 'hade', 'varit') 	 151
('än', 'en', 'gång') 	 149
('att', 'de', 'hade') 	 145
('att', 'jag', 'inte') 	 144


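Python dictionaries preserve insertion order, so trigrams with the same frequency appear above in the order in which they were first seen in the corpus. To get a deterministic order, we sort by decreasing frequency and then by lexical order, using a `lambda` function to define the sorting key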

In [10]:
# Rank by decreasing frequency, breaking ties by the lexical order of the trigram
ranked_trigrams = sorted(frequency_trigrams.items(),
                         key=lambda item: (-item[1], item[0]))
for trigram, count in ranked_trigrams[:30]:
    print(trigram, '\t', count)

('att', 'det', 'var') 	 555
('att', 'han', 'hade') 	 422
('att', 'han', 'inte') 	 410
('det', 'var', 'en') 	 357
('att', 'han', 'skulle') 	 333
('det', 'var', 'inte') 	 298
('att', 'hon', 'inte') 	 278
('i', 'alla', 'fall') 	 272
('men', 'det', 'var') 	 269
('och', 'det', 'var') 	 253
('så', 'att', 'han') 	 239
('att', 'det', 'inte') 	 231
('det', 'var', 'som') 	 231
('som', 'han', 'hade') 	 226
('att', 'hon', 'hade') 	 224
('att', 'hon', 'skulle') 	 223
('att', 'han', 'var') 	 222
('för', 'att', 'få') 	 221
('att', 'det', 'är') 	 220
('som', 'om', 'han') 	 205
('det', 'är', 'inte') 	 170
('om', 'han', 'hade') 	 165
('därför', 'att', 'han') 	 164
('att', 'de', 'inte') 	 152
('för', 'att', 'se') 	 151
('han', 'hade', 'varit') 	 151
('så', 'att', 'det') 	 151
('än', 'en', 'gång') 	 149
('att', 'de', 'hade') 	 145
('att', 'jag', 'inte') 	 144


## N-grams

In [11]:
def count_ngrams(words, n):
    """Count the sequences of `n` consecutive words in `words`.

    Args:
        words: the sequence of tokens; it must support slicing.
        n: the length of the n-grams, at least 1.

    Returns:
        A dict mapping each n-gram, stored as a tuple of `n` words, to its
        number of occurrences. It is empty when `words` has fewer than
        `n` items.

    Raises:
        ValueError: if `n` is smaller than 1. Such values would otherwise
            silently count empty or meaningless tuples.
    """
    if n < 1:
        raise ValueError('n must be at least 1, got {}'.format(n))
    frequencies = {}
    # Slide a window of width n over the words, counting as we go instead of
    # first building the list of all the n-grams in memory
    for idx in range(len(words) - n + 1):
        ngram = tuple(words[idx:idx + n])
        frequencies[ngram] = frequencies.get(ngram, 0) + 1
    return frequencies

In [12]:
# Length of the n-grams counted in the next cell
N = 10

In [13]:
# Tokenize the lowercased corpus and list the 15 most frequent N-grams,
# breaking ties by the lexical order of the n-gram
words = tokenize(text.lower())
frequency_ngrams = count_ngrams(words, N)
ranked_ngrams = sorted(frequency_ngrams.items(),
                       key=lambda item: (-item[1], item[0]))
for ngram, count in ranked_ngrams[:15]:
    print(ngram, '\t', count)

('haver', 'sagt', 'österrike', 'portugal', 'metz', 'japan', 'som', 'det', 'var', 'bom') 	 6
('japan', 'som', 'det', 'var', 'bom', 'bom', 'bom', 'å', 'rulla', 'bom') 	 6
('metz', 'japan', 'som', 'det', 'var', 'bom', 'bom', 'bom', 'å', 'rulla') 	 6
('portugal', 'metz', 'japan', 'som', 'det', 'var', 'bom', 'bom', 'bom', 'å') 	 6
('sagt', 'österrike', 'portugal', 'metz', 'japan', 'som', 'det', 'var', 'bom', 'bom') 	 6
('som', 'det', 'var', 'bom', 'bom', 'bom', 'å', 'rulla', 'bom', 'bom') 	 6
('som', 'tidningen', 'haver', 'sagt', 'österrike', 'portugal', 'metz', 'japan', 'som', 'det') 	 6
('tidningen', 'haver', 'sagt', 'österrike', 'portugal', 'metz', 'japan', 'som', 'det', 'var') 	 6
('österrike', 'portugal', 'metz', 'japan', 'som', 'det', 'var', 'bom', 'bom', 'bom') 	 6
('han', 'satt', 'i', 'nästa', 'rum', 'och', 'så', 'kastade', 'han', 'sig') 	 5
('i', 'nästa', 'rum', 'och', 'så', 'kastade', 'han', 'sig', 'över', 'oss') 	 5
('satt', 'i', 'nästa', 'rum', 'och', 'så', 'kastade', 'han', 'si

## Cooccurrence measures

All the cooccurrence measures below need the unigram and bigram counts of the corpus, so we compute them once here

In [14]:
# Unigram and bigram counts shared by the cooccurrence measures below
frequency = count_unigrams(words)
frequency_bigrams = count_bigrams(words)

### Mutual information

In [15]:
def mutual_info(words, freq_unigrams, freq_bigrams):
    """Compute the mutual information, in bits, of each bigram.

    For a bigram (w1, w2) and a corpus of N words, the score is
    log2(N^2 * C(w1, w2) / ((N - 1) * C(w1) * C(w2))).

    Args:
        words: the list of tokens; only its length is used.
        freq_unigrams: dict mapping each word to its count.
        freq_bigrams: dict mapping each (w1, w2) tuple to its count.

    Returns:
        A dict mapping each bigram of `freq_bigrams` to its score.
    """
    # Nothing to score: return before computing the factor, which would
    # divide by zero for a corpus reduced to one word
    if not freq_bigrams:
        return {}
    n_words = len(words)
    factor = n_words * n_words / (n_words - 1)
    mi = {}
    for bigram, bigram_count in freq_bigrams.items():
        unigram_product = (freq_unigrams[bigram[0]] *
                           freq_unigrams[bigram[1]])
        mi[bigram] = math.log(factor * bigram_count / unigram_product, 2)
    return mi

In [16]:
# Mutual information of every bigram in the corpus
mi = mutual_info(words, frequency, frequency_bigrams)

Mutual information is highly biased toward low-frequency words, so we keep only the bigrams that occur at least `cutoff` times

In [17]:
# Minimum number of occurrences a bigram needs to be kept
cutoff = 5
filtered_mi = {
    bigram: score
    for bigram, score in mi.items()
    if frequency_bigrams[bigram] >= cutoff
}

In [18]:
# Columns: bigram, count of the first word, count of the second word,
# count of the bigram, mutual information
ranked_mi = sorted(filtered_mi.items(), key=lambda item: (-item[1], item[0]))
for bigram, score in ranked_mi[:15]:
    first_count = frequency[bigram[0]]
    second_count = frequency[bigram[1]]
    print(bigram, '\t', first_count, '\t', second_count, '\t',
          frequency_bigrams[bigram], '\t', score)

('atterdag', 'brandskattar') 	 5 	 7 	 5 	 17.068629848637798
('el', 'aksa') 	 7 	 5 	 5 	 17.068629848637798
('metz', 'japan') 	 6 	 7 	 6 	 17.068629848637798
('new', 'york') 	 7 	 7 	 7 	 17.068629848637798
('rättar', 'söderlind') 	 7 	 5 	 5 	 17.068629848637798
('portugal', 'metz') 	 8 	 6 	 6 	 16.8759847706954
('xxii', 'karrs') 	 8 	 8 	 8 	 16.8759847706954
('österrike', 'portugal') 	 6 	 8 	 6 	 16.8759847706954
('valdemar', 'atterdag') 	 9 	 5 	 5 	 16.70605976925309
('xliii', 'västerbotten') 	 5 	 9 	 5 	 16.70605976925309
('neljä', 'viisi') 	 7 	 7 	 5 	 16.583203021467558
('svängda', 'brätten') 	 9 	 6 	 5 	 16.443025363419295
('tidiga', 'morgonstunden') 	 11 	 5 	 5 	 16.416553152058103
('sophie', 'elkan') 	 11 	 11 	 10 	 16.279049628308172
('britta', 'lambert') 	 13 	 9 	 9 	 16.17554505255431


### Likelihood ratio

In [19]:
def likelihood_ratio(words, freq_unigrams, freq_bigrams):
    """Compute the log-likelihood ratio of each bigram.

    For a bigram (w1, w2), the score compares two hypotheses: w2 occurs
    with the same probability `p` whether or not it follows w1, against
    w2 occurring with probability `p1` after w1 and `p2` elsewhere.

    Bigrams with `p1 == 1` (w1 is always followed by w2) or `p2 == 0`
    (w2 only occurs after w1) are scored like the others instead of being
    left out: `log_f` treats the vanishing term as zero.

    Args:
        words: the list of tokens; only its length is used.
        freq_unigrams: dict mapping each word to its count.
        freq_bigrams: dict mapping each (w1, w2) tuple to its count.

    Returns:
        A dict mapping each bigram of `freq_bigrams` to its score.
    """
    total = len(words)
    lr = {}
    for bigram, c12 in freq_bigrams.items():
        c1 = freq_unigrams[bigram[0]]
        c2 = freq_unigrams[bigram[1]]
        # Number of positions that do not hold w1, and number of
        # occurrences of w2 that do not follow w1
        rest = total - c1
        c2_elsewhere = c2 - c12
        p = c2 / total
        p1 = c12 / c1
        p2 = c2_elsewhere / rest
        lr[bigram] = 2.0 * (
            log_f(c12, c1, p1) +
            log_f(c2_elsewhere, rest, p2) -
            log_f(c12, c1, p) -
            log_f(c2_elsewhere, rest, p))
    return lr


def log_f(k, N, p):
    """Return the log of p^k * (1 - p)^(N - k).

    A factor with a zero exponent contributes nothing, even when its base
    is 0, so that `p == 0` with `k == 0` and `p == 1` with `k == N` yield
    a finite value instead of a math domain error.
    """
    result = 0.0
    if k:
        result += k * math.log(p)
    if N - k:
        result += (N - k) * math.log(1 - p)
    return result

In [20]:
lr = likelihood_ratio(words, frequency, frequency_bigrams)

# Columns: bigram, count of the first word, count of the second word,
# count of the bigram, likelihood ratio
ranked_lr = sorted(lr.items(), key=lambda item: (-item[1], item[0]))
for bigram, score in ranked_lr[:15]:
    first_count = frequency[bigram[0]]
    second_count = frequency[bigram[1]]
    print(bigram, "\t", first_count, "\t", second_count, "\t",
          frequency_bigrams[bigram], '\t', score)

('det', 'var') 	 22087 	 12852 	 4024 	 15501.405961381693
('för', 'att') 	 9811 	 28914 	 3007 	 9674.805977699376
('det', 'är') 	 22087 	 6604 	 2114 	 8070.401027389045
('ett', 'par') 	 5224 	 800 	 776 	 8000.628446635721
('han', 'hade') 	 22743 	 14955 	 2352 	 5393.1480120694905
('hade', 'varit') 	 14955 	 1824 	 889 	 4957.154755381902
('att', 'han') 	 28914 	 22743 	 3064 	 4912.6058663810545
('in', 'i') 	 2147 	 17072 	 856 	 4101.641612320935
('en', 'gång') 	 13921 	 1332 	 695 	 4096.222880805115
('därför', 'att') 	 906 	 28914 	 666 	 3650.285979713226
('klara', 'gulla') 	 304 	 214 	 213 	 3627.243582541728
('annat', 'än') 	 982 	 2630 	 407 	 3540.9682107325316
('sven', 'elversson') 	 346 	 232 	 215 	 3464.0422768111766
('hon', 'hade') 	 14093 	 14955 	 1469 	 3318.3298605444725
('hade', 'kommit') 	 14955 	 923 	 538 	 3257.5113136380414


### T-scores

In [21]:
def t_scores(words, freq_unigrams, freq_bigrams):
    """Compute the t-score of each bigram.

    For a bigram (w1, w2) and a corpus of N words, the score is
    (C(w1, w2) - C(w1) * C(w2) / N) / sqrt(C(w1, w2)).

    Args:
        words: the list of tokens; only its length is used.
        freq_unigrams: dict mapping each word to its count.
        freq_bigrams: dict mapping each (w1, w2) tuple to its count.

    Returns:
        A dict mapping each bigram of `freq_bigrams` to its t-score.
    """
    total = len(words)
    scores = {}
    for bigram, observed in freq_bigrams.items():
        first, second = bigram[0], bigram[1]
        # Bigram count expected if the two words were independent
        expected = freq_unigrams[first] * freq_unigrams[second] / total
        scores[bigram] = (observed - expected) / math.sqrt(observed)
    return scores

In [22]:
ts = t_scores(words, frequency, frequency_bigrams)

# Columns: bigram, count of the first word, count of the second word,
# count of the bigram, t-score
ranked_ts = sorted(ts.items(), key=lambda item: (-item[1], item[0]))
for bigram, score in ranked_ts[:15]:
    first_count = frequency[bigram[0]]
    second_count = frequency[bigram[1]]
    print(bigram, "\t", first_count, "\t", second_count, "\t",
          frequency_bigrams[bigram], '\t', score)

('det', 'var') 	 22087 	 12852 	 4024 	 58.784381299609926
('för', 'att') 	 9811 	 28914 	 3007 	 49.459768903363944
('att', 'han') 	 28914 	 22743 	 3064 	 43.006894978233305
('det', 'är') 	 22087 	 6604 	 2114 	 42.68121626005159
('han', 'hade') 	 22743 	 14955 	 2352 	 41.20875153156267
('att', 'hon') 	 28914 	 14093 	 1854 	 33.22275512003115
('hon', 'hade') 	 14093 	 14955 	 1469 	 32.612593974488576
('att', 'det') 	 28914 	 22087 	 2152 	 32.082389179672
('hade', 'varit') 	 14955 	 1824 	 889 	 28.865294187312568
('så', 'att') 	 9558 	 28914 	 1313 	 28.308947395316558
('in', 'i') 	 2147 	 17072 	 856 	 27.95547341366061
('ett', 'par') 	 5224 	 800 	 776 	 27.700858955995376
('att', 'de') 	 28914 	 12599 	 1365 	 26.698575301782878
('jag', 'har') 	 9640 	 5138 	 805 	 26.558234722068747
('att', 'jag') 	 28914 	 9640 	 1187 	 26.044866545482744
