# Counting words
Author: Pierre Nugues

# Imports

In [36]:
import math
import regex as re
import sys

## Tokenization

Tokenization has no unique solution. Let us explore some possible strategies.

First, let us take a text

In [37]:
# Short sample text used to compare the tokenizers in the cells below.
text = """Tell me, O muse, of that ingenious hero who
travelled far and wide after he had sacked the famous
town of Troy."""

## Using content

A first tokenizer: sequences of letters

In [38]:
# \p{L} is the Unicode "letter" property, available because `re` is the
# regex module here; the + groups consecutive letters into one token.
pattern1 = r'\p{L}+'

In [39]:
# Only the letter sequences are returned: the commas and the final period are dropped.
re.findall(pattern1, text)

['Tell',
 'me',
 'O',
 'muse',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy']

Let us add the other characters

In [40]:
# Second alternative: any run of characters that are neither whitespace nor letters.
pattern2 = r'\p{L}+|[^\s\p{L}]+'

In [41]:
# The commas and the period now appear as tokens of their own.
re.findall(pattern2, text)

['Tell',
 'me',
 ',',
 'O',
 'muse',
 ',',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy',
 '.']

Next, we give the numbers their own class of tokens

In [42]:
# Three classes: runs of letters, runs of number characters (\p{N}), and runs
# of anything else that is not whitespace.
pattern3 = r'\p{L}+|\p{N}+|[^\s\p{L}\p{N}]+'

In [43]:
# The sample text contains no digits, so the result is the same as with pattern2.
re.findall(pattern3, text)

['Tell',
 'me',
 ',',
 'O',
 'muse',
 ',',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy',
 '.']

Finally, we make each punctuation mark a separate token

In [44]:
# \p{P} has no quantifier: each punctuation character is a token by itself,
# while the last alternative still groups the remaining non-whitespace characters.
pattern4 = r'\p{L}+|\p{N}+|\p{P}|[^\s\p{L}\p{N}\p{P}]+'

In [45]:
# Every punctuation mark in the sample is a single character, so the result is unchanged.
re.findall(pattern4, text)

['Tell',
 'me',
 ',',
 'O',
 'muse',
 ',',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy',
 '.']

## Using boundaries: A first tokenizer

In [52]:
# A boundary is any run of whitespace characters (spaces, newlines, ...).
pattern5 = r'\s+'

In [55]:
# Splitting on whitespace leaves the punctuation attached to the preceding
# word ('me,', 'muse,', 'Troy.').
re.split(pattern5, text)

['Tell',
 'me,',
 'O',
 'muse,',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy.']

Keeping the punctuation: we surround it with spaces before splitting on whitespace

In [56]:
# Runs of symbol (\p{S}) or punctuation (\p{P}) characters, captured as
# group 1 so that a substitution can refer back to the matched run.
pattern6 = r'([\p{S}\p{P}]+)'

In [64]:
# Pad every run of symbols/punctuation with a space on each side so that it is
# detached from the neighbouring word, then cut the padded text on whitespace.
# The space added after the final '.' yields a trailing empty string.
re.split(pattern5, re.sub(pattern6, r' \1 ', text))

['Tell',
 'me',
 ',',
 'O',
 'muse',
 ',',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy',
 '.',
 '']

In [66]:
# Pad the punctuation with spaces, split on whitespace, and keep only the
# non-empty strings, which removes the '' left by a trailing separator.
[token
 for token in re.split(pattern5, re.sub(pattern6, r' \1 ', text))
 if token]

['Tell',
 'me',
 ',',
 'O',
 'muse',
 ',',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy',
 '.']

## Reading a corpus

In [71]:
file_name = '../../corpus/Selma.txt'
# Read the whole corpus in one go. The context manager closes the file handle
# as soon as the text is loaded, and the explicit encoding keeps the Swedish
# letters (å, ä, ö) intact whatever the platform default is.
# NOTE(review): assumes the corpus is stored as UTF-8 -- confirm.
with open(file_name, encoding='utf-8') as corpus_file:
    text = corpus_file.read().strip()
# Preview of the first 100 characters
text[:100]

'Nils Holgerssons underbara resa genom Sverige\nSelma Lagerlöf\n\nInnehåll\n\tDen kristna dagvisan - Sveri'

## Counting and sorting

We redefine the tokenizer

In [78]:
def tokenize(text, pattern=r'\p{L}+'):
    """Split ``text`` into tokens.

    The tokens are the matches of ``pattern`` in ``text``, as returned by
    ``re.findall``. With the default pattern, a token is a maximal
    sequence of letters.
    """
    return re.findall(pattern, text)

A function to count the words

In [79]:
def count_unigrams(words):
    """Count how many times each item occurs in ``words``.

    Returns a dictionary mapping every distinct item to its number of
    occurrences; the keys are in order of first appearance.
    """
    counts = {}
    for token in words:
        # A token seen for the first time starts from zero
        counts[token] = counts.get(token, 0) + 1
    return counts

We analyze Selma Lagerlöf's novels

In [80]:
# Lowercase the corpus before tokenizing so that case variants of a word
# are counted together.
words = tokenize(text.lower())
frequency = count_unigrams(words)
# Print the 15 most frequent words, highest count first.
for word in sorted(frequency, key=lambda w: frequency[w], reverse=True)[:15]:
    print(word, '\t', frequency[word])

och 	 37799
att 	 28914
han 	 22743
det 	 22087
i 	 17072
som 	 16790
hade 	 14955
på 	 14634
hon 	 14093
en 	 13921
inte 	 13826
var 	 12852
de 	 12599
den 	 11773
för 	 9811
