# Counting words
Author: Pierre Nugues

# Imports

In [1]:
import math
import regex as re
import sys

## Tokenization

Tokenization has no unique solution. Let us explore some possible strategies.

First, let us take a text:

In [4]:
text = """Tell me, O muse, of that ingenious hero who
travelled far and wide after he had sacked the famous
town of Troy."""

Using boundaries: A first tokenizer

In [5]:
def tokenize(text):
    """Break the text into words, using runs of nonletters as boundaries.

    text: the string to tokenize.
    Returns the list of words; it never contains empty strings.
    """
    # Alternative boundary patterns:
    # words = re.split(r'[\s\-,;:!?.’\'«»()–...&‘’“”*—]+', text)
    # words = re.split(r'[^a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’\-]+', text)
    # words = re.split(r'\W+', text)
    pieces = re.split(r'\P{L}+', text)
    # The split yields an empty string whenever the text starts or ends
    # with a boundary. Filter all of them out: list.remove('') would drop
    # only the first one and raise ValueError when there is none.
    return [piece for piece in pieces if piece]

tokenize(text)

['Tell',
 'me',
 'O',
 'muse',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy']

Using content: A second one

In [6]:
def tokenize2(text):
    """Extract the words of the text by matching their content.

    A word is a maximal run of letters.
    Returns the list of words in their order of appearance.
    """
    # Alternative content patterns:
    # re.findall(r'[a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’\-]+', text)
    # re.findall(r'\w+', text)
    return re.findall(r'\p{L}+', text)

tokenize2(text)

['Tell',
 'me',
 'O',
 'muse',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy']

Using punctuation: A third one

In [7]:
def tokenize3(text):
    """Tokenize the text, keeping every punctuation sign as a token.

    Runs of characters that are neither letters nor punctuation act as
    separators.
    Returns the list of tokens (words and punctuation signs).
    """
    # Earlier variants with explicit character classes:
    # text = re.sub(r'[^a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’'()\-,.?!:;]+', '\n', text)
    # text = re.sub('([,.?!:;)('-])', r'\n\1\n', text)
    rewrites = (
        (r'[^\p{L}\p{P}]+', '\n'),  # separators become a newline
        (r'(\p{P})', r'\n\1\n'),    # isolate each punctuation sign
        (r'\n+', '\n'),             # collapse repeated newlines
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text.split()

tokenize3(text)

['Tell',
 'me',
 ',',
 'O',
 'muse',
 ',',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy',
 '.']

A final one

In [8]:
def tokenize4(text):
    """Tokenize the text, isolating punctuation signs and symbols.

    Each symbol or punctuation character becomes a token of its own;
    the remaining text is split on whitespace.
    Returns the list of tokens.
    """
    # Surround every symbol or punctuation character with spaces
    padded = re.sub(r'([\p{S}\p{P}])', r' \1 ', text)
    # Replace runs of Unicode separators with a newline, then split on
    # any whitespace to obtain the tokens
    return re.sub(r'\p{Z}+', '\n', padded).split()

tokenize4(text)

['Tell',
 'me',
 ',',
 'O',
 'muse',
 ',',
 'of',
 'that',
 'ingenious',
 'hero',
 'who',
 'travelled',
 'far',
 'and',
 'wide',
 'after',
 'he',
 'had',
 'sacked',
 'the',
 'famous',
 'town',
 'of',
 'Troy',
 '.']

## Reading a corpus

In [9]:
file_name = '../../corpus/Selma.txt'
# Read the whole corpus into memory. The context manager closes the file
# handle, and the explicit encoding keeps the Swedish letters decoding the
# same way on every platform.
# NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm.
with open(file_name, encoding='utf-8') as corpus_file:
    text = corpus_file.read().strip()
text[:100]

'Nils Holgerssons underbara resa genom Sverige\nSelma Lagerlöf\n\nInnehåll\n\tDen kristna dagvisan - Sveri'

## Counting and sorting

We redefine the tokenizer

In [10]:
def tokenize(text):
    """Return the list of words in the text, a word being a run of letters."""
    return re.findall(r'\p{L}+', text)

A function to count the words

In [11]:
def count_unigrams(words):
    """Count the occurrences of each word.

    words: an iterable of hashable items (typically strings).
    Returns a dictionary mapping each distinct word to its count, with
    keys in order of first appearance.
    """
    counts = {}
    for token in words:
        # A word not seen yet starts from 0
        counts[token] = counts.get(token, 0) + 1
    return counts

We analyze Selma Lagerlöf's novels

In [12]:
# Lowercase the corpus before tokenizing so that case variants of a word
# are counted together
words = tokenize(text.lower())
frequency = count_unigrams(words)
# Rank the distinct words by decreasing count and print the top 15
ranked_words = sorted(frequency, key=lambda w: frequency[w], reverse=True)
for word in ranked_words[:15]:
    print(word, '\t', frequency[word])

och 	 37799
att 	 28914
han 	 22743
det 	 22087
i 	 17072
som 	 16790
hade 	 14955
på 	 14634
hon 	 14093
en 	 13921
inte 	 13826
var 	 12852
de 	 12599
den 	 11773
för 	 9811
