# Counting words
Author: Pierre Nugues

# Imports

In [None]:
import math
import regex as re
import sys

## Tokenization

Tokenization has no unique solution. Let us explore some possible strategies.

First, let us take a text:

In [None]:
# Short sample passage used as input for the tokenizer examples.
text = """Tell me, O muse, of that ingenious hero who
travelled far and wide after he had sacked the famous
town of Troy."""

Using boundaries: A first tokenizer

In [None]:
def tokenize(text):
    """Break the text into words using the nonletters as boundaries.

    Every maximal run of characters that are not Unicode letters is
    treated as a separator.

    :param text: the string to tokenize
    :return: the list of words, in order of occurrence, without any
        empty string
    """
    # Alternative separator patterns, from the most ad hoc to the most general:
    # words = re.split(r'[\s\-,;:!?.’\'«»()–...&‘’“”*—]+', text)
    # words = re.split(r'[^a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’\-]+', text)
    # words = re.split(r'\W+', text)
    # Raw string: '\P' is not a valid Python escape sequence.
    words = re.split(r'\P{L}+', text)
    # split() produces an empty string at either end when the text starts
    # or ends with a separator. Filter them all out: list.remove('') would
    # delete only the first one and raise ValueError when there is none.
    return [word for word in words if word]

# Try the boundary-based tokenizer on the sample text.
tokenize(text)

Using content: A second one

In [None]:
def tokenize2(text):
    """Break the text into words using the content of the words.

    A word is a maximal run of Unicode letters; everything else is
    ignored.

    :param text: the string to tokenize
    :return: the list of words, in order of occurrence
    """
    # Alternative word patterns:
    # words = re.findall(r'[a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’\-]+', text)
    # words = re.findall(r'\w+', text)
    # Raw string: '\p' is not a valid Python escape sequence and a plain
    # literal triggers a warning on recent Python versions.
    words = re.findall(r'\p{L}+', text)
    return words

# Try the content-based tokenizer on the sample text.
tokenize2(text)

Using punctuation: A third one

In [None]:
def tokenize3(text):
    """Tokenize the text into words and punctuation signs.

    Characters that are neither letters nor punctuation act as
    separators, and each punctuation sign becomes a token of its own.

    :param text: the string to tokenize
    :return: the list of tokens, in order of occurrence
    """
    # Earlier variants of the first two substitutions with explicit character lists:
    # text = re.sub('[^a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’'()\-,.?!:;]+', '\n', text)
    # text = re.sub('([,.?!:;)('-])', r'\n\1\n', text)

    # 1. Replace every run of characters that are neither letters nor
    #    punctuation with a line break.
    letters_and_punctuation = re.sub(r'[^\p{L}\p{P}]+', '\n', text)
    # 2. Put each punctuation sign on a line of its own.
    isolated_punctuation = re.sub(r'(\p{P})', r'\n\1\n', letters_and_punctuation)
    # 3. Collapse the consecutive line breaks created by the previous steps.
    one_token_per_line = re.sub(r'\n+', '\n', isolated_punctuation)
    tokens = one_token_per_line.split()
    return tokens

# Try the punctuation-aware tokenizer on the sample text.
tokenize3(text)

A final one

In [None]:
def tokenize4(text):
    """Break the text into words, punctuation signs, and symbols.

    Each punctuation sign or symbol becomes a token of its own; the
    remaining tokens are the runs of characters between whitespace.

    :param text: the string to tokenize
    :return: the list of tokens, in order of occurrence
    """
    # Raw strings: '\p' is not a valid Python escape sequence and a plain
    # literal triggers a warning on recent Python versions.
    # Surround every symbol and punctuation sign with spaces.
    spaced_tokens = re.sub(r'([\p{S}\p{P}])', r' \1 ', text)
    # print(spaced_tokens)
    # Turn each run of Unicode separators into a single line break.
    one_token_per_line = re.sub(r'\p{Z}+', '\n', spaced_tokens)
    # print(one_token_per_line)
    # split() without argument breaks on any run of whitespace.
    tokens = one_token_per_line.split()
    return tokens

# Try the symbol- and punctuation-aware tokenizer on the sample text.
tokenize4(text)

## Reading a corpus

In [None]:
file_name = '../../corpus/Selma.txt'
# Read the whole corpus in memory. The context manager closes the file
# as soon as the text is read instead of leaving the handle open.
# NOTE(review): the corpus is assumed to be encoded in UTF-8; stating it
# explicitly avoids depending on the platform default -- confirm.
with open(file_name, encoding='utf-8') as corpus_file:
    text = corpus_file.read().strip()
# Show the first 100 characters as a sanity check.
text[:100]

## Counting and sorting

We redefine the tokenizer

In [None]:
def tokenize(text):
    """Return the words of the text, a word being a maximal run of
    Unicode letters.

    :param text: the string to tokenize
    :return: the list of words, in order of occurrence
    """
    # Raw string: '\p' is not a valid Python escape sequence and a plain
    # literal triggers a warning on recent Python versions.
    words = re.findall(r'\p{L}+', text)
    return words

A function to count the words

In [None]:
def count_unigrams(words):
    """Count the occurrences of each word.

    :param words: an iterable of words
    :return: a dictionary mapping each distinct word to the number of
        times it occurs in ``words``
    """
    counts = {}
    for token in words:
        # get() supplies 0 the first time a word is encountered.
        counts[token] = counts.get(token, 0) + 1
    return counts

We analyze Selma Lagerlöf's novels

In [None]:
# Lowercase the corpus before tokenizing so that the counts do not
# distinguish between uppercase and lowercase forms.
words = tokenize(text.lower())
frequency = count_unigrams(words)
# Rank the (word, count) pairs by decreasing count and print the first 15.
ranked = sorted(frequency.items(), key=lambda pair: pair[1], reverse=True)
for word, count in ranked[:15]:
    print(word, '\t', count)