# Chapter 13: Subword Segmentation
Discovery of Morphemes using Déjean's algorithm: https://aclanthology.org/W98-1239.pdf

Programs from the book: [_Python for Natural Language Processing_](https://link.springer.com/book/9783031575488)

__Author__: Pierre Nugues

## Modules

In [1]:
import os
import regex as re
from collections import Counter

## Reading a Corpus
Utility function to list the names of the files in a folder that end with a given suffix

In [2]:

def get_files(dir, suffix):
    """
    Lists the names of the entries of a folder that end with suffix
    :param dir: the path of the folder to list
    :param suffix: the ending a name must have to be kept
    :return: the list of matching names, without the folder path,
    in the order given by os.listdir
    """
    return [name for name in os.listdir(dir) if name.endswith(suffix)]

## Tokenizer
An elementary tokenizer

In [3]:
def tokenize(text):
    """
    Breaks the text into words, where a word is a maximal
    sequence of Unicode letters.
    :param text: the string to tokenize
    :return: the list of words as strings, in the order of the text
    """
    return re.findall(r'\p{L}+', text)

## Reading the Files
We read a corpus of novels by Dickens

In [4]:
PATH = '../datasets/dickens/'

In [5]:
files = get_files(PATH, 'txt')
files

['Hard Times.txt',
 'Oliver Twist.txt',
 'Great Expectations.txt',
 'The Old Curiosity Shop.txt',
 'A Tale of Two Cities.txt',
 'Dombey and Son.txt',
 'The Pickwick Papers.txt',
 'Bleak House.txt',
 'Our Mutual Friend.txt',
 'The Mystery of Edwin Drood.txt',
 'Nicholas Nickleby.txt',
 'David Copperfield.txt',
 'Little Dorrit.txt',
 'A Christmas Carol in Prose.txt']

We tokenize the texts

In [6]:
words = []
for file in files:
    # The context manager closes each file as soon as it is read.
    # The encoding is set explicitly so that the accented letters of the
    # corpus are decoded the same way on every platform
    # (assumes the files are UTF-8 encoded).
    with open(os.path.join(PATH, file), encoding='utf-8') as f:
        text = f.read().lower().strip()
    words.extend(tokenize(text))
words[:10]

['hard',
 'times',
 'and',
 'reprinted',
 'pieces',
 'by',
 'charles',
 'dickens',
 'with',
 'illustrations']

## Finding the Prefixed Morphemes

In [7]:
uniq_words = Counter(words)

In [8]:
uniq_words.most_common(10)

[('the', 157339),
 ('and', 116892),
 ('to', 89323),
 ('of', 84232),
 ('a', 73244),
 ('i', 67115),
 ('in', 55744),
 ('that', 46216),
 ('it', 45471),
 ('he', 41988)]

We extract the characters and we count them 

In [9]:
chars = list(''.join(words))
uniq_chars = Counter(chars)

In [10]:
uniq_chars

Counter({'e': 1704052,
         't': 1240514,
         'a': 1132119,
         'o': 1077682,
         'i': 997135,
         'n': 984100,
         'h': 887415,
         's': 878534,
         'r': 831589,
         'd': 647770,
         'l': 554244,
         'u': 400424,
         'm': 399958,
         'w': 339181,
         'c': 333347,
         'y': 305103,
         'f': 299386,
         'g': 297893,
         'p': 247241,
         'b': 216644,
         'k': 128233,
         'v': 127503,
         'x': 19707,
         'j': 18095,
         'q': 16253,
         'z': 4579,
         'ë': 3,
         'æ': 2,
         'é': 2,
         'ô': 1,
         'ê': 1,
         'œ': 1,
         'ö': 1})

In [11]:
len(uniq_chars)

33

## Morpheme Discovery

In [12]:
prefix_size = 4
cutoff = 100

In [13]:
prefixes = [word[:prefix_size]
            for word in uniq_words if len(word) > prefix_size]

In [14]:
wordlist = uniq_words.keys()

For a given prefix and a wordlist, we compute the distribution of the following character

In [15]:
def next_char_dist(prefix, wordlist):
    """
    Counts the characters that immediately follow prefix in the words
    :param prefix: the string the words must start with
    :param wordlist: the words to examine
    :return: a Counter mapping each following character to the number
    of words, strictly longer than prefix, in which it follows prefix
    """
    offset = len(prefix)
    return Counter(word[offset]
                   for word in wordlist
                   if len(word) > offset and word.startswith(prefix))

Déjean's first rule to determine if a prefix is a morpheme

In [16]:
def is_morpheme(prefix, wordlist):
    """
    Tests whether prefix is a morpheme from the characters that follow it
    :param prefix: the candidate prefix
    :param wordlist: the words used to compute the next-character
    distribution
    :return: True if more than half of the characters of uniq_chars occur
    after the prefix and the most frequent of them accounts for less than
    half of the occurrences, False otherwise
    """
    dist = next_char_dist(prefix, wordlist)
    total = sum(dist.values())
    # An empty distribution never passes this first test, so max() below
    # is only evaluated on a nonempty collection
    varied_enough = len(dist) > len(uniq_chars) / 2
    return varied_enough and max(dist.values()) < total / 2

In [17]:
is_morpheme('in', wordlist)

True

In [18]:
def prefix_count(prefix, wordlist):
    """
    Counts the words that start with prefix and are strictly longer than it
    :param prefix: the prefix to look for
    :param wordlist: the words to examine
    :return: the number of matching words
    """
    min_len = len(prefix) + 1
    return sum(1 for word in wordlist
               if len(word) >= min_len and word.startswith(prefix))

In [19]:
prefix_count('fore', wordlist)

61

A function to extract the prefixed morphemes of a given size. 

In [20]:
def extract_prefix_morphemes(wordlist, prefix_size):
    """
    Extracts the prefixes of a given size that are morphemes
    :param wordlist: the words to analyze. Only the words strictly longer
    than prefix_size yield a candidate prefix
    :param prefix_size: the number of characters of the candidate
    prefixes, expected to be nonnegative
    :return: a dictionary mapping each morpheme, written with a trailing
    hyphen, to the number of words that start with it
    """
    # Group the words by candidate prefix in a single pass. Each candidate
    # is then tested against the words that start with it only, instead of
    # rescanning the whole word list once per candidate.
    words_by_prefix = {}
    for word in wordlist:
        if len(word) > prefix_size:
            words_by_prefix.setdefault(word[:prefix_size], []).append(word)

    morphemes = {}
    for prefix, prefixed_words in words_by_prefix.items():
        # is_morpheme only looks at the words starting with the prefix,
        # so restricting the list to the group does not change its answer
        if is_morpheme(prefix, prefixed_words):
            # Every word of the group starts with the prefix and is longer
            # than it, so the group size is the prefix count
            morphemes[prefix + '-'] = len(prefixed_words)
    return morphemes

We now extract the prefixes and suffixes

In [21]:
def extract_morphemes(wordlist, prefix_size):
    """
    Extracts the prefix and suffix morphemes of a given size
    :param wordlist: the words to analyze
    :param prefix_size: the number of characters of the morphemes
    :return: a dictionary mapping the prefixes, written with a trailing
    hyphen, and the suffixes, written with a leading hyphen, to their
    counts
    """
    morphemes = extract_prefix_morphemes(wordlist, prefix_size)
    # A suffix is a prefix of the reversed word: we reverse the words,
    # reuse the prefix extraction, and reverse the resulting keys back
    reversed_words = [word[::-1] for word in wordlist]
    reversed_suffixes = extract_prefix_morphemes(reversed_words, prefix_size)
    morphemes.update({rev_key[::-1]: count
                      for rev_key, count in reversed_suffixes.items()})
    return morphemes

In [22]:
morphemes = extract_morphemes(wordlist, 4)

In [23]:
morphemes

{'fore-': 61,
 'over-': 140,
 '-ants': 50,
 '-ance': 128,
 '-ling': 346,
 '-less': 155,
 '-ings': 313,
 '-ated': 305,
 '-ates': 113,
 '-able': 355,
 '-ines': 41,
 '-ered': 228,
 '-ness': 448,
 '-ages': 49,
 '-ards': 49,
 '-ined': 101,
 '-ably': 87,
 '-ment': 201}

## Segmentation
We use Déjean's list of suffixed morphemes.

In [24]:
prefix_morphemes = ['over-', 'fore-', 're-']
suffix_morphemes = ['-e', '-s', '-ed', '-ing',
                    '-al', '-ation', '-ly', '-ic', '-ent']

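We create regexes for the prefixes and suffixes, where we reverse the suffixes so that they can be matched at the start of the reversed word. We sort the patterns by decreasing length so that the longest morphemes are tried first in the alternation.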

In [25]:
p_patterns = sorted(['^' + prefix[:-1] for prefix in prefix_morphemes],
                    key=lambda x: (-len(x), x))
p_patterns

['^fore', '^over', '^re']

In [26]:
s_patterns = sorted(['^' + suffix[1:][::-1] for suffix in suffix_morphemes],
                    key=lambda x: (-len(x), x))
s_patterns

['^noita', '^gni', '^tne', '^ci', '^de', '^la', '^yl', '^e', '^s']

In [27]:
p_regex = '|'.join(p_patterns)
p_regex

'^fore|^over|^re'

In [28]:
s_regex = '|'.join(s_patterns)
s_regex

'^noita|^gni|^tne|^ci|^de|^la|^yl|^e|^s'

In [29]:
def segment_morphemes(p_regex, s_regex, word):
    """
    Segments a word into an optional prefix, a stem, and an optional suffix
    :param p_regex: the regex matching a prefix at the start of the word
    :param s_regex: the regex matching a reversed suffix at the start of
    the reversed word
    :param word: the word to segment
    :return: the list of the nonempty parts in the order prefix, stem,
    suffix. The list is empty for an empty word
    :raises ValueError: if the matching does not yield one or two parts,
    for instance when the word contains no letter
    """
    if not word:
        return []

    prefix_rest = re.findall(p_regex + r'|\p{L}+', word)
    if len(prefix_rest) == 1:
        # A single match: no prefix, or the whole word is itself a prefix.
        # In both cases the word is kept whole
        prefix, rest = '', word
    elif len(prefix_rest) == 2:  # we have a prefix
        prefix, rest = prefix_rest
    else:
        # Failing here avoids continuing with unbound variables
        raise ValueError('Cannot split {!r} into a prefix and a rest: {}'
                         .format(word, prefix_rest))

    # The suffixes are matched as prefixes of the reversed string
    stem_suffix = re.findall(s_regex + r'|\p{L}+', rest[::-1])
    if len(stem_suffix) == 1:  # no suffix
        stem, suffix = rest, ''
    elif len(stem_suffix) == 2:  # we have a suffix
        suffix, stem = (part[::-1] for part in stem_suffix)
    else:
        raise ValueError('Cannot split {!r} into a stem and a suffix: {}'
                         .format(rest, stem_suffix))
    return [part for part in (prefix, stem, suffix) if part]

In [30]:
segment_morphemes(p_regex, s_regex, 'recelebration')

['re', 'celebr', 'ation']

In [31]:
for word in sorted(uniq_words)[:20]:
    print(segment_morphemes(p_regex, s_regex, word))

['a']
['aaron']
['aback']
['abaft']
['abandon']
['abandon', 'ed']
['abandon', 'ing']
['abandonm', 'ent']
['abandon', 's']
['abas', 'e']
['abas', 'ed']
['abasem', 'ent']
['abase', 's']
['abash']
['abash', 'ed']
['abas', 'ing']
['abat', 'e']
['abat', 'ed']
['abatem', 'ent']
['abate', 's']
