# Why split file in subwords
Neural machine translation works by encoding each words as an integer, embedding it, and putting it through a recurrent neural network. 
The problem with this approach is that it is difficult to determine an embedding for infrequent words (like, retinoscopy).

Recently researchers discovered that splitting words in subwords helps with neural machine translation. 
Especially in languages that can concatenate words (Dutch, German, ..) splitting these concatenated words to more commonly occuring words helps to generalise translation. 
Another advantage is dealing with abbreviations (like CIA, or NSA). By splitting these down to their core letters a network will learn to translate every abbreviation. 



# Current state of the art

A commonly used script is that of Sennich et al. (https://github.com/rsennrich/subword-nmt). They use bite-pair encoding to reduce the amount of words in a text. 

# Simple splitter
As a nice evening exercise I wanted to implement a slightly different approach: the one of finding common prefixes in words and encoding a file using that. 

Here is a short writeup of my stuff. 

In [3]:
def get_subwords_in_word(word):
    subwords = []
    for subword_length in range(0,len(word)):
        for i in range(len(word)-subword_length):
            subwords.append(word[i:i+subword_length+1])
    return subwords

assert len(get_subwords_in_word("great")[0])==1
assert get_subwords_in_word("great")[-1]=='great'

print(get_subwords_in_word("great"))

['g', 'r', 'e', 'a', 't', 'gr', 're', 'ea', 'at', 'gre', 'rea', 'eat', 'grea', 'reat', 'great']


# Getting the frequency of subwords
Now that we know what subwords are in a word, let's get all subwords in a file and sort them by frequency.
Note that sometimes single letters (like X) don't occur that often. It is important to add each single-letter subword to prevent this problem. 



In [4]:
def get_subword_and_frequencies(words):
    subword_frequency = dict()
    for word in words:
        for subword in get_subwords_in_word(word):
            if subword not in subword_frequency:
                subword_frequency[subword]=0
            subword_frequency[subword]+=1

    subwords_and_frequency = list(subword_frequency.items())
    subwords_and_frequency = [(b,a) for a,b in subwords_and_frequency]
    return subwords_and_frequency

def get_n_subwords(words,first_n):
    subwords_and_frequency = get_subword_and_frequencies(words)
    subwords_and_frequency.sort(reverse=True)
    
    # initialise the subwords to all single-letter subwords
    subwords = [word for freq,word in subwords_and_frequency if len(word)==1]
    
    # add multi-length subwords till we have the first n 
    for freq,word in subwords_and_frequency:
        if len(word)>1:
            subwords.append(word)
        if len(subwords)==first_n:
            break
    
    return subwords

# old_subwords = get_n_subwords(old_words,300)
# old_subwords.sort()
# new_subwords = get_n_subwords(new_words,300)
# new_subwords.sort()
# print(old_subwords)
# print(new_subwords)
    

In [5]:
# Let's test what subwords we get for the book of Sherlock Holmes
# which can be downloaded here: http://norvig.com/big.txt

import re

def only_lowercase_words(text): return re.findall(r'\w+', text.lower())
def only_words(text): return re.findall(r'\w+', text)

_WORD_SPLIT = re.compile("([.,!?\"':;)( ])")
def basic_tokenizer(sentence):
  """Very basic tokenizer: split the sentence into a list of tokens."""
  words = []
  for space_separated_fragment in sentence.strip().split():
    words.extend(_WORD_SPLIT.split(space_separated_fragment))
  return [w for w in words if w]


def get_n_subwords_filename(filename,first_n):
    tokens_here = []
    with open(filename) as file_input:
        for line in file_input:
            tokens_here.extend(basic_tokenizer(line))
    words = tokens_here
    subwords_and_frequency = get_subword_and_frequencies(words)
    subwords_and_frequency.sort(reverse=True)
    
    # initialise the subwords to all single-letter subwords
    subwords = [word for freq,word in subwords_and_frequency if len(word)==1]
    
    # add multi-length subwords till we have the first n 
    for freq,word in subwords_and_frequency:
        if len(word)>1:
            subwords.append(word)
        if len(subwords)==first_n:
            break
    
    return subwords
def words(text):
    return basic_tokenizer(text)
    
# Let's see how fast we can read the data, and how fast we can subword it
# %timeit new_subwords = get_n_subwords(words(open('big.txt').read()),300)
new_subwords = get_n_subwords_filename('big.txt',300)
print(new_subwords)
%timeit get_n_subwords_filename('big.txt',300)


['e', 't', 'a', 'o', 'n', 'i', 's', 'r', 'h', 'd', 'l', 'c', 'u', 'm', 'f', 'w', 'g', 'p', 'y', ',', 'b', '.', 'v', 'k', '"', 'I', '-', 'T', 'A', "'", 'x', 'P', 'S', 'H', 'N', 'W', 'M', 'C', 'B', 'E', 'R', '1', '_', 'j', 'F', 'q', '!', 'O', '?', 'D', 'z', ';', 'G', '0', '2', 'L', '8', '3', '4', 'Y', '5', '9', '6', '7', ':', 'V', '=', ')', '(', 'K', 'U', 'J', '#', 'X', '*', ']', '[', '|', 'Q', 'Z', '/', '$', '+', '@', '&', '%', '>', '~', '^', '<', 'he', 'th', 'in', 'the', 'er', 'an', 're', 'on', 'nd', 'at', 'en', 'ed', 'es', 'is', 'and', 'or', 'ng', 'of', 'to', 'te', 'ha', 'it', 'ou', 'ti', 'ar', 'hi', 'as', 'st', 'se', 'nt', 'ing', 'le', 'al', 'me', 've', 'ne', 'ro', 'ri', 'ea', 'de', 'co', 'ce', 'io', 'll', 'ra', 'om', 'ic', 'ion', 'be', 'li', 'ho', 'ur', 'ch', 'la', 'si', 'wa', 'el', 'ut', 'ss', 'ma', 'us', 'ad', 'ly', 'no', 'ta', 'her', 'ent', 'tio', 'tion', 'ca', 'di', 'wh', 'fo', 'ot', 'ow', 'wi', 'pe', 'nc', 'lo', 'un', 'sh', 'il', 'hat', 'ns', 'ac', 'rs', 'ai', 'ie', 'im', 'ol',

As you can see there are several normal English words in there (with, the, here) which are common to the english language. 

Let's try to tokenize a sentence:

In [10]:
def get_subworded(word,subwords):
    if word=="":
        return []
    for i in range(len(word),-1,-1):
        try:
            if word[:i] in subwords:
                base = [word[:i]]
                base.extend(get_subworded(word[i:],subwords))
                return base
        except:
            print("nooo exception!")
            print(word)
SUBWORD_SEPERATOR = "@@ "
def subword_sentence(sentence,subwords):
    tokens = sentence.split()
    subworded_sentence = ""
    for word in tokens:
        subwords_here = get_subworded(word,subwords)
        first_subwords = subwords_here[:-1]
        last_subword = subwords_here[-1]
        for subw in first_subwords:
            subworded_sentence += subw + SUBWORD_SEPERATOR
        subworded_sentence += last_subword + " "
    return subworded_sentence

print(subword_sentence("there hello",new_subwords))

ther@@ e he@@ ll@@ o 


# Comparing split methods
Let's see how this split method compares to the one of Sennich et al. 

The video card I can use for this small experiment is a NVideo Quadro 4400 or something, so not that well suited for neural networks. This makes me run only 2 experiments: 1500 subwords and 3000 subwords. 

So the experiment setup is:

                1500 subwords.... 3000 subwords
                
No subwords

Sennich subwords

My subword method

The dataset I will use is the WMT 2014 dataset by default used by translate.py. The scores I will compare are the perplexity on the testset after 20.000 iterations. The network has seen 1.280.000 sentences at that moment


In [15]:
filename_input = '/home/roland/tensorflow/tensorflow/models/rnn/translate/newwmtdatadir/giga-fren.release2.fixed.fr'
n_subwords = 1500
subwords = get_n_subwords_filename(filename_input,n_subwords)


In [16]:
subwords.sort()
subwords.sort(key=len)
print(subwords)

['\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x0e', '\x0f', '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f', '\x80', '\x81', '\x82', '\x83', '\x84', '\x86', '\x87', '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '\xad', '®', '¯', '°', '±', '²', '³'

In [14]:
# http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=CAA36398F918BEFE49955BB13B4F7082?doi=10.1.1.117.1928&rep=rep1&type=pdf
# apparently according to this paper it would be better to focus on the END of the word instead... 
# ... would be nice to try!

In [None]:
filename_output = '/home/roland/tensorflow/tensorflow/models/rnn/translate/newwmtdatadir/giga-fren.release2.fixed.fr.subworded_1500'
with open(filename_input) as input_file, open(filename_output,'w') as output_file:
    line_now = 0
    for line in input_file:
        line_now +=1
        if line_now %250==1:
            print(str(line_now))
       # print(line)
        output_line = subword_sentence(line,subwords)
        output_file.write(output_line+"\n")

In [21]:
subwords.sort()
subwords.sort(key=len, reverse=False) # sorts by descending length

print(subwords)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ª', '²', '³', 'µ', '¹', 'º', '¼', '½', '¾', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'ā', 'ă', 'ć', 'Č', 'č', 'Ē', 'ě', 'ı', 'ň', 'ő', 'Œ', 'œ', 'Ş', 'ş', 'Š', 'š', 'ū', 'Ÿ', 'Ž', 'ž', 'ƒ', 'ʹ', 'ʺ', 'ʼ', 'ˆ', 'ˇ', 'Α', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η', 'Θ', 'Λ', 'Μ', 'Π', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ', 'Ψ', 'Ω', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ',

In [9]:
def subword_file_filenames(filename_input,filename_output,n_subwords):
    subwords = get_n_subwords(words(open(filename_input).read()),n_subwords)    
    with open(filename_input) as input_file, open(filename_output,'w') as output_file:
        for line in input_file:
            output_line = subword_sentence(line,subwords)
            output_file.write(output_line+"\n")

subword_file_filenames('/home/roland/tensorflow/tensorflow/models/rnn/translate/newwmtdatadir/giga-fren.release2.fixed.en','/home/roland/tensorflow/tensorflow/models/rnn/translate/newwmtdatadir/giga-fren.release2.fixed.en.subworded_1500',1500)
# Let's see how fast we can read the data, and how fast we can subword it
#subword_english_wmt = get_n_subwords(words(open('/home/roland/tensorflow/tensorflow/models/rnn/translate/newwmtdatadir/giga-fren.release2.fixed.en').read()),1500)
#print(subword_english_wmt)


TypeError: 'NoneType' object is not subscriptable

In [None]:
import re

_WORD_SPLIT = re.compile("([.,!?\"':;)( ])")

old_words = []
new_words = []

with open("clin2017/lexicon.txt") as lexicon_file:
    for two_words in lexicon_file:
        old, new = two_words.split()
        old_words.append(old)
        new_words.append(new)

In [None]:
def get_subworded(word,subwords):
    if word=="":
        return []
    for i in range(len(word),-1,-1):
        try:
            if word[:i] in subwords:

                base = [word[:i]]
                base.extend(get_subworded(word[i:],subwords))
                return base
        except:
            print(word)

def get_subworded_words(words,subwords):
    subworded = []
    for complete_word in words:
        subwords_here = get_subworded(complete_word,subwords)        
        subworded.append(subwords_here)
    return subworded
subworded = get_subworded_words(old_words,old_subwords)
a = [len(b) for b in subworded]
print(max(a))
        

In [None]:
def basic_tokenizer(sentence):
  """Very basic tokenizer: split the sentence into a list of tokens."""
  words = []
  for space_separated_fragment in sentence.strip().split():
    words.extend(_WORD_SPLIT.split(space_separated_fragment))
  return [w for w in words if w]


with open("clin2017/1637/bible.txt") as input:
    input_lines = [l for l in input]
    
    #input_words = [w for l in input_lines for w in re.split(_WORD_SPLIT,l) ]
    #with open("subwords_bible_1637.txt",'w') as output:
        
                


In [None]:
old_words = []
for l in input_lines:
    tokenized_sentence = basic_tokenizer(l)
    old_words.extend(tokenized_sentence)
old_subwords = get_n_subwords(old_words,300)
old_subwords.sort()
print(old_subwords)

    

In [None]:

def subword_file(filename_input,filename_output):
    to_output_lines = []
    with open(filename_input) as input_file:
        input_lines = [l for l in input_file]
    print(len(input_lines))
    old_words = []
    print("done with old words")
    for l in input_lines:
        tokenized_sentence = basic_tokenizer(l)
        old_words.extend(tokenized_sentence)
    subwords = get_n_subwords(old_words,300)
    subwords.sort()
    print('done subwords')
    for l in input_lines:
        tokenized_sentence = basic_tokenizer(l)
        this_line = ""

        for w in tokenized_sentence:
            tokenized_word_here = get_subworded(w,subwords)
            for token in tokenized_word_here[:-1]:
                this_line+=token+"@@ "
            this_line+=tokenized_word_here[-1]+ " "
        to_output_lines.append(this_line)
    print(' ready to output')
    for line in to_output_lines[:10]:
        print(line)
    with open(filename_output,'w' ) as output_file:
        for line in to_output_lines:
            
            output_file.write(line)
            output_file.write("\n")
subword_file("clin2017/1637/bible.txt","subworded_prefix_roland_bible_1637.txt")
subword_file("clin2017/1888/bible.txt","subworded_prefix_roland_bible_1888.txt")

In [None]:
print(input_words)