In [33]:
from typing import List, Tuple
import random

In [27]:
def load_data(path: str) -> Tuple[List[str], List[str]]:
    """
    Load a newline-separated word list and build its character vocabulary.

    Every line is stripped of surrounding whitespace and blank lines are
    dropped. The vocabulary is the sorted set of characters that occur in
    the remaining words, with the special '<eos>' token placed at index 0.
    A short summary of the dataset is printed to stdout.

    Args:
        path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A ``(words, vocab)`` tuple: the cleaned words in file order and the
        vocabulary list.

    Raises:
        ValueError: If the file contains no non-blank lines.
    """
    # Decode explicitly as UTF-8: the default encoding of open() depends on
    # the platform locale, so the same file could yield different characters
    # (and therefore a different vocabulary) on different machines.
    with open(path, 'r', encoding='utf-8') as f:
        data = f.read()

    words = [line.strip() for line in data.splitlines()]  # Remove leading/trailing whitespace
    words = [word for word in words if word]  # Remove empty strings

    # Fail with a clear message instead of the opaque "max() arg is an empty
    # sequence" that the statistics below would otherwise raise.
    if not words:
        raise ValueError(f"no words found in {path!r}")

    vocab = ['<eos>'] + sorted(set(''.join(words)))
    lengths = [len(word) for word in words]

    print(f"number of examples in dataset: {len(words)}")
    print(f"max word length: {max(lengths)}")
    print(f"min word length: {min(lengths)}")
    print(f"unique characters in dataset: {len(vocab)}")
    print("vocabulary:")
    print(' '.join(vocab))
    print('example for a word:')
    print(words[0])
    return words, vocab

# Load the dataset; 'names.txt' is resolved relative to the working directory.
# load_data also prints summary statistics as a side effect.
words, vocab = load_data('names.txt')

number of examples in dataset: 32033
max word length: 15
min word length: 2
unique characters in dataset: 27
vocabulary:
<eos> a b c d e f g h i j k l m n o p q r s t u v w x y z
example for a word:
emma


In [30]:
def encode(word: str, vocab: List[str]) -> List[int]:
    """
    Encode a word as vocabulary indices, framed by <eos> on both sides.

    Args:
        word: The string to encode, one vocabulary entry per character.
        vocab: The vocabulary; must contain '<eos>' and every character
            of ``word``.

    Returns:
        ``[eos] + [index of each character] + [eos]``.

    Raises:
        ValueError: If '<eos>' or any character of ``word`` is missing
            from ``vocab``.
    """
    # Build the token -> index table once instead of scanning the list with
    # vocab.index() for every character. setdefault keeps the first position
    # of a repeated entry, which is what list.index() returns.
    lookup = {}
    for position, token in enumerate(vocab):
        lookup.setdefault(token, position)

    if '<eos>' not in lookup:
        raise ValueError("'<eos>' is not in vocab")
    eos = lookup['<eos>']

    try:
        body = [lookup[char] for char in word]
    except KeyError as err:
        # Keep the ValueError type that list.index() raised, but name the
        # offending character and the word it came from.
        raise ValueError(
            f"character {err.args[0]!r} in word {word!r} is not in vocab"
        ) from None

    return [eos] + body + [eos]

def decode(indices: List[int], vocab: List[str]) -> str:
    """
    Turn a sequence of vocabulary indices back into text.

    Each index is replaced by the vocabulary entry at that position and the
    entries are concatenated in order.
    """
    pieces = []
    for position in indices:
        pieces.append(vocab[position])
    return ''.join(pieces)

# Round-trip check on the first five words: show each word, its encoding,
# and the text obtained by decoding that encoding again.
for i in range(5):
    print(f"word: {words[i]}")
    print(f"encoded: {encode(words[i], vocab)}")
    # The extra newline in `end` produces the blank separator line.
    print(f"decoded: {decode(encode(words[i], vocab), vocab)}", end="\n\n")

word: emma
encoded: [0, 5, 13, 13, 1, 0]
decoded: <eos>emma<eos>

word: olivia
encoded: [0, 15, 12, 9, 22, 9, 1, 0]
decoded: <eos>olivia<eos>

word: ava
encoded: [0, 1, 22, 1, 0]
decoded: <eos>ava<eos>

word: isabella
encoded: [0, 9, 19, 1, 2, 5, 12, 12, 1, 0]
decoded: <eos>isabella<eos>

word: sophia
encoded: [0, 19, 15, 16, 8, 9, 1, 0]
decoded: <eos>sophia<eos>



In [32]:
# Encode every word in the dataset, then inspect the first entry and the sizes.
encoded_words = list(map(lambda w: encode(w, vocab), words))
print(encoded_words[0])
print(decode(encoded_words[0], vocab))
# Number of encoded words, then the length of the first encoding.
print(len(encoded_words), len(encoded_words[0]), sep="\n")

[0, 5, 13, 13, 1, 0]
<eos>emma<eos>
32033
6
