In [36]:
import torch

# consts
# Context length: how many preceding characters are used as input when predicting the next one.
PREV_CHAR_COUNT = 3

## Character mapping

In [37]:
# '.' is the padding marker placed before the first and after the last character of a word.
SPECIAL_CHAR = '.'
# Position in this string is the character's integer id (the marker gets id 0).
CHAR_TO_ID_STR = SPECIAL_CHAR + 'abcdefghijklmnopqrstuvwxyz'

char_to_id = {}
id_to_char = []

# Build both directions of the mapping in a single pass.
for i, c in enumerate(CHAR_TO_ID_STR):
    char_to_id[c] = i
    id_to_char.append(c)


## Load the text file

In [3]:
# Read one word per line. The context manager closes the file handle as soon as
# the read finishes instead of leaving it open until garbage collection.
with open('data/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [5]:
# Sanity check: how many lines (words) were loaded from the file.
len(words)

32033

## Word to labeled pairs

In [25]:
def word_to_labeled_pairs(word: str, prev_char_count: int, char_to_id):
    """
    Turn one word into (context, next-character) training examples.

    The word is padded with `prev_char_count` leading SPECIAL_CHAR markers and one
    trailing SPECIAL_CHAR, then a window of `prev_char_count` characters is slid
    across it. For "emma" with prev_char_count=3 the examples are conceptually:
    ... -> e
    ..e -> m
    .em -> m
    emm -> a
    mma -> .

    Args:
        word: the word to expand, e.g. "emma".
        prev_char_count: number of preceding characters in each context; must be >= 0.
        char_to_id: mapping from a single character to its integer id. It has to
            cover SPECIAL_CHAR and every character of `word`.

    Returns:
        (xs, ys): two lists of length len(word) + 1. Each xs[i] is a list of
        `prev_char_count` ids, and ys[i] is the id of the character that follows
        that context.

    Raises:
        ValueError: if prev_char_count is negative.
        KeyError: if char_to_id is a dict and a needed character is missing from it.
    """
    if prev_char_count < 0:
        # A negative count would otherwise yield empty contexts and labels read
        # through negative indexing, i.e. silently wrong training data.
        raise ValueError(f"prev_char_count must be non-negative, got {prev_char_count}")

    padded_chars = [SPECIAL_CHAR] * prev_char_count + list(word) + [SPECIAL_CHAR]
    # Map to ids once up front rather than re-looking-up each character per window.
    padded_ids = [char_to_id[ch] for ch in padded_chars]

    xs = []
    ys = []
    for start in range(len(word) + 1):
        target = start + prev_char_count
        xs.append(padded_ids[start:target])  # slicing gives each example its own list
        ys.append(padded_ids[target])
    return xs, ys


In [26]:
# Show the examples generated for "emma" in readable (character) form.
xs, ys = word_to_labeled_pairs("emma", PREV_CHAR_COUNT, char_to_id)
for i, target_id in enumerate(ys):
    context_chars = [id_to_char[x] for x in xs[i]]
    target_char = id_to_char[target_id]
    print(f"{context_chars} -> {target_char}")

['.', '.', '.'] -> e
['.', '.', 'e'] -> m
['.', 'e', 'm'] -> m
['e', 'm', 'm'] -> a
['m', 'm', 'a'] -> .


## Mini training data - just the first 5 words

In [38]:
# Accumulate the examples of the first five words into flat Python lists.
mini_x, mini_y = [], []
for word in words[:5]:
    xs, ys = word_to_labeled_pairs(word, PREV_CHAR_COUNT, char_to_id)
    mini_x += xs
    mini_y += ys
# Convert the accumulated lists into tensors.
mini_x, mini_y = torch.tensor(mini_x), torch.tensor(mini_y)

In [39]:
# Inspect the shapes and dtypes of the mini dataset tensors.
mini_x.shape, mini_x.dtype, mini_y.shape, mini_y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)