In [87]:
import torch
import torch.nn.functional as F

# consts
# Context length: how many preceding characters are fed in to predict the next one.
PREV_CHAR_COUNT = 3

## Character mapping

In [46]:
# Boundary marker: pads the start of every word and terminates it.
SPECIAL_CHAR = '.'
# Full vocabulary; a character's position in this string is its integer id,
# so the boundary marker gets id 0 and 'a'..'z' get 1..26.
CHAR_TO_ID_STR = SPECIAL_CHAR + 'abcdefghijklmnopqrstuvwxyz'

# id -> character (list indexed by id) and the inverse character -> id lookup.
id_to_char = list(CHAR_TO_ID_STR)
char_to_id = {symbol: index for index, symbol in enumerate(id_to_char)}

# Vocabulary size, including the boundary marker.
NUM_CHAR = len(id_to_char)

## Load the text file

In [3]:
# Read one name per line. The context manager closes the file handle as soon as
# the read finishes, and the explicit encoding keeps the result independent of
# the platform's default locale.
with open('data/names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()

In [5]:
# Number of names loaded (one entry per line of the file).
len(words)

32033

## Word to labeled pairs

In [25]:
def word_to_labeled_pairs(word: str, prev_char_count: int, char_to_id, special_char=None):
    """
    Turn one word into (context, next-character) training pairs.

    The word is padded with `prev_char_count` boundary characters on the left
    and one on the right; a window of `prev_char_count` characters then slides
    over it, and the character right after the window is the label.
    For "emma" with prev_char_count=3 the pairs are conceptually:
    ... -> e
    ..e -> m
    .em -> m
    emm -> a
    mma -> .

    Args:
        word: the word to encode. Every character must be a key of char_to_id.
        prev_char_count: number of preceding characters in each context (>= 0).
        char_to_id: mapping from character to integer id; must also contain
            the boundary character.
        special_char: boundary character used for padding and termination.
            When None (the default) the module-level SPECIAL_CHAR is used, so
            existing three-argument calls behave as before.

    Returns:
        (xs, ys) where xs is a list of len(word) + 1 contexts, each a list of
        prev_char_count ids, and ys is the list of len(word) + 1 label ids.

    Raises:
        ValueError: if prev_char_count is negative.
        KeyError: if a character (or the boundary character) is not in char_to_id.
    """
    if prev_char_count < 0:
        raise ValueError(f"prev_char_count must be >= 0, got {prev_char_count}")
    if special_char is None:
        # Fall back to the notebook-level marker to stay backward compatible.
        special_char = SPECIAL_CHAR

    padded = [special_char] * prev_char_count + list(word) + [special_char]
    # Map to ids once up front instead of re-looking up each character per window.
    ids = [char_to_id[ch] for ch in padded]

    # One window per real character plus one for the terminating marker.
    xs = [ids[start:start + prev_char_count] for start in range(len(word) + 1)]
    # The label of window `start` is the id at start + prev_char_count,
    # i.e. everything after the left padding.
    ys = ids[prev_char_count:]
    return xs, ys


In [26]:
# Sanity check: encode "emma" and print each context -> label pair decoded
# back to characters.
xs, ys = word_to_labeled_pairs("emma", PREV_CHAR_COUNT, char_to_id)
for context_ids, label_id in zip(xs, ys):
    print(f"{[id_to_char[x] for x in context_ids]} -> {id_to_char[label_id]}")

['.', '.', '.'] -> e
['.', '.', 'e'] -> m
['.', 'e', 'm'] -> m
['e', 'm', 'm'] -> a
['m', 'm', 'a'] -> .


## Mini training data - just first 5 words

In [38]:
# Encode each of the first five words, then flatten the per-word contexts and
# labels into one tensor of contexts and one tensor of labels.
pairs_per_word = [
    word_to_labeled_pairs(word, PREV_CHAR_COUNT, char_to_id) for word in words[:5]
]
mini_x = torch.tensor([context for contexts, _ in pairs_per_word for context in contexts])
mini_y = torch.tensor([label for _, labels in pairs_per_word for label in labels])

In [39]:
# Inspect shape and dtype of the stacked contexts (one row per example) and of the label vector.
mini_x.shape, mini_x.dtype, mini_y.shape, mini_y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

## Model
- Input: (N, 3) = (number of examples, number of context characters)
- Embedding of size 2 per character, concatenated: (N, 6)
- Fully connected (FCC) layer: tanh(x W1 + b1): (N, 100)
- Logits: h W2 + b2, where h is the FCC output: (N, 27)
- The model is complete here!
- For the loss, we apply cross-entropy between the logits and the Y's

In [70]:
# consts
# Number of training examples in the mini dataset.
N = mini_y.shape[0]
# Dimensions of each character's embedding vector.
EMBED_SIZE = 2
# Width of one flattened context: every context character contributes EMBED_SIZE values.
EMBED_CONCAT_VEC_SIZE = PREV_CHAR_COUNT * EMBED_SIZE
# Units in the hidden (tanh) layer.
NUM_HIDDEN_NEURON = 100

In [92]:
# params

# Fixed seed so the random initialisation is the same on every run.
g = torch.Generator().manual_seed(2147483647)
# Embedding table: one EMBED_SIZE-dimensional row per character id.
# Uses the EMBED_SIZE constant; the lowercase `embed_size` is not defined in
# this notebook and raises NameError on a fresh kernel.
embed = torch.randn((NUM_CHAR, EMBED_SIZE), generator=g)
# Hidden layer: flattened context -> NUM_HIDDEN_NEURON units.
w1 = torch.randn((EMBED_CONCAT_VEC_SIZE, NUM_HIDDEN_NEURON), generator=g)
b1 = torch.randn(NUM_HIDDEN_NEURON, generator=g)
# Output layer: hidden units -> one logit per character.
w2 = torch.randn((NUM_HIDDEN_NEURON, NUM_CHAR), generator=g)
b2 = torch.randn(NUM_CHAR, generator=g)
params = [embed, w1, b1, w2, b2]

In [83]:
# model

# [LAYER] Embedding.
# Indexing the (27,2) table with the (32,3) id matrix yields one 2-d vector
# per context character: (32,3,2).
lookup_shape = (N, PREV_CHAR_COUNT, EMBED_SIZE)
x_looked_up = embed[mini_x]
assert x_looked_up.shape == lookup_shape

# Flatten each example's context vectors into one row so it can feed the
# matrix multiply in the next layer.
flat_shape = (N, EMBED_CONCAT_VEC_SIZE)
x_looked_up_shaped = x_looked_up.view(flat_shape)
assert x_looked_up_shaped.shape == flat_shape

In [84]:
# [LAYER] FCC
# Affine transform of the flattened embeddings followed by a tanh non-linearity.
fcc = torch.tanh(x_looked_up_shaped @ w1 + b1)
assert fcc.shape == (N, NUM_HIDDEN_NEURON)

In [85]:
# [LAYER] logit
# Project the hidden activations to one unnormalised score per character.
logits = torch.matmul(fcc, w2) + b2

In [88]:
# [Loss layer]
# Cross-entropy between the logits and the target character ids.
loss = F.cross_entropy(input=logits, target=mini_y)

In [89]:
# Display the scalar loss computed in the previous cell.
loss

tensor(12.4674)