In [1]:
import math
import numpy

import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt
%matplotlib inline

# Input

In [8]:
# Read the names dataset (one name per line). The `with` block closes the
# file handle as soon as the contents are read instead of leaking it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
print(f'{len(words)}')
# peek at the first few names
words[:8]

32033


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

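### Lookup tables

Map each character to an integer index (`stoi_lookup`) and back (`itos_lookups`); index 0 is reserved for '.'.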

In [33]:
# vocabulary: the 26 lowercase letters 'a'..'z'
chars = [chr(code) for code in range(ord('a'), ord('z') + 1)]

# character -> index: letters take 1..26, index 0 is reserved for the '.' marker
stoi_lookup = dict(zip(chars, range(1, len(chars) + 1)))
stoi_lookup['.'] = 0

# index -> character: the inverse of stoi_lookup
itos_lookups = {index: symbol for symbol, index in stoi_lookup.items()}

### Dataset creation

-  both BOS and EOS are represented using '.'

- context: the preceding characters used to predict the next character
    - what is the context for the first letter? It consists only of '.' padding; the number of '.' equals the context length


- make sure to represent the EOS as well

In [79]:
block_size = 3  # context length: number of previous characters used per prediction

X, Y = [], []

for word in words[:3]:
    print('word: ', word)
    # every word starts from a context made entirely of '.' (index 0)
    context = [0 for _ in range(block_size)]
    seq = f'{word}.'  # the trailing '.' marks the end of the word

    for char in seq:
        X.append(context)
        y_i = stoi_lookup[char]  # index of the character to be predicted
        Y.append(y_i)

        context_str = ''.join(itos_lookups[c] for c in context)
        print(context_str, '--->', itos_lookups[y_i])

        # slide the window: drop the oldest index and append the newest one
        context = [*context[1:], y_i]

# convert the collected Python lists into tensors
X = torch.tensor(X)
Y = torch.tensor(Y)

word:  emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
word:  olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
word:  ava
... ---> a
..a ---> v
.av ---> a
ava ---> .


In [80]:
# Sanity check: shape and dtype of the input tensor X and the target tensor Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([16, 3]), torch.int64, torch.Size([16]), torch.int64)

In [81]:
# show every training example: the context indices and the index of the next character
for context_row, target in zip(X, Y):
    print("context: ", context_row, " next char: ", target)

context:  tensor([0, 0, 0])  next char:  tensor(5)
context:  tensor([0, 0, 5])  next char:  tensor(13)
context:  tensor([ 0,  5, 13])  next char:  tensor(13)
context:  tensor([ 5, 13, 13])  next char:  tensor(1)
context:  tensor([13, 13,  1])  next char:  tensor(0)
context:  tensor([0, 0, 0])  next char:  tensor(15)
context:  tensor([ 0,  0, 15])  next char:  tensor(12)
context:  tensor([ 0, 15, 12])  next char:  tensor(9)
context:  tensor([15, 12,  9])  next char:  tensor(22)
context:  tensor([12,  9, 22])  next char:  tensor(9)
context:  tensor([ 9, 22,  9])  next char:  tensor(1)
context:  tensor([22,  9,  1])  next char:  tensor(0)
context:  tensor([0, 0, 0])  next char:  tensor(1)
context:  tensor([0, 0, 1])  next char:  tensor(22)
context:  tensor([ 0,  1, 22])  next char:  tensor(1)
context:  tensor([ 1, 22,  1])  next char:  tensor(0)


### Embeddings lookup table

In [Bengio et al.](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf), 17k words were embedded into a 30-dimensional space.

We only have to embed 27 characters, so a small embedding size is enough; let's start with 2-dimensional embeddings for now.

In [82]:
# Embedding lookup table: 27 chars embedded into two dims randomly.
# A seeded generator makes the random initialisation identical on every
# Restart & Run All, so results can be compared between runs.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)
C

tensor([[ 0.4536, -1.8195],
        [-1.0301,  0.7013],
        [ 0.1819,  0.2704],
        [ 0.0020,  0.9944],
        [-1.0629, -0.5233],
        [-1.2688, -0.9681],
        [ 0.0637,  1.4289],
        [-0.0888,  0.1548],
        [ 1.6831,  0.7748],
        [ 0.0676,  0.2906],
        [ 0.2457,  0.2325],
        [-0.3523, -0.0395],
        [ 0.9890,  0.5769],
        [ 0.7821,  0.2329],
        [ 0.1841,  0.8453],
        [-0.2311,  0.3022],
        [-0.2406, -0.2248],
        [ 0.5129, -0.5599],
        [-0.4448, -0.2148],
        [-0.2106,  1.1540],
        [ 1.7741, -1.2536],
        [-0.0752, -0.8449],
        [ 1.0004, -0.3358],
        [ 0.3133,  0.4065],
        [-1.4924, -0.2883],
        [-0.5654, -0.1525],
        [-1.0318, -1.4119]])

now we have to embed the integers in the input X using the above lookup table C

In [84]:
# Index the table with the whole integer tensor X at once: each index in X
# is replaced by its row of C, giving one embedding vector per context position
embs = C[X]
embs.shape

torch.Size([16, 3, 2])