In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read in all the words (one per line); the context manager closes the
# file handle as soon as the contents have been read
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# number of lines (words) read from names.txt
len(words)

32033

In [4]:
# vocabulary: every distinct character that occurs in any word, in sorted order;
# the integer mappings are derived from this list
chars = sorted({ch for w in words for ch in w})
chars

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [5]:
# special character appended to every word as its end marker; it is mapped to
# index 0, which is also the value used to pad the initial context
delimiter = '.'

In [6]:
# character -> integer: letters take 1..len(chars), index 0 is reserved for the delimiter
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi[delimiter] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [7]:
# integer -> character: the inverse of stoi, in the same insertion order
itos = dict(zip(stoi.values(), stoi.keys()))
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [8]:
# build the dataset
# block_size fixes the length of every context window stored in X
block_size = 3 # aka context length: how many characters do we take to predict the next one?

In [9]:
# X is the input to the neural network: one context window (a list of
# block_size character indices) per example
# Y are the labels: the index of the character that follows each window
X, Y = [], []

In [10]:
# the starting context for every word: block_size copies of index 0 (the '.' delimiter)
[0]*block_size

[0, 0, 0]

In [11]:
# Walk through the first five words and print every (context ---> next character) pair.
# X and Y are re-initialised here so that re-running this cell starts from empty
# lists instead of appending duplicate examples to whatever X and Y currently hold.
X, Y = [], []
for w in words[:5]:

    print(w)
    context = [0]*block_size  # start from an all-'.' context
    for ch in w + delimiter:
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix] # crop and append
    print('')

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .

olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .

ava
... ---> a
..a ---> v
.av ---> a
ava ---> .

isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .

sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .



In [12]:
# repeat the walk-through with a larger block_size, just to show the difference
# in what the context windows look like
block_size = 10
X, Y = [], []
for w in words[:5]:
    print(w)
    context = [0] * block_size
    for ch in w + delimiter:
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join([itos[i] for i in context]) + ' ---> ' + itos[ix])
        # drop the oldest index and push the newest one onto the end
        context = [*context[1:], ix]
    print('')

emma
.......... ---> e
.........e ---> m
........em ---> m
.......emm ---> a
......emma ---> .

olivia
.......... ---> o
.........o ---> l
........ol ---> i
.......oli ---> v
......oliv ---> i
.....olivi ---> a
....olivia ---> .

ava
.......... ---> a
.........a ---> v
........av ---> a
.......ava ---> .

isabella
.......... ---> i
.........i ---> s
........is ---> a
.......isa ---> b
......isab ---> e
.....isabe ---> l
....isabel ---> l
...isabell ---> a
..isabella ---> .

sophia
.......... ---> s
.........s ---> o
........so ---> p
.......sop ---> h
......soph ---> i
.....sophi ---> a
....sophia ---> .



In [13]:
# build X and Y over the whole word list: each word contributes one
# (context window, next character) example per character, plus one for the
# closing delimiter
block_size = 3
X, Y = [], []
for w in words:
    context = [0] * block_size
    for ch in w + delimiter:
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # slide the window forward by one character
        context = [*context[1:], ix]

In [14]:
# first five context windows (each a list of block_size character indices)
X[:5]

[[0, 0, 0], [0, 0, 5], [0, 5, 13], [5, 13, 13], [13, 13, 1]]

In [15]:
# the matching labels: index of the character that follows each of those windows
Y[:5]

[5, 13, 13, 1, 0]

In [16]:
# convert the Python lists to integer tensors; torch.as_tensor hands back an
# existing tensor unchanged, so re-running this cell is a no-op rather than a
# copy-construct warning from torch.tensor(<tensor>)
X = torch.as_tensor(X)
Y = torch.as_tensor(Y)

In [17]:
# sanity check: X has one row of block_size indices per example, Y one label per example
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

Now let's build the embedding lookup table C, which has one row for each of the 27 characters.

In [18]:
# we will use a 2 dimensional embedding for every one of the 27 characters
# (26 letters plus the '.' delimiter). A seeded generator makes the random
# initialisation repeatable across kernel restarts; values in outputs saved
# from an earlier unseeded run will differ.
g = torch.Generator().manual_seed(2147483647)
C = torch.rand((27, 2), generator=g)
C

tensor([[0.6680, 0.0588],
        [0.3202, 0.4285],
        [0.7693, 0.3721],
        [0.1076, 0.0988],
        [0.3423, 0.3139],
        [0.0417, 0.6157],
        [0.7270, 0.4423],
        [0.5763, 0.2202],
        [0.2090, 0.1185],
        [0.5960, 0.6598],
        [0.9786, 0.9839],
        [0.3047, 0.3107],
        [0.9500, 0.1429],
        [0.7184, 0.6512],
        [0.0538, 0.0610],
        [0.3708, 0.7827],
        [0.2297, 0.7037],
        [0.2417, 0.3192],
        [0.2914, 0.1133],
        [0.2321, 0.9429],
        [0.3807, 0.8386],
        [0.7142, 0.6058],
        [0.1093, 0.7004],
        [0.4850, 0.3159],
        [0.3795, 0.4560],
        [0.9520, 0.3830],
        [0.7886, 0.3862]])

In [19]:
# one-hot encode index 5 as a length-27 vector (a single 1 at position 5)
F.one_hot(torch.tensor(5), num_classes=27)

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

In [20]:
# this fails ... the one-hot vector is int64 while C is float32, so the matrix
# product raises. The error is the point of the demo, so catch and show it
# rather than letting it stop a Restart & Run All.
try:
    product = F.one_hot(torch.tensor(5), num_classes=27) @ C
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")
else:
    print(product)

RuntimeError: expected scalar type Long but found Float

In [21]:
# inspect the dtype of the one-hot vector (the left operand of the failed product)
F.one_hot(torch.tensor(5), num_classes=27).dtype

torch.int64

In [22]:
# ... and the dtype of the embedding table (the right operand)
C.dtype

torch.float32

In [23]:
# casting the one-hot vector to float makes both operands float32, so the product works
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([0.0417, 0.6157])

In [24]:
# the above operation pulls out row 5 of C;
# plain integer indexing returns that same row directly
C[5]

tensor([0.0417, 0.6157])

In [25]:
# indexing with a Python list of indices returns one row of C per index
C[[5, 6, 7]]

tensor([[0.0417, 0.6157],
        [0.7270, 0.4423],
        [0.5763, 0.2202]])

In [26]:
# indexing with an integer tensor selects the same rows as the list above
C[torch.tensor([5, 6, 7])]

tensor([[0.0417, 0.6157],
        [0.7270, 0.4423],
        [0.5763, 0.2202]])

In [27]:
# the first six rows of C (indices 0 through 5)
C[:6]

tensor([[0.6680, 0.0588],
        [0.3202, 0.4285],
        [0.7693, 0.3721],
        [0.1076, 0.0988],
        [0.3423, 0.3139],
        [0.0417, 0.6157]])

In [28]:
# rows for indices 0, 1, 5 and 13, i.e. the characters '.', 'a', 'e' and 'm'
C[[0, 1, 5, 13]]

tensor([[0.6680, 0.0588],
        [0.3202, 0.4285],
        [0.0417, 0.6157],
        [0.7184, 0.6512]])

In [29]:
# the first five context windows again, now as rows of the integer tensor X
X[:5]

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1]])

In [30]:
# so notice how we use X to index into C ...
# every integer in X is replaced by its 2-d row of C, adding a trailing dimension
C[X][:5]

tensor([[[0.6680, 0.0588],
         [0.6680, 0.0588],
         [0.6680, 0.0588]],

        [[0.6680, 0.0588],
         [0.6680, 0.0588],
         [0.0417, 0.6157]],

        [[0.6680, 0.0588],
         [0.0417, 0.6157],
         [0.7184, 0.6512]],

        [[0.0417, 0.6157],
         [0.7184, 0.6512],
         [0.7184, 0.6512]],

        [[0.7184, 0.6512],
         [0.7184, 0.6512],
         [0.3202, 0.4285]]])

In [31]:
# (number of examples, block_size, embedding size)
C[X].shape

torch.Size([228146, 3, 2])

In [32]:
# embed the whole dataset at once: emb[i, j] is the row of C for the
# j-th character index in context window i
emb = C[X]
emb.shape

torch.Size([228146, 3, 2])