In [1]:
import torch
import torch.nn.functional as F 
import matplotlib.pyplot as pyplot
%matplotlib inline

In [94]:
words = open("words_alpha.txt", "r").read().splitlines()

words[1:10]

['aa', 'aaa', 'aah', 'aahed', 'aahing', 'aahs', 'aal', 'aalii', 'aaliis']

In [95]:
len(words)

370104

In [96]:
# lookup tables

characters = sorted(list(set("".join(words))))
stoi = {s : i + 1 for i, s in enumerate(characters)}
stoi["."] = 0
itos = {i : s for s, i in stoi.items()}

In [97]:
# building the dataset

block_size = 3 # how many chars used to predict the next character
X, y = [], []

for w in words:
    context = [0] * block_size # start characters in the start
    for ch in w + '.':
        ix = stoi[ch] 
        X.append(context)
        y.append(ix)
        context = context[1:] + [ix] # crop and append 

X = torch.tensor(X)
y = torch.tensor(y)

In [110]:
g = torch.Generator().manual_seed(2)

# encoding alphabets into 2-dim space
C = torch.randn((27, 2), generator = g)

# hidden layer 1
W1 = torch.randn((6, 100), generator = g)
b1 = torch.randn(100, generator = g)

# output layer
W2 = torch.randn((100, 27), generator = g)
b2 = torch.randn(27, generator = g)

parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad = True

In [109]:
for i in range(10000):
    # make minibatches for more efficiency
    mini_batch_size = 32
    ix = torch.randint(0, X.shape[0], (mini_batch_size, ))
    
    # forward pass
    emb = C[X[ix]] # indexing using X
    # emb = torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]]) # concatenating along dimension 1
    # emb = torch.cat(torch.unbind(emb, 1), 1) # concatenating along dimension 1 (dynamic)
    # emb = emb.view(emb.shape[0], 6) # concatenating along dimension 1 (dynamic) (efficient)
    h = torch.tanh((emb.view(emb.shape[0], 6) @ W1) + b1)
    logits = ((h @ W2) + b2)
    # counts = logits.exp()
    # probs = counts / counts.sum(1, keepdim = True)
    # loss = -probs[torch.arange(emb.shape[0]), y].log().mean()
    # can be done using cross_entropy
    loss = F.cross_entropy(logits, y[ix])

    print(loss)

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
        
    # update
    for p in parameters:
        p.data += -0.1 * p.grad

tensor(3.3893, grad_fn=<NllLossBackward0>)
tensor(3.2710, grad_fn=<NllLossBackward0>)
tensor(3.1979, grad_fn=<NllLossBackward0>)
tensor(3.7880, grad_fn=<NllLossBackward0>)
tensor(3.5598, grad_fn=<NllLossBackward0>)
tensor(3.9281, grad_fn=<NllLossBackward0>)
tensor(3.7758, grad_fn=<NllLossBackward0>)
tensor(2.7835, grad_fn=<NllLossBackward0>)
tensor(4.1528, grad_fn=<NllLossBackward0>)
tensor(3.8560, grad_fn=<NllLossBackward0>)
tensor(3.3033, grad_fn=<NllLossBackward0>)
tensor(2.9444, grad_fn=<NllLossBackward0>)
tensor(3.5751, grad_fn=<NllLossBackward0>)
tensor(3.7938, grad_fn=<NllLossBackward0>)
tensor(3.5360, grad_fn=<NllLossBackward0>)
tensor(3.5606, grad_fn=<NllLossBackward0>)
tensor(3.5198, grad_fn=<NllLossBackward0>)
tensor(3.2575, grad_fn=<NllLossBackward0>)
tensor(3.5835, grad_fn=<NllLossBackward0>)
tensor(4.8340, grad_fn=<NllLossBackward0>)
tensor(3.7279, grad_fn=<NllLossBackward0>)
tensor(3.6671, grad_fn=<NllLossBackward0>)
tensor(3.4862, grad_fn=<NllLossBackward0>)
tensor(3.49