In [279]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.nn as nn
import matplotlib.pyplot as plt

In [280]:
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [281]:
len(words)

32033

In [282]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [283]:
context_window = 3
X, Y = [], []

for w in words:
    context = [0] * context_window
#     print(w)
    for i in w + '.':
        ix = stoi[i]
        X.append(context)
        Y.append(ix)
#         print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix]
        
X = torch.tensor(X)      # This is the input part which has all the trigrams i.e. 196113 in a matrix of 196113 x 3
Y = torch.tensor(Y)      # This contains the right index for the predicting char

In [284]:
X.size(), Y.size()

(torch.Size([228146, 3]), torch.Size([228146]))

In [285]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [286]:
# <----- DATASET LAOADED ----->
# Time to start with with embeddings and then squeeze the dimension
# Embeddings ---> Representation of inputs, just like GPT uses tokenization

In [287]:
# First embedding then one hot encoding

In [288]:
vocab_size = len(stoi)  # Size of the vocabulary
embedding_dim = 30  # Desired dimensionality of embeddings

embedding_input = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

embedding_output = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

In [289]:
embedded_X = embedding_input(X)
embedded_Y = embedding_output(Y) 

In [290]:
embedded_X.size(), embedded_Y.size(), embedded_X.dtype, embedded_Y.dtype
# 196113 - samples, 3 - context-window, 30 - each word represented by vector length 30

(torch.Size([228146, 3, 30]),
 torch.Size([228146, 30]),
 torch.float32,
 torch.float32)

In [291]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 10), generator=g)
C[X].shape
# W1 = torch.randn((30, 200), generator=g)
# b1 = torch.randn(200, generator=g)
# W2 = torch.randn((200, 27), generator=g)
# b2 = torch.randn(27, generator=g)

# parameters = [embedded_X, W1, b1, W2, b2]

torch.Size([228146, 3, 10])

In [292]:
sum(p.nelement() for p in parameters) # number of parameters in total

12297

In [293]:
h = torch.tanh(embedded_X @ W1 + b1)

In [294]:
h

tensor([[[ 0.9009, -0.8919,  0.9173,  ...,  0.6913, -0.5914, -0.9978],
         [ 0.9009, -0.8919,  0.9173,  ...,  0.6913, -0.5914, -0.9978],
         [ 0.9009, -0.8919,  0.9173,  ...,  0.6913, -0.5914, -0.9978]],

        [[ 0.9009, -0.8919,  0.9173,  ...,  0.6913, -0.5914, -0.9978],
         [ 0.9009, -0.8919,  0.9173,  ...,  0.6913, -0.5914, -0.9978],
         [-0.2020, -0.9702,  0.9517,  ..., -0.8925,  0.7936, -0.9790]],

        [[ 0.9009, -0.8919,  0.9173,  ...,  0.6913, -0.5914, -0.9978],
         [-0.2020, -0.9702,  0.9517,  ..., -0.8925,  0.7936, -0.9790],
         [-0.8572, -0.4937, -0.6267,  ...,  0.9709, -0.2637, -0.9861]],

        ...,

        [[ 0.9012, -0.9995,  0.9995,  ..., -0.7387, -0.8740,  0.7143],
         [ 0.9012, -0.9995,  0.9995,  ..., -0.7387, -0.8740,  0.7143],
         [-0.1676, -0.9973, -0.8007,  ..., -0.1375, -0.9935, -0.9861]],

        [[ 0.9012, -0.9995,  0.9995,  ..., -0.7387, -0.8740,  0.7143],
         [-0.1676, -0.9973, -0.8007,  ..., -0.1375, -0.

In [295]:
logits = (h @ W2 + b2)

In [296]:
counts = logits.exp()

In [297]:
probs = counts/counts.sum(1, keepdims=True)

In [298]:
probs.shape

torch.Size([228146, 3, 27])

In [299]:
# < ----------------------------- >

In [300]:
C = torch.randn(27,2)

In [301]:
C[X]

tensor([[[-0.0146,  1.5035],
         [-0.0146,  1.5035],
         [-0.0146,  1.5035]],

        [[-0.0146,  1.5035],
         [-0.0146,  1.5035],
         [-0.2987, -0.2897]],

        [[-0.0146,  1.5035],
         [-0.2987, -0.2897],
         [ 0.9044, -0.3588]],

        ...,

        [[-2.0746, -1.1431],
         [-2.0746, -1.1431],
         [-0.1057,  0.6533]],

        [[-2.0746, -1.1431],
         [-0.1057,  0.6533],
         [-2.0746, -1.1431]],

        [[-0.1057,  0.6533],
         [-2.0746, -1.1431],
         [-2.1319, -0.4559]]])

In [302]:
C[X].shape 
# (27, 2) (32, 3) ---> (32, 3, 2)

torch.Size([228146, 3, 2])

In [303]:
emb = C[X]
emb.shape
# 2 dimensional embeddings times 3 
# 32 tells about the input

torch.Size([228146, 3, 2])

In [304]:
w1 = torch.randn(6,100) # 3 x 2 and 100 weights
b1 = torch.randn(100)

In [305]:
h = torch.tanh(emb.view(-1, 6) @ w1 + b1)

In [306]:
h.shape

torch.Size([228146, 100])

In [307]:
w2 = torch.randn(100, 27) 
b2 = torch.randn(27)

In [308]:
logits = h @ w2 + b2
logits

tensor([[  0.6074,  -0.1410, -15.8221,  ...,   7.8369,   8.2178,  19.8254],
        [  5.1217,  -1.5179, -17.3139,  ...,  12.4496,  -5.6768,   6.2074],
        [  4.1714, -12.5883,  -8.2435,  ...,   5.6607,   4.0641,  -1.5309],
        ...,
        [-12.9618,   6.5216,   9.8955,  ...,  -7.7914,  10.3403,   0.2727],
        [-11.1920,   2.1367,   4.7510,  ...,  -5.3252,   8.4317,   2.7780],
        [-10.5078,   1.1333,  13.4914,  ...,   0.1007,  -1.6342,   5.2580]])

In [309]:
counts = logits.exp()

In [310]:
prob = counts / counts.sum(1, keepdims=True)

In [311]:
prob.shape

torch.Size([228146, 27])

In [312]:
Y

tensor([ 5, 13, 13,  ..., 26, 24,  0])

In [313]:
loss = -prob[torch.arange(32), Y].log().mean()

IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [32], [228146]

In [None]:
# --------------------------------------------------------------------------------------------------------------- #

In [332]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [316]:
max_steps = 100000
batch_size = 32
n_hidden = 200
dim = 10
vocab = 27

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((vocab, dim), generator=g)
W1 = torch.randn((30, n_hidden), generator=g) * (5/3) / (3 * 10)**0.5 # * 0.1
# b1 = torch.randn(n_hidden, generator=g) * 0.01
W2 = torch.randn((n_hidden, vocab), generator=g) * 0.01
b2 = torch.randn(vocab, generator=g) * 0

bngain = torch.ones(1, n_hidden)
bnbias = torch.zeros(1, n_hidden) 
bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, b1, W2, b2, bngain, bnbias]

In [317]:
sum(p.nelement() for p in parameters) 

12197

In [318]:
for p in parameters:
    p.requires_grad = True

In [319]:
# If you intialize forward pass properly backward pass is initialized on its own to some extent

In [320]:
# std = gain / root fan_mode 

In [330]:
losses = []

for i in range(max_steps):
    
    # minibatch construct
    ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)
    Xb, Yb = X[ix], Y[ix] # Batch of X & Y
    
    # Forward pass
    emb = C[X[ix]] # Embedding the chars
    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
    
#     Linear layer
    hpreact = emb.view(-1, 30) @ W1 # + b1
    
#     Normalizing
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias    # mean / std dev.
    
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
#     firing rates will be unit gaussian
    h = torch.tanh(hpreact)    
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])
#     print(loss.item())
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        if p.grad == None:
            continue
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    losses.append(loss.log10().item())

      0/ 100000: 2.5516
  10000/ 100000: 2.2808
  20000/ 100000: 2.1282
  30000/ 100000: 2.3596
  40000/ 100000: 2.4811
  50000/ 100000: 1.6642
  60000/ 100000: 2.0561
  70000/ 100000: 2.5376
  80000/ 100000: 2.0498
  90000/ 100000: 2.5743


In [None]:
plt.plot(losses)

In [None]:
k = torch.randn(27,1)
x = torch.randn(200, 30)


In [None]:
plt.hist(h.view(-1).tolist())
# Need to normalize this shit XD

# As majority of the neurons fire in the extremes tanh being a squashing fn
# leading the neurons to be dead

In [331]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)
block_size = 3
for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      emb = C[torch.tensor([context])] # (1,block_size,d)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:
        break
        
    print(''.join(itos[i] for i in out))

RuntimeError: The size of tensor a (200) must match the size of tensor b (100) at non-singleton dimension 1