In [1]:
# An implementation of "A Neural Probabilistic Language Model" (Bengio et al., 2003).
# The original paper works at the word level; here we stick to the character level.
#  

In [42]:
import torch
import torch.nn.functional as F

In [2]:
# Read one name per line. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open until garbage collection.
with open('data/names.txt', 'r') as f:
    words = f.read().splitlines()

In [3]:
# Peek at the first ten entries of `words` to sanity-check the load.
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [9]:
# Character <-> integer lookup tables.
# Every distinct character found in `words` gets an index starting at 1
# (in sorted order); index 0 is reserved for the '.' boundary token.
chars = set(''.join(words))
stoi = dict(zip(sorted(chars), range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse mapping: integer index -> character.
itos = {index: char for char, index in stoi.items()}

In [12]:
# Display the index -> character mapping; '.' (index 0) is listed last
# because it was inserted into `stoi` after the other characters.
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [37]:
# Build a small training set from the first five names.
# Each example pairs a window of `block_size` previous character indices (X)
# with the index of the character that follows that window (Y).
block_size = 3  # context length: number of characters used to predict the next one
X, Y = [], []
for word in words[:5]:
    print(word)
    # The window starts as all index 0 ('.'), i.e. nothing seen yet.
    context = [0 for _ in range(block_size)]
    for ch in word + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        window_text = ''.join(itos[i] for i in context)
        print(window_text, "---->", itos[ix])
        # Slide the window: drop the oldest index and append the newest.
        # A new list is created, so the one already stored in X is untouched.
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [38]:
# Report the shape and dtype of the inputs and the shape of the targets.
print(X.shape,X.dtype, Y.shape)

torch.Size([32, 3]) torch.int64 torch.Size([32])


In [39]:
# Embedding table: one 2-dimensional vector for each of the 27 characters,
# drawn from a standard normal distribution.
C = torch.randn(27, 2)

In [None]:
# C[ix]
# F.one_hot(torch.tensor(ix), num_classes=27).float() @ C
# Both approaches give the same embedding:
# 1. Treat C as a lookup table and index into it: C[ix]
# 2. Treat C as the first layer of the network: F.one_hot(torch.tensor(ix), num_classes=27).float() @ C

In [56]:
# Index the embedding table C with the integer tensor X: every index in X
# is replaced by its row of C, so the result gains a trailing embedding axis.
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [68]:
# Hidden layer: the three 2-d embeddings of each context are flattened into
# one 6-vector (3*2) and passed through 100 tanh units.
W1 = torch.randn(6, 100)
b1 = torch.randn(100)
# Equivalent ways to flatten emb into an (N, 6) matrix:
#   emb.view(32, 6)            -- hard-codes the number of examples
#   emb.view(emb.shape[0], 6)  -- reads the number of examples from emb
#   emb.view(-1, 6)            -- lets torch infer it (used here)
# b1 has shape (100,) and is broadcast across the rows of the product.
pre_act = emb.view(-1, 6) @ W1 + b1
h = pre_act.tanh()
h.shape

torch.Size([32, 100])

In [None]:
# torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape
# torch.cat(torch.unbind(emb, 1), 1).shape
# Concatenation is inefficient because it allocates new tensors; it can't just reinterpret the existing storage the way view does --> view is efficient


In [70]:
# Output layer parameters: 100 hidden activations in, 27 scores out
# (one per character).
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

In [71]:
# Output-layer scores: one row of 27 unnormalized logits per example.
logits = h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [72]:
# Softmax over the class dimension, written out by hand.
# Subtracting each row's largest logit before exponentiating leaves the
# resulting probabilities mathematically unchanged, but keeps exp() from
# overflowing to inf (which would turn the division into nan) when a logit
# is large.
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob = counts/counts.sum(1, keepdim=True)

In [None]:
# Negative log-likelihood: for every example pick the probability assigned
# to its true next character (Y), then average the negative logs.
# The row indices are derived from Y instead of hard-coding 32, so the cell
# keeps working when the number of training examples changes.
loss = - prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(18.2299)

### Clean up 

In [102]:
# dataset
import torch
import torch.nn.functional as F

# Read one name per line. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open until garbage collection.
with open('data/names.txt', 'r') as f:
    words = f.read().splitlines()
# Character <-> integer lookup tables.
# Every distinct character found in `words` gets an index starting at 1
# (in sorted order); index 0 is reserved for the '.' boundary token.
chars = set(''.join(words))
stoi = dict(zip(sorted(chars), range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse mapping: integer index -> character.
itos = {index: char for char, index in stoi.items()}

# Build the training set from every name.
# Each example pairs a window of `block_size` previous character indices (X)
# with the index of the character that follows that window (Y).
block_size = 3  # context length: number of characters used to predict the next one
X, Y = [], []
for word in words:
    # The window starts as all index 0 ('.'), i.e. nothing seen yet.
    context = [0 for _ in range(block_size)]
    for ch in word + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # Slide the window: drop the oldest index and append the newest.
        # A new list is created, so the one already stored in X is untouched.
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

# Summary of the dataset: number of names, number of (context, target)
# examples (the first dimension of X), and the tensor shapes.
print(f"No of words: {len(words)}")
print(f"No of training examples: {X.shape[0]}")
print(f"Shape of X,  {X.shape}")
print(f"Shape of Y,  {Y.shape}")

No of words: 32033
No of training examples: 228146
Shape of X,  torch.Size([228146, 3])
Shape of Y,  torch.Size([228146])


In [103]:
# Initialise every trainable tensor from one seeded generator so the starting
# point is reproducible. The draw order (C, W1, b1, W2, b2) determines the
# values each tensor receives, so it must not be rearranged.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)        # embedding table: 27 characters -> 2-d vectors
W1 = torch.randn(3 * 2, 100, generator=g)  # hidden layer: 3 context chars * 2 dims in, 100 neurons out
b1 = torch.randn(100, generator=g)
W2 = torch.randn(100, 27, generator=g)     # output layer: 100 in, 27 scores out
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [104]:
# Count the scalar entries across all parameter tensors.
n_params = sum(p.numel() for p in parameters)
print(f"total no of parameters: {n_params}")

total no of parameters: 3481


In [105]:
# Turn on gradient tracking for every parameter tensor so that a backward
# pass can populate their .grad attributes.
for p in parameters:
    p.requires_grad_(True)

In [133]:
# Mini-batch SGD: 100 steps, each on a random batch of training examples.
batch_size = 32       # examples sampled per step
learning_rate = 0.1   # SGD step size
for _ in range(100):
    # Sample a mini-batch of example indices. Drawing from the seeded
    # generator `g` keeps the run reproducible after re-initialising.
    ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)

    # forward pass
    emb = C[X[ix]]  # (batch_size, 3, 2)
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)  # flatten each context to one row
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: in-place SGD step, excluded from autograd tracking
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

# Loss of the final mini-batch only, so it is a noisy estimate.
print(f"Loss: {loss.item()}")

Loss: 2.5621838569641113


In [134]:
# Loss over the entire training set (no mini-batching).
# This is evaluation only, so autograd tracking is switched off to avoid
# building a graph for the full-dataset forward pass.
with torch.no_grad():
    emb = C[X]  # (number of examples, 3, 2)
    h = torch.tanh(emb.view(-1,6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
print(f"Loss: {loss.item()}")

Loss: 2.7064108848571777
