In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
words=open('names.txt','r').read().splitlines()

In [3]:
#build voabulary of character and mapping to/from integrers
chars=sorted(list(set(''.join(words))))
stoi={s:i+1 for i,s in enumerate(chars)}
stoi['.']=0
itos={i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [4]:
#build the datset

def build_dataset(words):
    block_size=3 #context lenght: how many inputs/characters we take to predict next char
    X,Y=[],[] #X are the inputs to NN, Y is the label for each input inside X

    for w in words: #gives the name/word ,first 5 so in loop it is emma
#         print(w)
        context=block_size*[0]   #1st loop : [0,0,0]
        for ch in w + '.':       #adds . to the name and gives the  letter here e
            ix=stoi[ch]          #gives corresponding int value of e and stores in ix
            X.append(context)    #adds [0,0,0] to X,list inside list
            Y.append(ix)         #add ix to Y, here for e
#             print(''.join(itos[i] for i in context),'--->',itos[ix])
            context=context[1:]+ [ix] #crop and append: before starting for m (next letter in emma), removes first 0 from context and adds the value of e as third elemnt in context 

    X=torch.tensor(X)
    Y=torch.tensor(Y)
    print(X.shape,Y.shape)
    return X,Y


import random
random.seed(42)
random.shuffle(words)
n1=int(0.8*len(words))
n2=int(0.9*len(words))

Xtr,Ytr=build_dataset(words[:n1])
Xdev, Ydev=build_dataset(words[n1:n2])
Xte,Yte=build_dataset(words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [5]:
C=torch.randn((27,2))


In [6]:
Xtr.shape,Xtr.dtype,Ytr.shape,Ytr.dtype

(torch.Size([182625, 3]), torch.int64, torch.Size([182625]), torch.int64)

In [22]:
g=torch.Generator().manual_seed(2147483647) #reproducible
C=torch.randn((27,2), generator=g)
W1=torch.randn((6,100), generator=g)
b1=torch.randn(100, generator=g)
W2=torch.randn((100,27), generator=g)
b2=torch.randn(27, generator=g)
parameters=[C,W1,b1,W2,b2]

In [23]:
sum(p.nelement() for p in parameters)

3481

In [24]:
for p in parameters:
    p.requires_grad=True

In [25]:
#finalising LR
#reset parameters

lri=[]
lossi=[]


for i in range(30000): 
    
    #minibatch construct
    ix=torch.randint(0,Xtr.shape[0],(32,))
    
    #forward pass
    emb=C[Xtr[ix]] #(32,3,2)
    h=torch.tanh(emb.view(-1,6) @ W1 +b1) #32x100
    logits=h @ W2 +b2 #32x27
    loss=F.cross_entropy(logits,Ytr[ix])
    
    #backward pass
    for p in parameters:
        p.grad=None            #set gradients to 0

    loss.backward()

    #upgarde 
#     lr=lrs[i]              # we start with very less lr and go go upto -1
    lr=0.1
    for p in parameters:
        p.data+= -lr*p.grad 
        
        #track stats
#         lri.append(lre[i])
#         lossi.append(loss.item())
        
        
print(loss.item())
# this is loss for this particular minibatch

2.257667303085327


In [26]:
#eval loss for All of Xtr and Ytr

emb=C[Xtr] #(32,3,2)
h=torch.tanh(emb.view(-1,6) @ W1 +b1) #32x100
logits=h @ W2 +b2 #32x27
loss=F.cross_entropy(logits,Ytr)
loss.item()

2.3906140327453613

In [28]:
emb=C[Xdev] #(32,3,2)
h=torch.tanh(emb.view(-1,6) @ W1 +b1) #32x100
logits=h @ W2 +b2 #32x27
loss=F.cross_entropy(logits,Ydev)
loss.item()

2.3905692100524902

In [29]:
#since train and dev losses are almost equal implies we r not over fitting rather underfitting, this is bcz our model is not 
# powerful enough to learn all input and out bcz bcz of very few parameters, lets change that now

In [6]:
#increase the size of nn
g=torch.Generator().manual_seed(2147483647) #reproducible
C=torch.randn((27,2), generator=g)
W1=torch.randn((6,300), generator=g)
b1=torch.randn(300, generator=g)
W2=torch.randn((300,27), generator=g)
b2=torch.randn(27, generator=g)
parameters=[C,W1,b1,W2,b2]

In [7]:
sum(p.nelement() for p in parameters)

10281

In [8]:
for p in parameters:
    p.requires_grad=True

#finalising LR
#reset parameters

lri=[]
lossi=[]
stepi=[]

for i in range(30000): 
    
    #minibatch construct
    ix=torch.randint(0,Xtr.shape[0],(32,))
    
    #forward pass
    emb=C[Xtr[ix]] #(32,3,2)
    h=torch.tanh(emb.view(-1,6) @ W1 +b1) #32x100
    logits=h @ W2 +b2 #32x27
    loss=F.cross_entropy(logits,Ytr[ix])
    
    #backward pass
    for p in parameters:
        p.grad=None            #set gradients to 0

    loss.backward()

    #upgarde 
#     lr=lrs[i]              # we start with very less lr and go go upto -1
    lr=0.01
    for p in parameters:
        p.data+= -lr*p.grad 
        
        #track stats
#         lri.append(lre[i])
        stepi.append(i)
        lossi.append(loss.item())
        
        
print(loss.item())
# this is loss for this particular minibatch

2.3544814586639404


In [None]:
plt.plot(stepi,lossi)  #kernel dying lol
#noise/thickness bcz of minibatch

[<matplotlib.lines.Line2D at 0x19c30707690>]

In [24]:
#eval loss for All of Xtr and Ytr

emb=C[Xtr] #(32,3,2)
h=torch.tanh(emb.view(-1,6) @ W1 +b1) #32x100
logits=h @ W2 +b2 #32x27
loss=F.cross_entropy(logits,Ytr)
loss.item()

2.4987435340881348

In [25]:
#val
emb=C[Xdev] #(32,3,2)
h=torch.tanh(emb.view(-1,6) @ W1 +b1) #32x100
logits=h @ W2 +b2 #32x27
loss=F.cross_entropy(logits,Ydev)
loss.item()

2.503011703491211

In [8]:
#experiment with different learning rate

In [None]:
#bottleneck of the model: the embedding (1st layer) C is 2-d:craming way too many char into 2 dim:

In [25]:
#increase the size of nn
g=torch.Generator().manual_seed(2147483647) #reproducible
C=torch.randn((27,10), generator=g)
W1=torch.randn((30,200), generator=g)
b1=torch.randn(200, generator=g)
W2=torch.randn((200,27), generator=g)
b2=torch.randn(27, generator=g)
parameters=[C,W1,b1,W2,b2]

sum(p.nelement() for p in parameters)

11897

In [26]:
lri=[]
lossi=[]
stepi=[]


for p in parameters:
    p.requires_grad=True

for i in range(200000):
  
  # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,))
  
  # forward pass
    emb = C[Xtr[ix]] # (32, 3, 10)
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 200)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
      #print(loss.item())

      # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
  
  # update
  #lr = lrs[i]
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

  # track stats
  #lri.append(lre[i])
    stepi.append(i)
    lossi.append(loss.log10().item()) #log10:for plotting logloss: hockey stick apperance squashed by this

print(loss.item())


2.1218512058258057


In [27]:
#plt.plot(stepi,stepi)

In [28]:
#eval loss for All of Xtr and Ytr

emb=C[Xtr] #(32,3,2)
h=torch.tanh(emb.view(-1,30) @ W1 +b1) #32x100
logits=h @ W2 +b2 #32x27
loss=F.cross_entropy(logits,Ytr)
loss.item()

2.128194570541382

In [29]:
#val
emb=C[Xdev] #(32,3,2)
h=torch.tanh(emb.view(-1,30) @ W1 +b1) #32x100
logits=h @ W2 +b2 #32x27
loss=F.cross_entropy(logits,Ydev)
loss.item()

2.1568751335144043

In [None]:
#kernel dying lol
#visualize dimensions 0 and 1 of the embedding matrix C for all characters
plt.figure(figsize=(8,8))
plt.scatter(C[:,0].data, C[:,1].data, s=200)
for i in range(C.shape[0]):
    plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha="center", va="center", color='white')
plt.grid('minor')

In [30]:

# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)
block_size=3

for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
        emb = C[torch.tensor([context])] # (1,block_size,d)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break
    
    print(''.join(itos[i] for i in out))

mora.
mayah.
see.
med.
ryla.
remmadiendra.
gracee.
daelin.
shy.
jenleigh.
estanaraelyn.
malara.
noshdanrishiriel.
jacie.
jeniquetton.
kuya.
aven.
jamyle.
eli.
kay.


In [31]:
#these names are pretty good