## Character level language model which considers multiple previous characters using Multilayer Neural Nets

In [None]:
#points from the research paper
#it is a word-level language model
#predicts the probability of the next word based on the previous three words
#every input word is embedded into a 30- or 60-dimensional feature vector
#which means there are 17000 words in a 30- or 60-dimensional feature space
#words with similar meaning or context are close to each other (generalization)
#three layers
#1.input layer takes the index of each word and checks the lookup table for its embedding
#2.hidden layer with tanh activation function
#3.softmax layer with 17000 classes fully connected to the hidden layer (most of the computation is here)

In [None]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F


In [None]:
# Read the dataset, one name per line. The context manager closes the file
# handle as soon as the contents have been read.
with open("names.txt","r") as f:
    words=f.read().splitlines()
len(words)

In [None]:
# Vocabulary: every distinct character that occurs in the words, sorted.
chars=sorted(set(''.join(words)))
# Character -> index. '.' is placed first, so it gets index 0.
stoi=dict(zip(['.']+chars,range(len(chars)+1)))

# Index -> character, the inverse of stoi.
itos={index:char for char,index in stoi.items()}
itos

In [None]:
# Build (context, next character) pairs from the first five words.
block_size=3  # how many previous characters form the context
X,Y=[],[]

for word in words[:5]:
    context=[0]*block_size  # every word starts from a context of index 0 ('.')
    for ch in word+'.':
        X.append(context)
        ix=stoi[ch]
        Y.append(ix)
        print(''.join(itos[i] for i in context),"--->",ch)
        context=context[1:]+[ix]  # slide the window: drop the oldest index, append the newest

X=torch.tensor(X)
Y=torch.tensor(Y)

In [None]:
# Inspect shape and dtype of the context tensor X and the label tensor Y.
X.shape,X.dtype,Y.shape,Y.dtype


In [None]:
# Embedding lookup table: 27 rows of 2 random values, one row per character index.
C=torch.randn((27,2))

In [None]:
# Index the table with the whole context tensor at once: every index in X is
# replaced by its row of C, which adds a trailing embedding dimension.
emb=C[X]
emb.shape

In [None]:
# Hidden layer parameters: 6 inputs, 100 units.
# presumably 6 = block_size (3) * embedding size (2), i.e. one flattened context
W1=torch.randn((6,100))
b1=torch.randn(100)

In [None]:
# Flatten by hand: concatenate the embeddings of the three context positions
# along the feature dimension.
torch.cat([emb[:,0,:],emb[:,1,:],emb[:,2,:]],dim=1)

In [None]:
# Same concatenation without hard-coding the number of positions:
# unbind splits emb along dim 1 into a tuple of slices.
torch.cat(torch.unbind(emb,1),dim=1)

In [None]:
# Small example tensor holding 0..17, used to illustrate view().
a=torch.arange(18)

In [None]:
a.view(9,2) # extremely efficient compared with the previous approach because it doesn't involve storage manipulation

In [None]:
# Show the flat storage behind `a`.
a.storage()

In [None]:
# Flatten each context's embeddings into one row of 6 values. -1 lets view
# infer the number of rows instead of hard-coding 32, matching the next cell.
emb.view(-1,6)

In [None]:
# Hidden activations: flatten each context's embeddings into one row of 6
# values (-1 lets view infer the number of rows), apply the affine layer,
# then tanh.
h=torch.tanh(emb.view(-1,6)@W1+b1)

In [None]:
# Check the shape of the hidden activations.
h.shape

In [None]:
# next layer: project the 100 hidden activations to 27 logits per example
W2=torch.randn((100,27))
b2=torch.randn(27)
logits=torch.matmul(h,W2)+b2
# softmax written out by hand: exponentiate, then normalise each row
counts=torch.exp(logits)
row_totals=counts.sum(dim=1,keepdim=True)
probs=counts/row_totals
# sanity check: probs[0].sum()

In [None]:

# Mean negative log-likelihood: for every example pick the probability the
# model assigned to the correct next character (row i, column Y[i]).
# The row index is built from the number of labels instead of a hard-coded 32.
loss=-probs[torch.arange(Y.shape[0]),Y].log().mean()
loss

## Now made respectable

In [None]:
#build dataset
def build_dataset(words,block_size=3):
    """Turn a list of words into (context, next-character) training tensors.

    Each word is terminated with '.' and scanned left to right. Every
    character yields one example whose input is the `block_size` character
    indices preceding it (padded with 0 at the start of the word).

    Args:
        words: sequence of strings; every character must be a key of the
            module-level `stoi` mapping.
        block_size: number of previous characters used as context.

    Returns:
        A tuple (X, Y) of tensors: X has one row of `block_size` context
        indices per example, Y has the index of the character to predict.
    """
    X=[]
    Y=[]

    for word in words:
        context=[0]*block_size
        for ch in word+'.':
            X.append(context)
            ix=stoi[ch]
            Y.append(ix)
            context=context[1:]+[ix]  # slide the window by one character

    X=torch.tensor(X)
    Y=torch.tensor(Y)
    print(X.shape,Y.shape)
    return X,Y
import random
# Shuffle the words in place with a fixed seed, then cut the list
# 80% / 10% / 10% into training, validation and test splits.
random.seed(42)
random.shuffle(words)
total=len(words)
n1,n2=int(0.8*total),int(0.9*total)
Xtre,Ytre=build_dataset(words[:n1])    # training split
Xval,Yval=build_dataset(words[n1:n2])  # validation split
Xte,Yte=build_dataset(words[n2:])      # test split

In [None]:
# All parameters are drawn from one seeded generator for reproducibility.
# The order of the randn calls determines which values each tensor receives.
g=torch.Generator().manual_seed(2147483647)
C=torch.randn(27,10,generator=g)    # embedding table
W1=torch.randn(30,200,generator=g)  # hidden layer weights
b1=torch.randn(200,generator=g)     # hidden layer bias
W2=torch.randn(200,27,generator=g)  # output layer weights
b2=torch.randn(27,generator=g)      # output layer bias
parameters=[C,W1,b1,W2,b2]


In [None]:
# Total number of scalar values across all parameter tensors.
sum(p.nelement() for p in parameters)

In [None]:
# Turn on gradient tracking for every parameter tensor.
for param in parameters:
    param.requires_grad=True

### finding the best learning rate

In [None]:

# Candidate learning rates, evenly spaced in log10 space from 10^-3 to 10^0.
# The lower and upper bounds are found by experiment, by watching where the
# loss becomes unstable.
lre=torch.linspace(-3,0,steps=1000)  # exponents

lrs=10**lre  # the learning rates themselves

In [None]:
# Per-step training statistics: learning rate used, loss, and step number.
lri,lossi,stepi=[],[],[]

In [None]:

# Mini-batch gradient descent with a single step decay of the learning rate.
for i in range(100000):
    # minibatch: 32 random training examples, since the entire dataset costs too much per step
    ix=torch.randint(0,Xtre.shape[0],(32,))

    # forward pass
    emb=C[Xtre[ix]]                      # (32, 3, 10)
    h=torch.tanh(emb.view(-1,30)@W1+b1)  # (32, 200)
    logits=h@W2+b2                       # (32, 27)
    # F.cross_entropy replaces the manual exp / normalise / log / mean chain;
    # its forward and backward passes are more efficient
    loss=F.cross_entropy(logits,Ytre[ix])

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad=None
    loss.backward()

    # update: 0.1 for the first 50000 steps, 0.01 afterwards.
    # For the learning-rate sweep use lr=lrs[i] and lri.append(lr) instead;
    # lrs holds only 1000 values, so the loop must then run 1000 steps.
    lr=0.1 if i<50000 else 0.01
    with torch.no_grad():  # the update itself must not be tracked by autograd
        for p in parameters:
            p-=lr*p.grad

    # track stats as plain floats so the autograd graph of each step is not kept alive
    stepi.append(i)
    lossi.append(loss.item())

print(loss)
#the loss does not reach zero; presumably because a context such as '...' can be followed by any letter

In [None]:
# Loss against learning rate, for determining the learning rate.
# lri is only filled while the sweep (lr=lrs[i]; lri.append(lr)) is enabled in
# the training loop. Otherwise its length differs from lossi and the two
# cannot be plotted against each other, so the plot is skipped.
if len(lri)==len(lossi):
    plt.plot(torch.tensor(lri),torch.tensor(lossi))
else:
    print("skipping plot: lri has",len(lri),"entries but lossi has",len(lossi))

In [None]:
# Loss at every training step, used when tuning the hyperparameters.
step_axis=torch.tensor(stepi)
loss_axis=torch.tensor(lossi)
plt.plot(step_axis,loss_axis)

In [None]:
# Loss over the entire training split.
# This is evaluation only, so no_grad avoids building an autograd graph for
# every example in the split.
with torch.no_grad():
    emb=C[Xtre]                          # one embedding row per context index
    h=torch.tanh(emb.view(-1,30)@W1+b1)  # one row of hidden activations per example
    logits=h@W2+b2                       # one row of logits per example
    loss=F.cross_entropy(logits,Ytre)
print(loss.item())

In [None]:
# Loss over the entire validation split.
# This is evaluation only, so no_grad avoids building an autograd graph for
# every example in the split.
with torch.no_grad():
    emb=C[Xval]                          # one embedding row per context index
    h=torch.tanh(emb.view(-1,30)@W1+b1)  # one row of hidden activations per example
    logits=h@W2+b2                       # one row of logits per example
    loss=F.cross_entropy(logits,Yval)
print(loss.item())

In [None]:
# Embedding plot: scatter columns 0 and 1 of C and label each point with its
# character. With a two-column C this is the whole embedding.
plt.figure(figsize=(8,8))

xs,ys=C[:,0].data,C[:,1].data
plt.scatter(xs,ys,s=200)
for idx in range(C.shape[0]):
    plt.text(C[idx,0].item(),C[idx,1].item(),itos[idx],ha="center",va="center",color="white")
plt.grid()

In [None]:
#notes:
#with mini-batches we compute an approximate gradient rather than the exact one; this works well in practice
#as the size of the network increases it may overfit (training loss ~ 0)
#training split, dev/val split, test split
# If the validation accuracy is much lower than the training accuracy, the model might be overfitting (memorizing instead of generalizing).
#80%,10%,10%
#to increase performance we can increase the network size or the dimensionality of the embeddings

#### justification for cross entropy

In [None]:

# Softmax written out naively on logits that contain one very large value.
logits=torch.tensor([-5,-3,0,100])
counts=torch.exp(logits)
total=counts.sum()
p=counts/total
p

In [None]:
#for large positive logits, logits.exp() becomes inf
#for large negative logits, logits.exp() just becomes very small

In [None]:
#cross entropy: shift the logits so the largest one is 0 before exponentiating;
#the probabilities are unchanged, but exp() can no longer overflow
logits=logits-logits.max()
counts=torch.exp(logits)
total=counts.sum()
p=counts/total
p

In [None]:
#sampling from the model
for _ in range(10):
    out=[]

    context=[0]*3  # start from a context of index 0 ('.'); 3 previous characters, as in build_dataset
    with torch.no_grad():  # inference only, no gradients needed
        while True:
            emb=C[torch.tensor(context)]        # (block_size, d)
            h=torch.tanh(emb.view(1,-1)@W1+b1)  # same tanh hidden layer as in training
            logits=h@W2+b2
            probs=F.softmax(logits,dim=1)       # normalise over the character dimension

            ix=torch.multinomial(probs,replacement=True,num_samples=1).item()
            context=context[1:]+[ix]  # slide the window to include the sampled character
            out.append(ix)
            if ix==0:  # index 0 is '.', which ends the word
                break
    print(''.join(itos[i] for i in out))
        
        


        