In [119]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [120]:
#OBJECTIVE: WE WANT TO PREDICT NEXT CHARATCER BY TAKING CONTEXT FROM MORE PREVIOUS CHARACTERS(like 3) INSTEAD OF JUST 1 

In [121]:
#1 read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [122]:
chars = sorted(list(set(''.join(words))))
charIntIndexMapping = {s:i+1 for i,s in enumerate(chars)}
charIntIndexMapping['.'] = 0
indexToCharMapping ={i:s for s,i in charIntIndexMapping.items()}

In [123]:
#2 BUILDING TRAINING DATASET

block_size = 3 # context length: how many characters do we take to predict the next one? basically size of sliding window
inputContext, outputCharcterForThatContext = [], []

for w in words:
  

  contextSlidingWindow = [0] * block_size #Initialise with arr= [0,0,0] where 0 represents '.' char so essentially "..."
    
  for ch in w + '.': #Appending '.' to mark end of word
      
    #NOTE ON APPENDING '.' in Words:
    #1. In bigram model we were appending '.' to start and end of each word 
    #2. Here we dont need to append '.' in starting because we are starting context as "..." 

      
    index = charIntIndexMapping[ch]
    inputContext.append(contextSlidingWindow)
    outputCharcterForThatContext.append(index)
    #print(''.join(indexToCharMapping[i] for i in context), '--->', indexToCharMapping[ix])
    
    contextSlidingWindow = contextSlidingWindow[1:] + [index] #Updating context, so context is like sliding window of size=block_size, when you move to 
    #next iteration you append current char from back in sliding window and first char is removed
    #Syntax context[1:] means taking array from index-1 to last index and appending index represented by current char

#1. inputContext: Represents a matrix where row represents integer form of each possible sliding window of size -3 in all name in names.txt
# and we have 3 columns(bec context_length=3) which represents the content of contextWindow represented by row


#Forexample for emma actually becomes emma.
#Row  Columns(int mapping of char)  contextWindow  
#0->  0,0,0                         ...
#1->  0,0,5                         ..e
#2->  0,5,13                        .em
#3->  5,13,13                       emm
#4->  13,13,1                       mma
#(No entry for ma.)


#2. a. outputCharcterForThatContext: Represents an array of size equal to number of rows in inputContext and outputCharcterForThatContext[i]
#gives index the next character for the sliding window or the context represented by ith row in inputContext
#b. We can say outputCharcterForThatContext is label for our data so we have labelled dataset for our training
inputContext = torch.tensor(inputContext)
outputCharcterForThatContext = torch.tensor(outputCharcterForThatContext)
print(inputContext.shape)
print(outputCharcterForThatContext.shape)


#3. We can change size of sliding window to 4 or 5 or anything in that case our contextWindow will be ....., ....e, ...em, ..emm, .emma,



torch.Size([228146, 3])
torch.Size([228146])


In [124]:
#3 a.Creating Embedding for Each Character so that it can be comfortably for training ML Model like matrix multiplication
#b. In bigram model we created one hot encoding i.e each character was written as 27-bit 
#c. Here we will map each character to 2 dimensional embedding, so for 27 characters we will get 27x2 matrix

charEmbeddings = torch.randn((27,2),requires_grad=True)

charEmbeddings[5] # this gives 2 dimensional embedding of char at index:5  i.e e
print(charEmbeddings[5])

#e. charEmbeddings[5] THIS EMBEDDING IS EQUIVALENT TO:
embedding = F.one_hot(torch.tensor(5), num_classes=27).float() @ charEmbeddings 
embedding
#So we can see charEmbeddings[5] = embedding 
# 1. We created one hot encoding of e as 000010...0 (i.e 1x27 matrix) and did matrix multiplication with 27x2 matrix(this matrix is like 
  #weights matrix so we can say our charEmbedding is similar 1 layer neural , so this is first layer of neural net
# 2. so we will be using charEmbedding directly instead of this one hot encoding and matrix multiplication


# charEmbeddings will be used as look up table to find embedding of a char

tensor([1.7745, 0.2319], grad_fn=<SelectBackward0>)


tensor([1.7745, 0.2319], grad_fn=<SqueezeBackward4>)

In [125]:
#4. Preparing Input data for neural network which will go to layer-1
inputForLayer1 = charEmbeddings[inputContext]
print(inputForLayer1.shape) # 3-d dimensional(p,q,r) where p-> number of context window possible for all names, q-> size of context window
#basically block_size, r-> the dimension in which each char is embedded so in our case we have embedded each char in 2D(Look at block-2)

#Flattening from 3d to 2d
inputForLayer1 = inputForLayer1.view(len(inputContext), 6)
print(inputForLayer1.shape)

torch.Size([228146, 3, 2])
torch.Size([228146, 6])


In [126]:
#5. DEFINING TWO LAYERS OF NEURAL NETWORK
g = torch.Generator().manual_seed(2147483647) # for reproducibility
weightLayer1 = torch.randn((6, 100), generator=g, requires_grad=True) #layer-1 100 neurons each neuron have 6 weights because input have 6 feuture so we have
#6 weights
biasLayer1 = torch.randn(100, generator=g, requires_grad=True) 

weightLayer2 = torch.randn((100, 27), generator=g, requires_grad=True) # layer-2 27 neurons each neurons have 100 weights because input for layer-2 will havve
#100 feature so we have 100 weight (228146,6 X 6,100--> 228146,100 Matrix which is input for layer-2)
biasLayer2 = torch.randn(27, generator=g, requires_grad=True) #layer-2 27 biases
parameters = [charEmbeddings, weightLayer1, biasLayer1, weightLayer2, biasLayer2]


In [127]:
#6. RUNING UNTRAINED NEURAL NET TO FOR FIRST TIME TO GET RESULT(FORWARD PASS)

h = torch.tanh(inputForLayer1@weightLayer1+biasLayer1) #Output from Layer-1, Range of tanh=[-1,1]
#layer-1 100 biases, bias should be equal to number of neurons in layer
# as o/p from neuron = Summation(wi*xi)+b (if you see each cell of output matrix after multiplication is summation(wi*xi)) and then we add
#bias to each cell of o/p matrix

logits = h@weightLayer2+biasLayer2 #Output from Layer-2

counts = logits.exp()
prob = counts/counts.sum(1,keepdims=True) #prob[i][j] = count[i][j]/(sum of elements in count[i] row)
loss = -prob[torch.arange(len(inputContext)),outputCharcterForThatContext].log().mean()
loss
print("manually calculating loss",loss)

#b. ShortCut for calculation loss using CROSS ENTROPY:
loss = F.cross_entropy(logits,outputCharcterForThatContext)
print("calculating loss through cross entropy",loss)

#We should always use cross entropy function because of following reasons:
    #1. Manual calculation result in creating extra tensors like counts and prob which can be heavy for large dataset
    #2. Pytorch optimises loss calculation using Cross entropy because it clusterup various operation
    #3. VERY IMP REASON:
        #3.1 we do exponentiation of each cell of logits matrix to make counts positive value but if suppose logits[i][j] = 100 or 
        #any slightly big value then e^100 will be cross limits of float datatype and hence count[i][j] = infinity and our prob matrix will go for 
        #toss
        #3.2 How Cross Entropy solves this problems:
        #It finds max value in logits[][] and subtract each element with that value, now since count[][] is normalised hence there will be no impact
        #on count[i][j](bec we are subtracting each element with same value so there will be no impact on normalised value) because of this 
        #optimisation no element can have more than 0 value so no chance of e^x crossing floats limit
    #4. Backprob is easier in this case

manually calculating loss tensor(18.8199, grad_fn=<NegBackward0>)
calculating loss through cross entropy tensor(18.8199, grad_fn=<NllLossBackward0>)


In [128]:
#7. TRAINING THE MODEL( FORWARD PASS AND BACKWARD PASS FOR SOME NUMBER OF TIME)
for _ in range(10):
    #forward pass
    h = torch.tanh(inputForLayer1@weightLayer1+biasLayer1)
    logits = h@weightLayer2+biasLayer2
    loss = F.cross_entropy(logits,outputCharcterForThatContext)
    print(loss.item)
    for p in parameters:
        p.grad = None #We will need to optimise charEmbedding(because it has 2 dimension for 27 chars, weights and biases
    loss.backward()
    for p in parameters:
        p.data += -0.1*p.grad
 
    

<built-in method item of Tensor object at 0x164fdde50>
<built-in method item of Tensor object at 0x164fdd090>


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.