In [1]:
import random

import matplotlib.pyplot as plt # for making figures
import numpy as np
import torch
import torch.nn.functional as F
%matplotlib inline

In [2]:
# read in all the words (one name per line)
with open('names.txt', 'r') as namesFile:  # the context manager closes the file even if reading fails
    words = namesFile.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
len(words)

32033

In [4]:
# Vocabulary: every distinct character that occurs in the words, in sorted order.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1 upwards; index 0 is assigned to '.'.
charIntIndexMapping = {ch: idx for idx, ch in enumerate(chars, start=1)}
charIntIndexMapping['.'] = 0
# Reverse lookup: integer index -> character.
indexToCharMapping = {idx: ch for ch, idx in charIntIndexMapping.items()}

In [9]:
#1. Build the dataset: Train, Validation and Test
block_size = 3 

def build_dataset(words, block_size=3, char_to_index=None):
    """Turn a list of words into (context window, next character) pairs.

    Every word is terminated with '.'. For each character of the terminated
    word, the indices of the ``block_size`` preceding characters (left-padded
    with index 0 at the start of the word) form the context, and the index of
    the character itself is the target.

    Args:
        words: iterable of strings; every character must be a key of
            ``char_to_index``.
        block_size: number of preceding characters in each context window.
            Defaults to 3, the value that used to be hard-coded here.
        char_to_index: mapping from character to integer index; it must
            contain '.'. Defaults to the notebook-level ``charIntIndexMapping``.

    Returns:
        ``(inputContext, outputCharcterForThatContext)``: int64 tensors of
        shape ``(N, block_size)`` and ``(N,)``, where N is the number of
        characters over all words, counting one '.' per word.

    Raises:
        KeyError: if a character is missing from ``char_to_index``.
    """
    if char_to_index is None:
        char_to_index = charIntIndexMapping

    contexts, targets = [], []
    for w in words:
        # Start every word from an all-padding window.
        contextSlidingWindow = [0] * block_size
        for ch in w + '.':
            index = char_to_index[ch]
            contexts.append(contextSlidingWindow)
            targets.append(index)
            # Build a NEW list (never mutate): the previous window object is
            # already stored in `contexts` and must not change.
            contextSlidingWindow = contextSlidingWindow[1:] + [index]

    # Explicit dtype and shape so that an empty word list still yields an
    # int64 tensor of shape (0, block_size) instead of a float tensor of shape (0,).
    inputContext = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
    outputCharcterForThatContext = torch.tensor(targets, dtype=torch.long)
    print(inputContext.shape,outputCharcterForThatContext.shape)
    return inputContext,outputCharcterForThatContext

# Shuffle a COPY of the word list. Shuffling `words` in place made this cell
# non-idempotent: re-running it reshuffled an already shuffled list and silently
# produced a different train/validation/test split. With the fixed seed the copy
# receives exactly the permutation the in-place shuffle produced on a fresh kernel.
random.seed(42)
shuffledWords = list(words)
random.shuffle(shuffledWords)

# Split boundaries at word granularity: first 80% train, next 10% validation, last 10% test.
noOfWordForTraining = int(0.8*len(shuffledWords))
noOfWordsForValidation = int(0.9*len(shuffledWords))

inputContextTrain, outputCharacterTrain = build_dataset(shuffledWords[:noOfWordForTraining])
inputContextValidation, outputCharacterValidation = build_dataset(shuffledWords[noOfWordForTraining:noOfWordsForValidation])
inputContextTesting, outputCharacterTesting = build_dataset(shuffledWords[noOfWordsForValidation:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [11]:
#2. MLP revisited: create the MLP parameters (character embeddings, weights and biases)

n_embd = 2       # dimensionality of each character embedding vector
n_hidden = 300   # number of neurons in the hidden (first) layer of the MLP
vocab_size = 27  # unique characters we have, including '.'

g = torch.Generator().manual_seed(2147483647)  # for reproducibility

# The draws below must stay in this order: each call advances the generator.
charEmbeddings = torch.randn(vocab_size, n_embd, generator=g, requires_grad=True)
weightLayer1 = torch.randn(n_embd * block_size, n_hidden, generator=g, requires_grad=True)
biasLayer1 = torch.randn(n_hidden, generator=g, requires_grad=True)
weightLayer2 = torch.randn(n_hidden, vocab_size, generator=g, requires_grad=True)
biasLayer2 = torch.randn(vocab_size, generator=g, requires_grad=True)

parameters = [charEmbeddings, weightLayer1, weightLayer2, biasLayer1, biasLayer2]
print(sum(map(torch.numel, parameters)))  # total number of scalar parameters

10281


In [13]:
#3. Training the model (plain minibatch SGD with a constant learning rate of 0.1)
max_steps = 200000
batch_size = 32
noOfColAfterFlattening = block_size * n_embd
noOfWindowsToBeUsedPerLearningStep = batch_size
lossi, stepsi = [], []

for i in range(max_steps):
    # --- minibatch: pick random context windows from the training set ---
    randomSlidingWindowsUsed = torch.randint(
        0, inputContextTrain.shape[0], (noOfWindowsToBeUsedPerLearningStep,)
    )

    # --- forward pass: embed, flatten, hidden tanh layer, output logits ---
    inputForLayer1 = charEmbeddings[inputContextTrain[randomSlidingWindowsUsed]].view(
        noOfWindowsToBeUsedPerLearningStep, noOfColAfterFlattening
    )
    h = torch.tanh(inputForLayer1 @ weightLayer1 + biasLayer1)
    logits = h @ weightLayer2 + biasLayer2
    loss = F.cross_entropy(logits, outputCharacterTrain[randomSlidingWindowsUsed])

    # --- backward pass: clear stale gradients, then backpropagate ---
    for p in parameters:
        p.grad = None
    loss.backward()

    # --- SGD update ---
    for p in parameters:
        p.data -= 0.1 * p.grad

    # --- track stats: log every 10,000 steps, record every step ---
    stepLoss = loss.item()
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {stepLoss:.4f}')
    lossi.append(stepLoss)
    stepsi.append(i)

# This is the loss of the last minibatch only.
print("loss on training data set",loss.item())

      0/ 200000: 27.0705
  10000/ 200000: 2.8543
  20000/ 200000: 2.4815
  30000/ 200000: 2.7685
  40000/ 200000: 2.1235
  50000/ 200000: 2.3941
  60000/ 200000: 2.3548
  70000/ 200000: 2.4956
  80000/ 200000: 2.5888
  90000/ 200000: 2.3835
 100000/ 200000: 2.7417
 110000/ 200000: 2.4000
 120000/ 200000: 2.2243
 130000/ 200000: 2.4927
 140000/ 200000: 2.6062
 150000/ 200000: 2.4692
 160000/ 200000: 2.5604
 170000/ 200000: 2.5747
 180000/ 200000: 2.2426
 190000/ 200000: 2.5037
loss on training data set 2.052839994430542


In [None]:
# Plot the recorded per-step minibatch loss (lossi) against the training step index (stepsi).
plt.plot(stepsi,lossi)

In [None]:
def trainingModel(inputContextTrain,charEmbeddings,weightLayer1,biasLayer1,weightLayer2,biasLayer2,
                  outputCharacterTarget=None,max_steps=200000,batch_size=32):
    """Train the two-layer character MLP with minibatch SGD, then plot and print the loss.

    The five parameter tensors are updated in place (through ``p.data``), so
    the caller's tensors hold the trained values afterwards. Minibatch indices
    are drawn with ``torch.randint`` from the global torch RNG.

    Args:
        inputContextTrain: integer tensor of context windows, one row per example.
        charEmbeddings: embedding table, indexed by character index.
        weightLayer1, biasLayer1: hidden (tanh) layer parameters.
        weightLayer2, biasLayer2: output layer parameters.
        outputCharacterTarget: target character index for every row of
            ``inputContextTrain``. Defaults to the notebook-level
            ``outputCharacterTrain``, which is what this function always used.
        max_steps: number of SGD steps (default 200000).
        batch_size: examples per step (default 32).

    Returns:
        None. The loss curve is plotted and the last minibatch loss is printed.

    Raises:
        ValueError: if the number of targets differs from the number of inputs.
    """
    if outputCharacterTarget is None:
        outputCharacterTarget = outputCharacterTrain
    if len(outputCharacterTarget) != len(inputContextTrain):
        raise ValueError(
            f"inputs and targets differ in length: {len(inputContextTrain)} vs {len(outputCharacterTarget)}"
        )

    lossi = []
    stepsi = []
    parameters = [charEmbeddings, weightLayer1, weightLayer2, biasLayer1, biasLayer2]
    for i in range(max_steps):
        # minibatch of random context windows
        randomSlidingWindowsUsed = torch.randint(0, inputContextTrain.shape[0], (batch_size,))

        # forward pass; view(batch_size, -1) concatenates the embeddings of one window,
        # so the width follows from the tensors instead of notebook globals
        inputForLayer1 = charEmbeddings[inputContextTrain[randomSlidingWindowsUsed]]
        inputForLayer1 = inputForLayer1.view(batch_size, -1)
        h = torch.tanh(inputForLayer1@weightLayer1+biasLayer1)
        logits = h@weightLayer2+biasLayer2
        loss = F.cross_entropy(logits,outputCharacterTarget[randomSlidingWindowsUsed])

        # backward pass
        for p in parameters:
            p.grad = None
        loss.backward()

        # step learning-rate decay: 0.1 for the first 100,000 steps, 0.01 afterwards
        lr = 0.1 if i<100000 else 0.01
        for p in parameters:
            p.data += -lr * p.grad

        # track stats
        if i % 10000 == 0: # print every once in a while
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}') #logging the loss in every 10,000 learning steps
        lossi.append(loss.item())
        stepsi.append(i)

    plt.plot(stepsi,lossi)
    if lossi:  # nothing to report when max_steps == 0
        # loss of the final minibatch only
        print("loss on training data set",lossi[-1])

In [None]:
#4 PROBLEM-1 WITH OUR MODEL(FIXING THE INITIAL LOSS)

#1. LOSS BEFORE THE FIRST LEARNING STEP IS TOO HIGH
    #a. At the first learning step the loss is very high (about 27). The loss plot shows it dropping drastically over the next
    #few learning steps, which suggests that something is wrong with the parameters we start with.

    #b. There are 27 possible characters for every position. Before any training the model has no reason to prefer one of them,
    #so every character should be equally likely: probability 1/27, i.e. a negative log likelihood of -ln(1/27), about 3.29.
    # Because the loss is the mean negative log likelihood, the initial loss should be about 3.29 and not about 27. This again
    #suggests that something is wrong with the parameters the model starts with.

#2. SOLUTION: (the logits are the values before the "counts" step, i.e. before the exponentiation that turns them into counts)
    #a. At the first learning step we want the logits to be close to each other. The more the logit values differ,
    #the less uniform the predicted probabilities are, and hence the higher the loss (above the 3.29 we get for equally likely characters).
    #b. We want all logits[i][j] to be roughly equal, so we make them close to 0.
    #c. We initialise b2 (the bias of layer-2) to 0 and multiply the weights of layer-2 by 0.01.
    #d. We do not want to multiply the weights by exactly 0; multiplying the bias by 0 is fine.


g = torch.Generator().manual_seed(2147483647)  # for reproducibility
charEmbeddings = torch.randn(vocab_size, n_embd, generator=g, requires_grad=True)
weightLayer1 = torch.randn(n_embd * block_size, n_hidden, generator=g, requires_grad=True)
biasLayer1 = torch.randn(n_hidden, generator=g, requires_grad=True)

# Scale first, then switch gradient tracking on. The author's note: creating the tensor with
# requires_grad=True and multiplying it afterwards caused an error in the update step
# ("can't multiply None with float" when the learning rate was multiplied by the gradient),
# hence gradient tracking is enabled as a separate step on the already scaled tensor.
weightLayer2 = (torch.randn(n_hidden, vocab_size, generator=g) * 0.01).requires_grad_()
biasLayer2 = (torch.randn(vocab_size, generator=g) * 0).requires_grad_()

parameters = [charEmbeddings, weightLayer1, weightLayer2, biasLayer1, biasLayer2]
trainingModel(inputContextTrain, charEmbeddings, weightLayer1, biasLayer1, weightLayer2, biasLayer2)


#3. Observations from the solution:
    #1. The loss at the first step drops drastically, from a very large value (about 27) to 3.25, which is what we want,
    # because the loss for equally likely characters is 3.29.
    #2. The loss plot no longer has a hockey-stick shape. Earlier we started with a very high loss and immediately
    # dropped to a lower one; now that this is fixed, the easy gains have been removed from training.
    #3. Because we start with a better model, we do not waste the initial learning steps on those easy gains. This results in a
    # lower loss of 1.91, compared with the previous training run where we did not multiply by 0.01 and 0.

In [None]:
# Plot tanh over [-10, 10] to show where the curve flattens out.
# (numpy is imported in the notebook's top import cell.)
x = np.linspace(-10, 10, 400)
y = np.tanh(x)
plt.figure(figsize=(4, 3))
plt.plot(x, y, label='tanh(x)', color='blue')
plt.xlabel('x')
plt.ylabel('tanh(x)')
plt.title('tanh activation')
plt.legend();  # without a legend the label passed to plot() is never displayed

In [None]:
#5. PROBLEM-2 WITH OUR MODEL(FIXING SATURATED tanh)
    #1. We compute h = torch.tanh(inputForLayer1@weightLayer1+biasLayer1), which squashes the result of layer-1 into the range -1 to 1.
    # The tanh plot shows that for inputs that are large enough in magnitude (more than about 3 or less than about -3) tanh() gives
    #values very close to +1 and -1 respectively.
    
    #2. The gradient in these regions of the tanh graph is (almost) zero, because the graph is flat for x>+3 and x<-3.
    
    #3. When we do gradient descent in the backward pass, those cells of the matrix
    #h = torch.tanh(inputForLayer1@weightLayer1+biasLayer1) whose pre-activation values are large in magnitude pass back (almost)
    #no gradient, so the weights behind them are not optimised. If an entire column of h is +1 or -1, that neuron's weights
    #can never be optimised and it is a dead neuron.
    
    #4. A very large learning rate can sometimes cause the dead-neuron problem as well.

#SOLUTION:
#1. Multiply the bias and weights of the layer that feeds the tanh activation by a small number (such as 0.01) so that
# the values coming out of the matrix multiplication are small.
#2. Here we multiply the bias of layer-1 by 0.01 and the weights of layer-1 by 0.2.








In [None]:
#5.2 Continued: PRINTING H matrix BEFORE OPTIMIZATION (layer-1 weights and bias left unscaled)
g = torch.Generator().manual_seed(2147483647)  # for reproducibility
charEmbeddings = torch.randn(vocab_size, n_embd, generator=g, requires_grad=True)
weightLayer1 = torch.randn(n_embd * block_size, n_hidden, generator=g)
biasLayer1 = torch.randn(n_hidden, generator=g)

# One forward pass through layer-1 for a single random minibatch, as in the first learning step.
noOfWindowsToBeUsedPerLearningStep = batch_size
randomSlidingWindowsUsed = torch.randint(
    0, inputContextTrain.shape[0], (noOfWindowsToBeUsedPerLearningStep,)
)
inputForLayer1 = charEmbeddings[inputContextTrain[randomSlidingWindowsUsed]].view(
    noOfWindowsToBeUsedPerLearningStep, noOfColAfterFlattening
)
h = torch.tanh(inputForLayer1 @ weightLayer1 + biasLayer1)

# Histogram of all tanh outputs, spread over 50 bins.
plt.hist(h.flatten().tolist(), bins=50)

# Boolean map of the cells whose absolute value exceeds 0.99.
plt.figure(figsize=(20, 10))
plt.imshow(h.abs() > 0.99, cmap='gray', interpolation='nearest')

In [None]:
#5.3 Continued: PRINTING H matrix AFTER OPTIMIZATION (layer-1 weights and bias scaled by 0.01)
g = torch.Generator().manual_seed(2147483647)  # for reproducibility
charEmbeddings = torch.randn(vocab_size, n_embd, generator=g, requires_grad=True)
weightLayer1 = 0.01 * torch.randn(n_embd * block_size, n_hidden, generator=g)
biasLayer1 = 0.01 * torch.randn(n_hidden, generator=g)

# One forward pass through layer-1 for a single random minibatch, as in the first learning step.
noOfWindowsToBeUsedPerLearningStep = batch_size
randomSlidingWindowsUsed = torch.randint(
    0, inputContextTrain.shape[0], (noOfWindowsToBeUsedPerLearningStep,)
)
inputForLayer1 = charEmbeddings[inputContextTrain[randomSlidingWindowsUsed]].view(
    noOfWindowsToBeUsedPerLearningStep, noOfColAfterFlattening
)
h = torch.tanh(inputForLayer1 @ weightLayer1 + biasLayer1)

# flatten() turns the matrix into a 1-d sequence for the histogram (50 bins)
plt.hist(h.flatten().tolist(), bins=50)

#OBSERVATIONS (author's notes):
#From PLOT-1 (blue histogram):
    #The tanh values lie strictly between -1 and 1; no cell of the matrix has a value of +1 or -1,
    #so no cell ends up with a zero gradient.

# Boolean map of the cells whose absolute value exceeds 0.99.
plt.figure(figsize=(20, 10))
plt.imshow(h.abs() > 0.99, cmap='gray', interpolation='nearest')

#OBSERVATIONS-2:
#FROM PLOT-2 (barcode-like matrix)
    #1. The barcode is the matrix we get from layer-1 after the tanh activation function; a cell is white if abs(h[i][j]) > 0.99.
    #2. So every white cell in the matrix has an absolute value > 0.99, which means it
        #contributes an (almost) zero gradient.
    #3. If a full column is white, the neuron represented by that column is a dead neuron.
    #We can see that even without the optimization we did not have the dead-neuron problem, at least.


In [None]:
#5.4 Continued: training the model on the train set (after solving problem-2)

g = torch.Generator().manual_seed(2147483647)  # for reproducibility
charEmbeddings = torch.randn(vocab_size, n_embd, generator=g, requires_grad=True)

# Each tensor is scaled first and only then marked as trainable.
# Layer-1: weights scaled by 0.2, bias by 0.01. Layer-2: weights scaled by 0.01, bias zeroed.
weightLayer1 = (torch.randn(n_embd * block_size, n_hidden, generator=g) * 0.2).requires_grad_()
biasLayer1 = (torch.randn(n_hidden, generator=g) * 0.01).requires_grad_()
weightLayer2 = (torch.randn(n_hidden, vocab_size, generator=g) * 0.01).requires_grad_()
biasLayer2 = (torch.randn(vocab_size, generator=g) * 0).requires_grad_()

parameters = [charEmbeddings, weightLayer1, weightLayer2, biasLayer1, biasLayer2]
trainingModel(inputContextTrain, charEmbeddings, weightLayer1, biasLayer1, weightLayer2, biasLayer2)

#OBSERVATION-2: Not much improvement in the result. Probably there were many cells in the h matrix (the tanh matrix we get
#from the activation after layer-1) with large values giving +1 or -1, but no whole column of the tanh matrix was +1 or -1,
#i.e. we did not have the dead-neuron problem. This can be seen from observation-2 in 5.2.

In [None]:
#6. Kaiming init: a principled way to choose the factors we have been multiplying the weights and biases of the neural net with
#(Revise Gaussian distribution, standard deviation and variance)

#1. Random example to understand things better
x = torch.randn(1000, 10)  # sample flattened input context: 1000 context windows, 10 = block_size * char embedding dimension
w = torch.randn(10, 200)   # sample layer-1 weight matrix: 10 weights for each of 200 neurons
y = x @ w                  # sample matrix multiplication

# A real minibatch of embedded context windows, for comparison with the synthetic x.
randomSlidingWindowsUsed = torch.randint(0, inputContextTrain.shape[0], (32,))
inputForLayer1 = charEmbeddings[inputContextTrain[randomSlidingWindowsUsed]]

# Three stacked histograms (50 bins, normalised to a density); view(-1) flattens each matrix to 1-d.
#plt.figure(figsize=(20,5))
plt.subplot(3, 1, 1)
plt.hist(x.view(-1).tolist(), 50, density=True)

plt.subplot(3, 1, 2)
plt.hist(y.view(-1).tolist(), 50, density=True)

plt.subplot(3, 1, 3)
plt.hist(inputForLayer1.view(-1).tolist(), 50, density=True)


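#2. Understanding the x-axis and y-axis of the histogram plots
#1. The x-axis represents the values of the matrix elements, grouped into 50 bins; the y-axis represents the density of
# elements in each bin (density=True normalises the histogram so that its total area is 1).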

#3. Observation-1: the Gaussian in the input-context plot looks wider than the one for the matrix we get after the matrix multiplication.
# NOTE(review): each subplot has its own x-axis scale, so compare the tick values (or x.std() and y.std()) rather than the visual
# width. For unit-variance x and w every element of y sums 10 products, so its standard deviation should be about sqrt(10) = 3.16.

#4 Inference:
    #1. The width of a histogram tells us how far the elements typically are from the mean value: the wider the distribution,
    # the more elements differ strongly from the mean, i.e. the more outlier-like values the matrix contains.
    #2. Ideally we do not want the distribution of the elements around the mean value to change between the input-context matrix
    # and the matrix y that comes out of the matrix multiplication.





# # MLP revisited
# n_embd = 10 # the dimensionality of the character embedding vectors
# n_hidden = 200 # the number of neurons in the hidden layer of the MLP

# g = torch.Generator().manual_seed(2147483647) # for reproducibility
# C  = torch.randn((vocab_size, n_embd),            generator=g)
# W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5) #* 0.2
# #b1 = torch.randn(n_hidden,                        generator=g) * 0.01
# W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.01
# b2 = torch.randn(vocab_size,                      generator=g) * 0

# # BatchNorm parameters
# bngain = torch.ones((1, n_hidden))
# bnbias = torch.zeros((1, n_hidden))
# bnmean_running = torch.zeros((1, n_hidden))
# bnstd_running = torch.ones((1, n_hidden))

# parameters = [C, W1, W2, b2, bngain, bnbias]
# print(sum(p.nelement() for p in parameters)) # number of parameters in total
# for p in parameters:
#   p.requires_grad = True


In [None]:
#7 BatchNormalisation

g = torch.Generator().manual_seed(2147483647)  # for reproducibility
charEmbeddings = torch.randn(vocab_size, n_embd, generator=g, requires_grad=True)

# Each tensor is scaled first and only then marked as trainable.
weightLayer1 = (torch.randn(n_embd * block_size, n_hidden, generator=g) * 0.2).requires_grad_()
biasLayer1 = (torch.randn(n_hidden, generator=g) * 0.01).requires_grad_()
weightLayer2 = (torch.randn(n_hidden, vocab_size, generator=g) * 0.01).requires_grad_()
biasLayer2 = (torch.randn(vocab_size, generator=g) * 0).requires_grad_()

# Batch-normalisation additions: a trainable per-neuron gain (starts at 1) and bias (starts at 0).
bngain = torch.ones((1, n_hidden), requires_grad=True)
bnbias = torch.zeros((1, n_hidden), requires_grad=True)

parameters = [charEmbeddings, weightLayer1, weightLayer2, biasLayer1, biasLayer2, bngain, bnbias]

# Training loop with batch normalisation applied to the hidden pre-activations.
# (The body used to be indented under `max_steps=200000`, which raised an IndentationError.)
max_steps=200000
batch_size=32
lossi = []
stepsi = []
noOfColAfterFlattening = block_size*n_embd
for i in range(max_steps):
    # minibatch of random context windows
    noOfWindowsToBeUsedPerLearningStep = batch_size
    randomSlidingWindowsUsed = torch.randint(0, inputContextTrain.shape[0], (noOfWindowsToBeUsedPerLearningStep,))
    inputForLayer1 = charEmbeddings[inputContextTrain[randomSlidingWindowsUsed]]
    inputForLayer1 = inputForLayer1.view(noOfWindowsToBeUsedPerLearningStep, noOfColAfterFlattening)
    hpreActivation = inputForLayer1@weightLayer1+biasLayer1

    # batchNormalization changes: standardise every hidden unit over the minibatch
    # (dimension 0), then apply the trainable gain and bias
    bnmean  = hpreActivation.mean(0,keepdim=True)
    bnstd = hpreActivation.std(0,keepdim=True)
    hpreActivation = bngain*(hpreActivation-bnmean)/bnstd +bnbias
    # Problem: bnmean and bnstd are computed from the current mini-batch of size 32 only, not from the whole dataset.
    # For the validation step we need a global view of the normalisation, i.e. statistics for the full dataset,
    # so we have to keep track of bnmean and bnstd for the whole dataset and not just the current mini-batch.

    h = torch.tanh(hpreActivation)
    logits = h@weightLayer2+biasLayer2
    loss = F.cross_entropy(logits,outputCharacterTrain[randomSlidingWindowsUsed])

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # SGD update with step decay: 0.1 for the first 100,000 steps, 0.01 afterwards
    lr = 0.1 if i<100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}') #logging the loss in every 10,000 learning steps
    lossi.append(loss.item())
    stepsi.append(i)

plt.plot(stepsi,lossi)
print("loss on training data set",loss.item())  # loss of the final minibatch only




In [None]:
#Solution for the above problem:
#Define bnMeanRunning and bnStdRunning: running estimates of the batch-norm statistics that
#are updated at every training step and can be used in place of minibatch statistics later.
# (The loop body used to be indented under `max_steps=200000`, which raised an IndentationError.)
# Note: the model parameters are not re-initialised in this cell.

bnStdRunning = torch.ones((1,n_hidden))  # not optimised by backpropagation, hence no gradient
bnMeanRunning = torch.zeros((1,n_hidden))

parameters = [charEmbeddings, weightLayer1, weightLayer2, biasLayer1, biasLayer2, bngain, bnbias]

max_steps=200000
batch_size=32
lossi = []
stepsi = []
noOfColAfterFlattening = block_size*n_embd
learningDecay = 0.999  # weight kept from the previous running estimate at every step
for i in range(max_steps):
    # minibatch of random context windows
    noOfWindowsToBeUsedPerLearningStep = batch_size
    randomSlidingWindowsUsed = torch.randint(0, inputContextTrain.shape[0], (noOfWindowsToBeUsedPerLearningStep,))
    inputForLayer1 = charEmbeddings[inputContextTrain[randomSlidingWindowsUsed]]
    inputForLayer1 = inputForLayer1.view(noOfWindowsToBeUsedPerLearningStep, noOfColAfterFlattening)
    hpreActivation = inputForLayer1@weightLayer1+biasLayer1

    # batchNormalization changes: standardise every hidden unit over the minibatch
    bnmean  = hpreActivation.mean(0,keepdim=True)
    bnstd = hpreActivation.std(0,keepdim=True)
    hpreActivation = bngain*(hpreActivation-bnmean)/bnstd +bnbias

    # Exponential moving average of the minibatch statistics; done under no_grad so the
    # running buffers never become part of the autograd graph.
    with torch.no_grad():
        bnStdRunning =  learningDecay*bnStdRunning + (1-learningDecay)*bnstd
        bnMeanRunning = learningDecay*bnMeanRunning + (1-learningDecay)*bnmean

    h = torch.tanh(hpreActivation)
    logits = h@weightLayer2+biasLayer2
    loss = F.cross_entropy(logits,outputCharacterTrain[randomSlidingWindowsUsed])

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # SGD update with step decay: 0.1 for the first 100,000 steps, 0.01 afterwards
    lr = 0.1 if i<100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}') #logging the loss in every 10,000 learning steps
    lossi.append(loss.item())
    stepsi.append(i)

plt.plot(stepsi,lossi)
print("loss on training data set",loss.item())  # loss of the final minibatch only


In [None]:
#Validation step in batch normalization

def validate(model_params, inputContextVal, outputCharacterVal):
    """Compute and print the mean cross-entropy loss over a held-out dataset.

    The forward pass mirrors training, except that batch normalisation uses the
    running statistics ``bnMeanRunning`` / ``bnStdRunning`` instead of the
    statistics of the current batch. The model tensors (``charEmbeddings``,
    ``weightLayer1``, ``biasLayer1``, ``weightLayer2``, ``biasLayer2``,
    ``bngain``, ``bnbias``), the running statistics and ``batch_size`` are read
    from notebook-level variables.

    Args:
        model_params: not used by the computation; kept so existing calls keep working.
        inputContextVal: integer tensor of context windows, one row per example.
        outputCharacterVal: target character index for every row of ``inputContextVal``.

    Returns:
        The average loss per example, as a Python float.

    Raises:
        ValueError: if ``inputContextVal`` contains no examples.
    """
    num_samples = inputContextVal.shape[0]
    if num_samples == 0:
        raise ValueError("validation set is empty")

    total_loss = 0

    with torch.no_grad():  # Disable gradients during validation for efficiency
        for start in range(0, num_samples, batch_size):
            # Consecutive slice of the validation data; the last batch may be shorter
            batchContexts = inputContextVal[start:start + batch_size]
            batchTargets = outputCharacterVal[start:start + batch_size]
            batchLength = batchContexts.shape[0]

            # Embed and concatenate the embeddings of each context window
            inputForLayer1 = charEmbeddings[batchContexts].view(batchLength, -1)

            # Compute pre-activation
            hpreActivation = inputForLayer1 @ weightLayer1 + biasLayer1

            # --- Apply batch normalisation using the running mean and running std ---
            # (this line used to reference the undefined names bngainRunning / bnbiasRunning)
            hpreActivation = (hpreActivation - bnMeanRunning) / (bnStdRunning + 1e-5) * bngain + bnbias

            # Apply activation function
            h = torch.tanh(hpreActivation)

            # Compute logits
            logits = h @ weightLayer2 + biasLayer2

            # Compute validation loss (no backprop); weight the batch mean by the batch
            # length so a shorter final batch is averaged correctly
            loss = F.cross_entropy(logits, batchTargets)
            total_loss += loss.item() * batchLength

    # Compute average loss over the validation dataset
    avg_loss = total_loss / num_samples
    print(f"Validation Loss: {avg_loss:.4f}")
    return avg_loss