In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [None]:
# OBJECTIVE: PREDICT THE NEXT CHARACTER USING CONTEXT FROM SEVERAL PREVIOUS CHARACTERS (e.g. 3) INSTEAD OF JUST 1

In [4]:
# 1. Read every line of names.txt into a list of strings (one entry per line).
# The context manager closes the file handle as soon as the read completes,
# instead of leaving it open until the object happens to be garbage-collected.
with open('names.txt', 'r') as namesFile:
    words = namesFile.read().splitlines()
words[:8]  # preview the first few entries

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [8]:
# Vocabulary: every distinct character that occurs in the words, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> integer index. The sorted characters take 1..len(chars);
# index 0 is reserved for '.', the special word-boundary marker.
charIntIndexMapping = dict(zip(chars, range(1, len(chars) + 1)))
charIntIndexMapping['.'] = 0

# Reverse lookup: integer index -> character.
indexToCharMapping = {index: char for char, index in charIntIndexMapping.items()}

In [22]:
# 2. BUILD THE TRAINING DATASET
#
# Every word is turned into (context -> next character) examples by sliding a
# window of `block_size` character indices across it.
#   - inputContext: one row per example, holding the `block_size` indices that
#     precede the character to predict.
#   - outputCharcterForThatContext: the label for each row, i.e. the index of
#     the character that actually follows that context. Together they form a
#     labelled dataset for training.
#
# Index 0 stands for '.', so every word starts from an all-'.' context and ends
# with '.' as its last target. Unlike the bigram model, no '.' has to be
# prepended to the word: the zero padding on the left already plays that role.
#
# Example with block_size = 3 for "emma" (processed as "emma."):
#   row  context indices  context  target
#   0    0, 0, 0          ...      e
#   1    0, 0, 5          ..e      m
#   2    0, 5, 13         .em      m
#   3    5, 13, 13        emm      a
#   4    13, 13, 1        mma      .
#   ("ma." never appears as a context: nothing follows the final '.')
#
# A different block_size only changes the window width, e.g. block_size = 5
# gives the contexts ....., ....e, ...em, ..emm, .emma for "emma".

block_size = 3  # context length: how many previous characters feed each prediction
inputContext, outputCharcterForThatContext = [], []

for w in words:
    # block_size zeros ('.') of left padding, then the word plus its closing '.'.
    paddedIndices = [0] * block_size + [charIntIndexMapping[ch] for ch in w + '.']
    # One example per character of w + '.': the window just before it, and the
    # character itself as the label.
    for start in range(len(w) + 1):
        inputContext.append(paddedIndices[start:start + block_size])
        outputCharcterForThatContext.append(paddedIndices[start + block_size])

# Convert the nested Python lists into tensors.
inputContext = torch.tensor(inputContext)
outputCharcterForThatContext = torch.tensor(outputCharcterForThatContext)
print(inputContext.shape)  # (number of examples, block_size)
print(outputCharcterForThatContext.shape)  # (number of examples,)



torch.Size([228146, 3])
torch.Size([228146])


In [30]:
# 3. CHARACTER EMBEDDINGS
# a. Each character needs a numeric vector so that it can take part in the
#    matrix multiplications of the model.
# b. The bigram model used one-hot encoding: every character was a 27-long vector.
# c. Here every character is mapped to a 2-dimensional embedding instead, so the
#    27 characters give a 27x2 matrix that is used as a lookup table.

# A dedicated, seeded generator keeps the initial embeddings reproducible across
# kernel restarts (an unseeded torch.randn gives different values on every run).
embeddingGenerator = torch.Generator().manual_seed(42)
charEmbeddings = torch.randn((27, 2), generator=embeddingGenerator)

# Row 5 is the 2-dimensional embedding of the character with index 5, i.e. 'e'.
print(charEmbeddings[5])

# d. Looking up a row is equivalent to multiplying the character's one-hot
#    vector (1x27) by the 27x2 matrix. In that sense charEmbeddings acts as the
#    weight matrix of a first, purely linear layer of the network.
embedding = F.one_hot(torch.tensor(5), num_classes=27).float() @ charEmbeddings
print(embedding)
assert torch.allclose(embedding, charEmbeddings[5]), "lookup and one-hot matmul should agree"

# e. So the lookup is used directly instead of one-hot encoding + matmul.
#    Indexing with the whole inputContext tensor fetches the embedding of every
#    character of every context window at once; only the shape is displayed so
#    the cell does not dump the full tensor.
charEmbeddings[inputContext].shape

tensor([-0.0274, -0.0186])


tensor([-0.0274, -0.0186])

In [56]:
# 4. PREPARE THE INPUT FOR LAYER 1
# Look up the embedding of every character in every context window.
inputForLayer1 = charEmbeddings[inputContext]
# 3-D shape (p, q, r):
#   p -> number of context windows across all words,
#   q -> size of a context window (block_size),
#   r -> embedding dimension of a single character (2, see step 3).
print(inputForLayer1.shape)

# Flatten 3-D -> 2-D so that each row holds the concatenated embeddings of one
# context window. The row width is derived from the tensor itself (q * r)
# rather than hard-coded, so this cell keeps working if block_size or the
# embedding dimension is changed.
numExamples, contextLength, embeddingDim = inputForLayer1.shape
inputForLayer1 = inputForLayer1.view(numExamples, contextLength * embeddingDim)
print(inputForLayer1.shape)

torch.Size([228146, 3, 2])
torch.Size([228146, 6])


In [32]:
# 5. PARAMETERS OF THE TWO-LAYER NETWORK
# A single seeded generator feeds every draw below, in this exact order, so the
# initial values are reproducible.
g = torch.Generator()
g.manual_seed(2147483647)

# Layer 1: 100 neurons. Each neuron has 6 weights (one per input feature of a
# flattened context window) and one bias.
weightLayer1 = torch.randn(6, 100, generator=g)
biasLayer1 = torch.randn((100,), generator=g)

# Layer 2: 27 neurons. Each neuron has 100 weights (one per output of layer 1)
# and one bias.
weightLayer2 = torch.randn(100, 27, generator=g)
biasLayer2 = torch.randn((27,), generator=g)

# All tensors that make up the model, embedding table included.
parameters = [
    charEmbeddings,
    weightLayer1,
    biasLayer1,
    weightLayer2,
    biasLayer2,
]
