In [1]:
import pandas
#imorting os and sys to add the path of different site_package\dir to jupyter notebook
import os
import sys
directory_path = os.path.abspath(os.path.join('F:\LLM-project\cuda\Lib\site-packages'))
if directory_path not in sys.path:
    sys.path.append(directory_path)
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
#block_size is the numbre of blocks/list in the stacks
#batch_size is the number of values int the tensor
batch_size=4
block_size=8
max_iters=1000
#eval_interval=2500
learning_rate=3e-4
eval_iters=250
dropout=0.2

In [3]:
#this checks if gpu is available or not for fast computation
#as cpu performs task in sequential manner which is time consuming for training and testing purposes
#gpu is used to run more than task parrallely
device ='cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
    text=f.read()
chars=sorted(set(text))
vocab_size=len(chars)

In [5]:
#encoding and decoding
#mapping from string to int
# string_to_int is a dictionnary which is mode of key value pair of char and its index in the chars set
string_to_int={ch:i for i,ch in enumerate(chars)}

#mapping from int to string 
# int_to_String is a dictionnary which is mode of key value pair of index and its value in the chars set
int_to_string={i:ch for i,ch in enumerate(chars)}

#encode is a list  function which input a string and output a list off indexs of the characters in the string 
encode=lambda s:[string_to_int[c] for c in s]

#lambda function takes a list of integers l as input and returns a string by decoding each integer in the input list using the int_to_string dictionary.
decode=lambda l:''.join([int_to_string[i] for i in l])


In [6]:
#tensors are  multi-dimensional arrays or generalizations of matrices that can have any number of dimensions. 
#data is list(tensor) which stores the character in encoded form
data = torch.tensor(encode(text), dtype=torch.long)
#print(data[:10])

In [7]:
#we are dividing the data into training and validation dataset 
#n is represnting 80% of the len data
# here we are dividing training:validating in 80:20 ratio
n=int(0.8*len(data))
train_data=data[:n]
val_data=data[n:]

In [8]:

#get_batch is used select a specific data from the dataset 
def get_batch(split):
    
    #data is the the tensor on which we are working on 
    #we initially divided the dataset into 80:20 into train vs test
    data=train_data if split == 'train' else val_data
    
    #ix is list of random integers of length batch_size from starting to data-block_size
    ix=torch.randint(len(data)-block_size,(batch_size,))
    
    #x is the stack of training dataset of length 8
    x=torch.stack([data[i:i+block_size] for i in ix])
    
    #y is the same stack with one offset for predicting values
    y=torch.stack([data[i+1:i+block_size+1] for i in ix])
    
    #x,y shift to gpu(Cuda) if it is available
    x,y=x.to(device), y.to(device)
    
    #print(device)
    #returning pair of stack x and y
    return x,y
x,y = get_batch('train')
print('input')
print(x)
print('targets')
print(y)

input
tensor([[64, 58, 57,  1, 28, 68, 71, 68],
        [62, 65, 58, 67, 56, 58,  1, 72],
        [65, 65,  1, 60, 58, 73,  1, 66],
        [65,  0, 73, 61, 58, 78,  1, 69]])
targets
tensor([[58, 57,  1, 28, 68, 71, 68, 73],
        [65, 58, 67, 56, 58,  1, 72, 74],
        [65,  1, 60, 58, 73,  1, 66, 78],
        [ 0, 73, 61, 58, 78,  1, 69, 71]])


In [9]:
#loss estimation 
# This decorator is used before defining the estimate_loss function. 
#It temporarily disables gradient tracking for all the operations inside the function.
@torch.no_grad()

def estimate_loss():
    #out is initialized as an empty dictionary. It will be used to store the estimated losses for the training and validation datasets.
    out={}
    # sets the model into evaluation mode.
    # In evaluation mode, the model behaves differently from training mode, typically disabling features like dropout and batch normalization.
    model.eval()
    
    for split in ['train','val']: # estimate the loss separately for both the training and validation datasets.
        
        losses =torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)

            # (model) is used to compute predictions (logits) and calculate the loss (loss) between the predictions and the target data.
            logits,loss=model(X,Y)

            #This keeps track of the loss for each iteration.
            losses[k]=loss.item()

        # calculates the mean (average) of the losses obtained during those iterations. 
        out[split]=losses.mean()
    model.train()

    #estimated losses for both the training and validation datasets.
    return out

In [10]:
x=data[:block_size]
y=data[1:block_size+1]
for i in range(block_size):
    context=x[:i+1]
    pred=y[i]
    print("when input is ",context," then target is ",pred)

when input is  tensor([80])  then target is  tensor(0)
when input is  tensor([80,  0])  then target is  tensor(1)
when input is  tensor([80,  0,  1])  then target is  tensor(1)
when input is  tensor([80,  0,  1,  1])  then target is  tensor(28)
when input is  tensor([80,  0,  1,  1, 28])  then target is  tensor(39)
when input is  tensor([80,  0,  1,  1, 28, 39])  then target is  tensor(42)
when input is  tensor([80,  0,  1,  1, 28, 39, 42])  then target is  tensor(39)
when input is  tensor([80,  0,  1,  1, 28, 39, 42, 39])  then target is  tensor(44)


In [11]:
class BigramLanguageModel(nn.Module):
    def __init__ (self,vocab_size) :
        super().__init__()
        self.token_embedding_table=nn.Embedding(vocab_size,vocab_size)
    def forward(self,index,targets=None):
        logits=self.token_embedding_table(index)
        if targets==None:
            loss=None
        else :
            #.shape is used the unpack the items of logits  into B ,T , C
            # B is for batch, T is for time ,C is for number of class
            B, T, C = logits.shape
            
            #.view is used to pack them alternate of .shape
            logits=logits.view(B*T,C)
            targets=targets.view(B*T)
            
            #This function computes the loss between the predicted logits (logits) and the ground truth labels (targets).
            loss=F.cross_entropy(logits,targets)
            
        return logits, loss
    #  purpose of generate func -generate a sequence of tokens or indices given an initial context (index)
    #  and a maximum number of new tokens (max_new_tokens).
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        # Create a new tensor for the generated sequence
        generated_sequence = index
    
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(generated_sequence)
            
            #focus only on the last time step
            logits=logits[:,-1,:]
            
            # focus only on the last time step
            probs = F.softmax(logits, dim=-1)
            
            # sample from the distribution
            num_samples=1
            index_next = torch.multinomial(probs, num_samples)
    
            # append sampled index to the running sequence
            generated_sequence = torch.cat((generated_sequence, index_next), dim=1)
    
        return generated_sequence
# the "Forward" function makes predictions based on input, and the "Generate" function uses those predictions to create new text,
# like continuing a story or generating sentences.

#we are creating an instance of BigramLanguageModel name model
model=BigramLanguageModel(vocab_size)

#m is the alternate verson of model in but running in gpu(if available)
m = model.to(device)

#we initially declaring a context of 1-dim zero as our starting chars
context=torch.ones((1,1),dtype=torch.long,device=device)

#generated_chars is using class for predicting the next words for context and predticting upto 500 words 
generated_chars=decode(m.generate(context,max_new_tokens=50)[0].tolist())
print(generated_chars)
        
        

 KJuk)u4qV&1!b
0t3!hsw'BH1u4!Pq:Th5S8V5t54B "fiGhos


In [12]:
#creating a pytorch optimizer 
# training a neural language model using the AdamW optimizer
optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate)
for iter in range(max_iters):

    if iter % eval_iters==0 :
        losses=estimate_loss()
        print(f"step:{iter},train loss: {losses['train']:.3f},val loss: {losses['val']:.3f}")
    #sample a batch of data 
    xb,yb=get_batch('train')
    
    #evaluate the loss
    logits,loss=model.forward(xb,yb)
    
    #This line clears (zeros out) the gradients of the model's parameters.
    # Gradients accumulate during each backward pass, so this step ensures that the gradients start fresh for the current batch.
    optimizer.zero_grad(set_to_none=True)
    
    # This line computes the gradients of the loss with respect to the model's parameters.
    # These gradients are computed to understand how the loss changes as the parameters are adjusted.
    loss.backward()

    # This line updates the model's parameters based on the computed gradients and the learning rate (lr).
    # It effectively performs a parameter update step to minimize the loss.
    optimizer.step()
    
print(loss.item())

step:0,train loss: 4.911,val loss: 4.882
step:250,train loss: 4.823,val loss: 4.813
step:500,train loss: 4.742,val loss: 4.760
step:750,train loss: 4.691,val loss: 4.700
4.772038459777832


In [13]:
#we initially declaring a context of 1-dim zero as our starting chars
context=torch.ones((1,1),dtype=torch.long,device=device)

#generated_chars is using class for predicting the next words for context and predticting upto 500 words 
generated_chars=decode(m.generate(context,max_new_tokens=50)[0].tolist())
print(generated_chars)

 H3;﻿ZI97_]dB:dChcR5nn9
LLyB4cC!1St[0wg*qcad ZMRfrm


In [None]:
#  Activation functions are like decision-makers for the neurons (tiny processing units) in a neural network.
# transformers are like super-smart readers and writers for computers.
#They help computers understand, create, and work with text, making them great at all sorts of language-related tasks
