In [None]:
#single head of self-attention used in multi-head attention mechanisms within transformer models
class Head(nn.Module):
    """A single causal self-attention head.

    The input is projected into key, query and value spaces of width
    ``head_size``. Scaled dot-product scores between queries and keys are
    masked so a position cannot attend to later positions, normalised with
    softmax, passed through dropout and used to mix the values.

    Reads the module-level ``n_embd``, ``block_size`` and ``dropout``.
    """

    def __init__(self,head_size):
        super().__init__()

        # Bias-free projections from the embedding width to the head width.
        # Creation order (key, query, value) is kept so parameter
        # registration order stays the same.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower-triangular matrix of ones (zeros above the diagonal), stored
        # as a buffer and used as the causal mask in forward().
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('trill', lower_triangle)

        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        """Apply masked self-attention to ``x`` of shape (B, T, C).

        Returns a tensor whose last dimension is ``head_size``.
        """
        # Only the sequence length is needed; the unpack still requires a
        # three-dimensional input.
        _, seq_len, _ = x.shape

        keys = self.key(x)
        queries = self.query(x)
        values = self.value(x)

        # Dot-product affinities scaled by 1/sqrt(head_size).
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale

        # Positions above the diagonal are in the future: set them to -inf
        # so softmax assigns them zero weight.
        is_future = self.trill[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))

        # Normalise to attention probabilities, then regularise.
        weights = self.dropout(F.softmax(scores, dim=-1))

        return weights @ values
        
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side.

    Each of the ``num_heads`` heads produces ``head_size`` features; the
    results are concatenated, projected to ``n_embd`` features and passed
    through dropout. Reads the module-level ``n_embd`` and ``dropout``.
    """

    def __init__(self,num_heads,head_size):
        super().__init__()

        # One independent Head per attention head, registered as submodules.
        attention_heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(attention_heads)

        # Linear layer that mixes the concatenated head outputs back down to
        # the embedding width.
        concat_width = head_size * num_heads
        self.proj = nn.Linear(concat_width, n_embd)

        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        """Run every head on ``x`` and combine their outputs."""
        # Concatenate per-head outputs along the feature (last) dimension.
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)

        projected = self.proj(merged)
        return self.dropout(projected)
        
class FeedFoward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear -> Dropout.

    Args:
        imbd: Width of the input and output features. The hidden layer is
            four times as wide.

    Reads the module-level ``dropout`` for the dropout probability.
    """

    def __init__(self,imbd):
        super().__init__()

        # Layer sizes come from the constructor argument, so the network
        # always matches the width the caller asked for.
        self.net = nn.Sequential(
            # Expand to four times the input width.
            nn.Linear(imbd, 4 * imbd),
            nn.ReLU(),
            # Project back down to the input width.
            nn.Linear(4 * imbd, imbd),
            # Regularisation.
            nn.Dropout(dropout),
        )

    # NOTE: this must be a method of the class (not nested inside __init__),
    # otherwise nn.Module has no forward implementation to call.
    def forward(self,x):
        """Apply the feed-forward stack to ``x``; the output has x's shape."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention then feed-forward.

    Each sub-layer's output is added to its input and the sum is passed
    through a LayerNorm.

    Args:
        n_embd: Embedding width.
        n_head: Number of attention heads; each head gets
            ``n_embd // n_head`` features.
    """

    def __init__ (self,n_embd,n_head) :
        super().__init__()

        # Features handled by each individual attention head.
        per_head_width = n_embd // n_head

        # Sub-layers, created in the same order as before so parameter
        # registration is unchanged.
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ffwd = FeedFoward(n_embd)

        # One LayerNorm after each residual sum.
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self,x):
        """Return the block output for ``x``."""
        # Attention sub-layer: residual add, then normalise.
        attended = self.ln1(x + self.sa(x))
        # Feed-forward sub-layer: residual add, then normalise.
        return self.ln2(attended + self.ffwd(attended))
        
class GPTLanguageModel(nn.Module):
    """Transformer language model over token indices.

    Token and position embeddings are summed, passed through ``n_layer``
    ``Block`` modules and a final LayerNorm, then projected to vocabulary
    logits. Reads the module-level ``n_embd``, ``block_size``, ``n_head``
    and ``n_layer``.

    Args:
        vocab_size: Number of distinct tokens.
    """

    def __init__ (self,vocab_size) :
        super().__init__()

        # Embedding table for tokens.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)

        # Embedding table for positions 0..block_size-1. The attribute name
        # (including its spelling) is kept as-is because it is part of the
        # module's state_dict keys.
        self.postion_embedding_table = nn.Embedding(block_size, n_embd)

        # Stack of transformer blocks applied in sequence.
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])

        # Final layer normalisation before the output projection.
        self.ln_f = nn.LayerNorm(n_embd)

        # Projection from embedding width to vocabulary logits.
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self,module):
        """Initialise Linear/Embedding weights from N(0, 0.02); zero biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self,index,targets=None):
        """Compute logits and, when ``targets`` is given, the loss.

        Args:
            index: (B, T) integer tensor of token indices, with T no larger
                than the position table size.
            targets: Optional (B, T) integer tensor of target indices.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to (B*T, vocab_size) and ``loss`` is the
            cross-entropy against the flattened targets.
        """
        B, T = index.shape

        tok_emb = self.token_embedding_table(index)  # (B, T, n_embd)
        # Build position indices on the same device as the input rather
        # than relying on a global device variable.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.postion_embedding_table(positions)  # (T, n_embd)

        # Element-wise sum; pos_emb broadcasts over the batch dimension.
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # The class dimension must come from the logits (vocab_size), not
        # from the embeddings, before flattening for cross_entropy.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self,index,max_new_tokens):
        """Sample ``max_new_tokens`` tokens, extending ``index``.

        Args:
            index: (B, T) integer tensor holding the starting context.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token indices.
        """
        generated_sequence = index

        # The position table only has this many entries, so the model can
        # never be fed a longer window.
        max_context = self.postion_embedding_table.num_embeddings

        for _ in range(max_new_tokens):
            # Keep only the most recent tokens that fit in the context window.
            window = generated_sequence[:, -max_context:]

            logits, _ = self(window)

            # Only the last time step predicts the next token.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)

            # Sample one token per batch row from the distribution.
            index_next = torch.multinomial(probs, num_samples=1)

            # Append the sampled token to the running sequence.
            generated_sequence = torch.cat((generated_sequence, index_next), dim=1)

        return generated_sequence
# the "Forward" function makes predictions based on input, and the "Generate" function uses those predictions to create new text,
# like continuing a story or generating sentences.

# Build a GPTLanguageModel instance for the given vocabulary size.
model = GPTLanguageModel(vocab_size)

# `m` is the same model after moving it to `device`.
m = model.to(device)

# Starting context: a single token with index 1, shape (1, 1).
context = torch.ones(1, 1, dtype=torch.long, device=device)

# Sample 50 new tokens after the starting context, take the first (only)
# sequence in the batch, and decode its indices into text.
generated_ids = m.generate(context, max_new_tokens=50)[0].tolist()
generated_chars = decode(generated_ids)
print(generated_chars)
        
        