In [51]:
import torch
import torch.nn
from torch.nn import functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [52]:
import numpy

In [53]:
with open("wizardOFoz.txt",'r', encoding='utf-8') as f:
    text = f.read()
print(len(text))

chars = sorted(set(text))
print(chars,"\n", len(chars))

vocab_size = len(chars)

232310
['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 
 80


# Character Tokenizers
 
1.string_to_int and int_to_string dictionaries:

string_to_int is a dictionary comprehension that iterates over each character ch in the chars variable and assigns an integer index i to each character. It effectively creates a mapping from characters to their corresponding integer indices.
int_to_string is also a dictionary comprehension but does the reverse. It iterates over the same chars variable and assigns the character ch to the integer index i. It creates a mapping from integer indices to characters.
encode function:

2. encode is a lambda function that takes a string s as input.
Inside the lambda function, it uses a list comprehension to iterate over each character c in the input string s.
For each character, it looks up the corresponding integer index from the string_to_int dictionary and appends it to a list.
The result is a list of integers representing the encoded form of the input string.
decode function:

3. decode is another lambda function that takes a list of integers l as input.
Inside the lambda function, it uses a list comprehension to iterate over each integer i in the input list l.
For each integer, it looks up the corresponding character from the int_to_string dictionary and appends it to a list.
Finally, it joins the list of characters into a single string using the join method and returns the decoded string.


In [54]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

encoded_str = encode('hello')
decoded_str = decode(encoded_str)
print(decoded_str)

hello


In [55]:
data = torch.tensor(encode(text), dtype=torch.long) #torch.long is 64 bit int
print(data[0:100])
print(len(data))

tensor([ 1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,
         0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,
         0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36,
        25, 38, 28,  1, 39, 30,  1, 39, 50,  9])
232310


# Train-Test Split
Splitting train-test data into 80:20 

In [56]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

### Using batch training in the size of 4s'

1. `block_size` and `batch_size` are variables that define the size of data blocks and the size of each batch, respectively.

2. `x` and `y` are initialized by taking the first `block_size` elements from the `train_data` tensor and the corresponding elements shifted by 1 position to the right, effectively creating pairs of input sequences and target sequences.

3. The `get_batch` function is defined to generate batches of data. It takes an argument `split` to determine whether to use the `train_data` or `val_data`.

4. Inside the `get_batch` function:
   - `ix` is generated as a random tensor of `batch_size` integers between 0 and `len(data) - block_size`, effectively selecting random starting positions for each batch in the data.
   - The code then constructs input `x` and target `y` sequences by selecting blocks of data based on the random indices `ix`. These blocks are stacked into tensors using list comprehensions and `torch.stack`.
   - The resulting `x` and `y` tensors are moved to the `device` (presumably a GPU) for accelerated computation.

5. Finally, the `x` and `y` tensors for a training batch are obtained by calling `get_batch('train')`, and their shapes are printed.

- `inputs: ...` displays the input tensor `x` and its shape.
- `targets: ...` displays the target tensor `y` and its shape.

The code is designed to facilitate training a neural network in batches, which is a common practice in deep learning to efficiently process large datasets. It ensures that each batch consists of sequences of `block_size` elements, which can be used for tasks like sequence modeling or recurrent neural networks (RNNs).

In [57]:
block_size = 8
batch_size = 4

x = train_data[:block_size]
y = train_data[1:block_size+1]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i : i+block_size] for i in ix])
    y = torch.stack([data[i+1 : i+block_size+1] for i in ix])

    x,y = x.to(device), y.to(device) #pushing data to the GPU, to be processed on GPU

    return x,y
x,y = get_batch('train') 
print('inputs: ' , x,'\n', x.shape)
print('targets: ' , y, '\n' ,y.shape)



tensor([49416, 92599, 93300, 50163])
inputs:  tensor([[71, 58, 72,  1, 60, 71, 58, 76],
        [71, 54, 73, 62, 68, 67, 52,  0],
        [54, 62, 57,  0, 54,  1, 60, 62],
        [58,  1, 72, 73, 58, 66, 72,  1]], device='cuda:0') 
 torch.Size([4, 8])
targets:  tensor([[58, 72,  1, 60, 71, 58, 76,  1],
        [54, 73, 62, 68, 67, 52,  0,  0],
        [62, 57,  0, 54,  1, 60, 62, 71],
        [ 1, 72, 73, 58, 66, 72,  1, 54]], device='cuda:0') 
 torch.Size([4, 8])


Python class called `BigramLanguageModel`, which is a subclass of `torch.nn.Module`, is designed to create a language model capable of generating text.
- In the constructor (`__init__` method), it takes a single argument, `vocab_size`, which represents the size of the vocabulary (the number of unique tokens) in the language.
- Inside the constructor, `super().__init__()` is called to initialize the parent class (`torch.nn.Module`).
- It initializes an `Embedding` layer named `self.token_embedding_table`. This layer is used to map token indices to token embeddings. Both the input and output dimensions of this layer are set to `vocab_size`, which means it creates a square matrix for token embeddings.

-----------------------------------------------------------------------------------------------------------

The `forward` method defines the forward pass of the Bigram Language Model.
- It takes two arguments, `index` and `targets`. 
  - `index` is a tensor representing input token indices in the current context.
  - `targets` is a tensor representing true token indices, but it can be set to `None` during inference.
- Inside the method:
  - It computes `logits` by passing the input `index` through the `self.token_embedding_table`, effectively mapping token indices to token embeddings.
  - If `targets` is `None`, it sets `loss` to `None` (useful for inference).
  - If `targets` are provided, it computes the cross-entropy loss using `F.cross_entropy`. The shape of `logits` is reshaped to `(B*T, C)` where `B` is the batch size, `T` is the sequence length, and `C` is the size of the token embeddings.
- The method returns `logits`, representing model predictions, and `loss`, representing the computed loss (or `None` if `targets` are `None`).

-----------------------------------------------------------------------------------------------------------

The `generate` method is used for generating new tokens or sequences.
- It takes two arguments, `index` and `max_new_tokens`.
  - `index` is a tensor representing input token indices in the current context.
  - `max_new_tokens` is an integer representing the maximum number of new tokens to generate.
- Inside a loop that runs for `max_new_tokens` iterations:
  - It calls the `forward` method to get predictions (`logits`) from the model.
  - It focuses only on the last step of the predictions by selecting the last token in the sequence, effectively making `logits` have shape `(B, C)`, where `B` is the batch size, and `C` is the size of the token embeddings.
  - It applies the softmax function to `logits` to obtain probability distributions over possible next tokens.
  - It samples a token index from these distributions using `torch.multinomial` and appends it to the running sequence.
- The generated sequence is returned as `index`.

In [58]:
class BigarmLanguageModel(torch.nn.Module):
    """
    Initialize a Bigram Language Model.

    Args:
    - vocab_size (int): The size of the vocabulary, i.e., the number of unique tokens in the language.
    Returns:
    - None

    """
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = torch.nn.Embedding(vocab_size, vocab_size)


    def forward(self, index, targets=None):
        """
        Forward pass of the Bigram Language Model.

        Args:
        - index (torch.Tensor): Input tensor representing token indices in the current context.
        - targets (torch.Tensor): Target tensor representing true token indices (for training) or None (for inference).

        Returns:
        - logits (torch.Tensor): Tensor of model predictions (logits) for token indices.
        - loss (torch.Tensor or None): If targets are provided, the computed cross-entropy loss; otherwise, None.
        
        """

        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape   
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss



    def generate(self, index, max_new_tokens):
        """
        Generate new tokens using the Bigram Language Model.

        Args:
        - index (torch.Tensor): Input tensor representing token indices in the current context.
        - max_new_tokens (int): The maximum number of new tokens to generate.

        Returns:
        - generated_tokens (torch.Tensor): Tensor representing the generated sequence of token indices.
        """

        #Index is (B,T) array of indicies in the current context
        for _ in range(max_new_tokens):
            #get the predictions
            logits, loss = self.forward(index)

            #focus only on the last step
            logits = logits[:,-1,:] #becomes (B,C)

            #apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) #(B,C)

            #sample from the distribution
            index_next  = torch.multinomial(probs, num_samples=1) # (B,1)

            #append sampled index to running sequence
            index = torch.cat((index, index_next), dim=1) #(B,T-1)

            return index

model = BigarmLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


&


# Optimizer

In [59]:
max_iters = 10000
learning_rate = 3e-4

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    #sample a batch of data
    xb, xy = get_batch('train')

    #evaluate loss
    logits, loss = model.forward(xb, xy)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())    


tensor([118267, 169456,  21151, 141522])
tensor([173714,  33523,  18705,  52497])
tensor([ 77638,   1379, 170534, 184589])
tensor([ 29614, 150243, 150625,   6517])
tensor([ 89894,  50754, 101565, 183511])
tensor([162415,  29656,  27257, 138284])
tensor([107754,  59922, 110798, 138852])
tensor([182772,  72166,  97190,  87399])
tensor([ 44632, 172049,  51609, 152683])
tensor([ 47474, 172966,  95608,  49644])
tensor([116736,   9443,  85199,  91617])
tensor([168052,   2418,  38505,   7964])
tensor([ 59198,  16317,  37069, 129330])
tensor([176473,  66754, 124066, 154788])
tensor([15107,  4445, 16378, 10091])
tensor([174141, 136927,  79060, 164832])
tensor([154647,  14396,  44829,   6744])
tensor([107286,  52317,  16614, 140716])
tensor([107981,  56091,    186,  94506])
tensor([ 49315,  71296, 169377, 180829])
tensor([  7511,  79050, 103924, 137064])
tensor([ 22499, 109948,  59520,  80209])
tensor([158111, 113854,  41506, 109886])
tensor([ 36154,  35096, 153744, 113106])
tensor([ 97949,    5

In [60]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


f
