In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(device)
block_size = 8
batch_size = 4
max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

mps


In [2]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)


['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [3]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
# print(string_to_int)
int_to_string = { i:ch for i,ch in enumerate(chars) }
# print(int_to_string)
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])
data = torch.tensor(encode(text), dtype=torch.long)
# print(data[:100])

In [4]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)



ix: tensor([ 43822,  85493, 122287,  61371])
inputs:
tensor([[67,  1, 60, 54, 75, 58,  1, 54],
        [71, 58, 11,  3,  0,  0, 32, 58],
        [78,  1, 74, 60, 65, 78,  1, 73],
        [61, 68, 69, 69, 58, 71, 72, 11]], device='mps:0')
targets:
tensor([[ 1, 60, 54, 75, 58,  1, 54,  1],
        [58, 11,  3,  0,  0, 32, 58,  1],
        [ 1, 74, 60, 65, 78,  1, 73, 68],
        [68, 69, 69, 58, 71, 72, 11,  1]], device='mps:0')


```python
ix = torch.randint(len(data) - block_size, (batch_size,))
```

This line generates random starting indices for sampling sequences (blocks) from the data. Let's break it down using the example `"Hello, how are you?"`.

## Understanding the Parameters

1. **`len(data) - block_size`**:
   - `len(data)` is the total number of elements (characters in this case) in the dataset.
     - In the string `"Hello, how are you?"`, the length (`len(data)`) is 19 (including spaces and punctuation).
   - `block_size` defines the number of consecutive elements you want to sample in one sequence (block). For example, let’s assume `block_size = 5`.
   - So, `len(data) - block_size` would be `19 - 5 = 14`. This is the maximum possible starting index for a sequence of 5 characters. If a sequence starts at index 14, it would go from character 14 to 18, which is the last valid block within the data.

2. **`(batch_size,)`**:
   - `batch_size` is the number of sequences (blocks) you want to sample in one batch. For instance, if `batch_size = 2`, you want to generate 2 random starting indices to get 2 blocks of data in this batch.

## Example Walkthrough

Assume:
- **`block_size = 5`**
- **`batch_size = 2`**
- **`data = "Hello, how are you?"`**

When you run:
```python
ix = torch.randint(len(data) - block_size, (batch_size,))
```
`torch.randint(14, (2,))` will generate 2 random integers between 0 and 13 (inclusive). These integers are used as starting indices for sampling blocks of size 5.  
Suppose `ix = [3, 10]`, the starting indices are 3 and 10.

## What Happens Next?

### For the first block starting at index 3:
The sequence will be:
```python
data[3:3+5] = "lo, h"
```
### For the second block starting at index 10:
The sequence will be:
```python
data[10:10+5] = "w are"
```

## Full Example

Let’s see what the code would do in this case:

- **`data = "Hello, how are you?"`**
- **Random indices** (`ix`): `[3, 10]`
- **`block_size = 5`**
- **`batch_size = 2`**

Now, for the input `x` and target `y`:

```python
x = torch.stack([data[i:i+block_size] for i in ix])
y = torch.stack([data[i+1:i+block_size+1] for i in ix])
```


- x (input sequences):
- x[0] = data[3:8] = "lo, h"
- x[1] = data[10:15] = "w are"
- y (target sequences, shifted by 1):
- y[0] = data[4:9] = "o, ho"
- y[1] = data[11:16] = " are "


In [5]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        """
        eval_iters determines how many batches will be used to estimate the loss for each dataset split 
        (e.g., 250 batches). This creates a more robust estimate than using a single batch.
        """
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [6]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # print("Current context: ", index)
            # get the predictions
            logits, loss = self.forward(index)
            # print("Logits", logits)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
# print(generated_chars)



### Overview

The `BigramLanguageModel` class is a neural network model built using PyTorch's `nn.Module`. It is designed to work with a vocabulary of tokens (words or characters) and predicts the next token based on the current context using a bigram (two-token) approach.

### Components of the Class

1.  **Class Definition**:

    python

    Copy code

    `class BigramLanguageModel(nn.Module):`

    This line defines a new class called `BigramLanguageModel`, which inherits from `nn.Module`. This is a base class for all neural network modules in PyTorch.

2.  **`__init__` Method**:

    python

    Copy code

    `def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)`

    -   **Parameters**:
        -   `vocab_size`: The size of the vocabulary, which determines how many unique tokens the model can work with.
    -   **`super().__init__()`**: Calls the initializer of the parent class (`nn.Module`).
    -   **`self.token_embedding_table`**: This creates an embedding layer that maps each token to a dense vector of the same size as the vocabulary. This means that each token will be represented as a one-hot vector, but it can also be viewed as a dense embedding for the bigram context.
3.  **`forward` Method**:

    python

    Copy code

    `def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss`

    -   **Parameters**:
        -   `index`: A tensor of indices representing the input tokens.
        -   `targets`: Optional; a tensor of target indices for calculating loss.
    -   **Process**:
        -   The input indices are passed through the embedding layer to obtain `logits`, which represent the embeddings of the input tokens.
        -   If `targets` is not provided, `loss` is set to `None`.
        -   If `targets` is provided, the logits are reshaped to be suitable for loss calculation:
            -   `B`: Batch size
            -   `T`: Sequence length (number of tokens)
            -   `C`: Vocabulary size (number of classes)
        -   `logits` and `targets` are reshaped to 2D tensors so that they can be used for calculating cross-entropy loss.
        -   `F.cross_entropy(logits, targets)`: This computes the cross-entropy loss between the predicted logits and the actual targets.
    -   **Return Values**: The method returns the `logits` and the computed `loss`.
4.  **`generate` Method**:

    python

    Copy code

    `def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            logits = logits[:, -1, :] # focus on the last time step
            probs = F.softmax(logits, dim=-1) # apply softmax to get probabilities
            index_next = torch.multinomial(probs, num_samples=1) # sample from the distribution
            index = torch.cat((index, index_next), dim=1) # append sampled index to the running sequence
        return index`

    -   **Parameters**:
        -   `index`: A tensor of shape `(B, T)` containing indices of the current context (where `B` is the batch size and `T` is the sequence length).
        -   `max_new_tokens`: The maximum number of new tokens to generate.
    -   **Process**:
        -   The function iteratively generates new tokens for `max_new_tokens` times:
            -   Calls the `forward` method to get the `logits`.
            -   `logits[:, -1, :]`: Focuses on the logits for the last time step, which represents the prediction for the next token.
            -   `F.softmax(logits, dim=-1)`: Applies softmax to convert logits into probabilities.
            -   `torch.multinomial(probs, num_samples=1)`: Samples a new token index from the probability distribution.
            -   `index = torch.cat((index, index_next), dim=1)`: Appends the sampled index to the running sequence.
    -   **Return Value**: Returns the updated `index` tensor, which now includes the newly generated tokens.

### Example Usage

python


`model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)`

1.  **Model Initialization**:

    -   `model = BigramLanguageModel(vocab_size)`: Initializes the model with a specified vocabulary size.
    -   `m = model.to(device)`: Moves the model to the specified device (CPU or GPU).
2.  **Context Initialization**:

    -   `context = torch.zeros((1, 1), dtype=torch.long, device=device)`: Initializes the context tensor to start generating text. It's a tensor with a single zero (indicating the first token in the vocabulary).
3.  **Text Generation**:

    -   `m.generate(context, max_new_tokens=500)`: Calls the `generate` method to produce up to 500 new tokens based on the initial context.
    -   `decode(...)`: This is presumably a function that converts the token indices back into a human-readable string format.
    -   `print(generated_chars)`: Outputs the generated characters.

### Summary

The `BigramLanguageModel` class is a simple yet effective language model that predicts the next token in a sequence based on the current context. It uses embeddings to represent tokens and has methods for both training (calculating loss) and inference (generating new sequences). The model can generate text based on the initial context and produces a sequence of tokens iteratively.

In [7]:
print(generated_chars)


*j*vaf_)1wWuHX*Sg:
N_c_O*4gA[k8,.J)xr"LH;CqV1hC"ua-p(PoY,L,nR!Dm[y&X*;7YJ3m(yk"1&Ky,n8dAf]
h[*4BXe6jH);3"e1-8jC53zNo*-p(JuV8g52Hny72UYJH
GJ&
rki9l9NEYQ[PP!Z16W735kUN0?GTJ9hIM7_Ec?lp]o?R7O1[.!&b?pnU.LC"vZ5!kzuaN_gs6e9l6VnAue?m!'Jsuan)uMxSYLFz-0yi1PT oYLjT7aVsL,6SAkuK&:G-!l4Qx]zl[-&4Pi9Xyq[mZh[F&Tecjz-!
[uX?G5LC"q_XQ,du3mk]'v0)HrDPeo5!f
LOoI,"xxd﻿UhF-N_pW.oi LcQ*6N)Ko"1GkN9:FS_7Vb
rLVsv-I2k)q8'52Hr1rzpvpv(pnPz C23Im[LV1sPXy?P!ahnd[GkP!ZBpW2zhHQwgTa0FM;&hGkHFr"PxwWO;yq0,YQrdRtny!gO9h':0)xVznBH4CG;p


In [12]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print("==============================================================================================")
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data, returns random
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

ix: tensor([112919, 127495,  39705,    947])
ix: tensor([ 70445,  35687, 148142, 151487])
ix: tensor([ 1226, 47396, 28477,  2122])
ix: tensor([109470, 135390,  76123,  94298])
ix: tensor([ 66832,  33845, 134783,  41085])
ix: tensor([131168, 116037,  16036,  64670])
ix: tensor([ 81791, 162145, 120935,   4022])
ix: tensor([ 49332, 131819,  61600,  57049])
ix: tensor([148816,  50554, 184387, 161623])
ix: tensor([ 91589, 170640,  58125, 103709])
ix: tensor([ 91453,  19367, 116104, 127429])
ix: tensor([ 23002, 140960,  56381, 176931])
ix: tensor([76337, 21150, 64999, 29478])
ix: tensor([ 56264, 114535,  10706,  96790])
ix: tensor([158491,  72617, 113539, 185799])
ix: tensor([  7951, 126998,  59474,  59649])
ix: tensor([86798, 43258, 43154, 89213])
ix: tensor([179574, 138749,  37217, 128733])
ix: tensor([136190, 133728,  20582, 128041])
ix: tensor([ 89060,   5376, 160323, 106355])
ix: tensor([ 37754, 180561,  33415,  44556])
ix: tensor([128952, 107597,   2717,  11247])
ix: tensor([167490, 15

***need to familiarize audience with optimizers (AdamW, Adam, SGD, MSE…) no need to jump into the formulas, just what the optimizer does for us and some of the differences/similarities between them***

1. **Mean Squared Error (MSE)**: MSE is a common loss function used in regression problems, where the goal is to predict a continuous output. It measures the average squared difference between the predicted and actual values, and is often used to train neural networks for regression tasks.
2. **Gradient Descent (GD):**  is an optimization algorithm used to minimize the loss function of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. The idea of GD is to iteratively adjust the model parameters in the direction of the steepest descent of the loss function
3. **Momentum**: Momentum is an extension of SGD that adds a "momentum" term to the parameter updates. This term helps smooth out the updates and allows the optimizer to continue moving in the right direction, even if the gradient changes direction or varies in magnitude. Momentum is particularly useful for training deep neural networks.
4. **RMSprop**: RMSprop is an optimization algorithm that uses a moving average of the squared gradient to adapt the learning rate of each parameter. This helps to avoid oscillations in the parameter updates and can improve convergence in some cases.
5. **Adam**: Adam is a popular optimization algorithm that combines the ideas of momentum and RMSprop. It uses a moving average of both the gradient and its squared value to adapt the learning rate of each parameter. Adam is often used as a default optimizer for deep learning models.
6. **AdamW**: AdamW is a modification of the Adam optimizer that adds weight decay to the parameter updates. This helps to regularize the model and can improve generalization performance. We will be using the AdamW optimizer as it best suits the properties of the model we will train in this video.

find more optimizers and details at torch.optim

In [9]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


jfJhn!'N-qc;x24EYV'GdIt!f (knAUzL4k52FUP,r6FTNG?9tbbv-(nsz.mrUY(3LQV8pe.kLSHAkw(nPjMBq)hdV.e2WoqKwrhc.ID!FkZdoKNCZJXILSr]cJ&FC)9 4B.i]'IjC8EQJ-,e8.59i R
0.EZq:rWz[L;Prft8UguP8U[Q!:v-4M?cta "17vpB
2;ym!'!elmrfZSU[u6M
rsMG1G0KREB9j9upv
EcC"0:M3)OW,L&B.pUP:H!hHv_sMMO0L*﻿d9Dyv!vjacuXv]kZbXDW2iY9z.im_nMiqj3afUYQ:4zR7:(pKvq52XX&11Vazh.pVmydLzd0"a(y&BueelHC)bm_9;4,rz[L32Hl:K'[LQ7YGB0﻿Y]K(FMWYD[LOmZRO?hOygKLS5xF!6-q,WMq﻿8YGs
.i9do
GdQhNqcv3G_"0)oP8zDJ
HWO]WM R(p
Soz_)*4kuPc[uCgDP??WbJV4'Di]-V-v)q;a_wVmc
