In [1]:
## https://pytorch.org/docs/stable/nn.html

'''
A sequential container.

Modules will be added to it in the order they are passed in the constructor. 
Alternatively, an OrderedDict of modules can be passed in. The forward() method of Sequential accepts any input and forwards it to the first module 
it contains. It then “chains” outputs to inputs sequentially for each subsequent module, finally returning the output of the last module.

The value a Sequential provides over manually calling a sequence of modules is that it allows treating the whole container 
as a single module, such that performing a transformation on the Sequential applies to each of the modules it stores 
(which are each a registered submodule of the Sequential).

What’s the difference between a Sequential and a torch.nn.ModuleList? 
A ModuleList is exactly what it sounds like–a list for storing Module s! On the other hand, 
the layers in a Sequential are connected in a cascading way.
'''

## nn.module contains any learnable parameters

import torch
import torch.nn as nn

sample = torch.tensor([10. ,10. ,10.])
linear = nn.Linear(3, 3, bias=False)

print(linear)
print(linear(sample))

Linear(in_features=3, out_features=3, bias=False)
tensor([-10.0989,   1.0960,   3.0826], grad_fn=<SqueezeBackward4>)


In [2]:
import torch.nn.functional as F

# Create tensor
tensor1 = torch.tensor([1.0, 2.0, 3.0])

# Apply softmax using torch.ff.functional.softmax()
softmax_output = F.softmax(tensor1, dim=0)

print(softmax_output)

# https://en.wikipedia.org/wiki/Softmax_function

tensor([0.0900, 0.2447, 0.6652])


In [3]:
# Embedding study

vocab_size = 80
embedding_dim = 6
embedding = nn.Embedding(vocab_size, embedding_dim)

# Create some input indices
input_indices = torch.LongTensor([1,5,3,2])

# Apply the embedding layer
embededd_output = embedding(input_indices)

print(embededd_output.shape) # 4x100 (# of inputs x dimensionality of embedded vectors)
print(embededd_output)

torch.Size([4, 6])
tensor([[ 1.3895,  1.9767, -0.6308,  0.9194,  0.0164,  0.6582],
        [ 0.3343,  0.1797, -0.2265, -0.2554, -0.5089,  0.7862],
        [ 0.3576,  1.1347, -0.5739, -0.1221, -1.6891,  2.4017],
        [ 0.7716, -1.9063,  0.8545, -0.8279, -0.5899,  0.7179]],
       grad_fn=<EmbeddingBackward0>)


In [4]:
int_64 = torch.randint(1, (3,2)).float()
float_32 = torch.rand(2, 3)
print(float_32)

tensor([[0.6349, 0.7047, 0.0644],
        [0.1151, 0.5335, 0.0674]])


In [5]:
import torch

device = "mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built() else device
print(device)

block_size = 8
batch_size = 4

mps


In [6]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [7]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)

print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [8]:
n = int(0.8*(len(data)))

train_data = data[:n]
test_data  = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix   = torch.randint(len(data) - block_size, (batch_size,))
    # print(ix)
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x, y


x, y = get_batch('train')
print('input: ')
#print(x.shape)

print(x)
print('target: ')
print(y)

input: 
tensor([[54, 67, 72,  1, 68, 59,  1, 76],
        [60, 54, 62, 73, 10, 10, 54, 67],
        [73, 61, 78,  9,  1, 72, 73, 68],
        [73, 68,  1, 73, 61, 58,  1, 61]], device='mps:0')
target: 
tensor([[67, 72,  1, 68, 59,  1, 76, 61],
        [54, 62, 73, 10, 10, 54, 67, 57],
        [61, 78,  9,  1, 72, 73, 68, 74],
        [68,  1, 73, 61, 58,  1, 61, 54]], device='mps:0')


In [9]:

x = train_data[:block_size]
y = train_data[1: block_size+1]

# print(x,y)
for t in range(block_size):
    context = x[:t+1]
    # print(context)
    target  = y[t]
    # print(target)
    print('When input is: ', context, ' output becomes: ', target) 


When input is:  tensor([80])  output becomes:  tensor(1)
When input is:  tensor([80,  1])  output becomes:  tensor(1)
When input is:  tensor([80,  1,  1])  output becomes:  tensor(28)
When input is:  tensor([80,  1,  1, 28])  output becomes:  tensor(39)
When input is:  tensor([80,  1,  1, 28, 39])  output becomes:  tensor(42)
When input is:  tensor([80,  1,  1, 28, 39, 42])  output becomes:  tensor(39)
When input is:  tensor([80,  1,  1, 28, 39, 42, 39])  output becomes:  tensor(44)
When input is:  tensor([80,  1,  1, 28, 39, 42, 39, 44])  output becomes:  tensor(32)


In [10]:
'''
    # Why is it important to write forward pass fx in pyTorch from scratch? 


Writing the forward pass of a neural network from scratch in PyTorch 
(or any deep learning framework) is important for several reasons. Here's why it's valuable:

### 1. **Understanding How Neural Networks Work**
   - **Conceptual Foundation**: Writing the forward pass helps you understand the core operations behind neural networks, 
   such as matrix multiplications, activation functions, and layer compositions. 
   This deeper understanding is essential for debugging, improving, and optimizing models.
   - **Custom Network Design**: By implementing the forward pass yourself, 
   you gain the flexibility to design your own architecture 
   (e.g., custom layers, complex architectures) that fits the specific needs of your project.

### 2. **Flexibility and Customization**
   - **Custom Layers**: PyTorch provides predefined layers (e.g., `nn.Linear`, `nn.Conv2d`), 
   but sometimes you might want to create custom layers with behavior that is not supported by default. 
   Writing the forward pass from scratch lets you design and implement your own layers, 
   such as custom activation functions, normalization layers, or non-traditional architectures.
   - **Advanced Features**: If you need advanced features like attention mechanisms, residual connections, or 
   custom loss functions, writing the forward pass from scratch gives you full control over how data flows through the network.

### 3. **Debugging and Troubleshooting**
   - **Increased Debugging Skills**: When you implement the forward pass yourself, you're often forced to debug problems 
   related to dimensions, types, and gradients. This makes you more proficient in identifying errors in model implementation and training.
   - **Understanding Errors**: PyTorch's error messages related to tensor shapes and operations become easier 
   to understand when you know exactly what the forward pass is doing step by step.

### 4. **Optimizing and Experimenting**
   - **Experimentation**: When you create a network from scratch, you're free to experiment with various architectures. For example, you might want to try different ways of composing layers or combining activations. Writing the forward pass gives you the ability to modify these components at any time and test different variations quickly.
   - **Optimizing Performance**: Customizing the forward pass might allow you to optimize for specific hardware, such as optimizing memory usage on GPUs or reducing the number of operations. Understanding the forward pass is crucial for making such performance optimizations.

### 5. **Better Control Over Autograd**
   - **Gradient Flow**: PyTorch's automatic differentiation (`autograd`) system tracks operations performed on tensors and computes gradients during backpropagation. By implementing your own forward pass, you gain a better understanding of how gradients are propagated through your model, which can help you avoid issues such as vanishing or exploding gradients, or inefficient gradient computation.
   - **Custom Backpropagation**: If you want to define custom gradients for a particular operation, understanding the forward pass is necessary to implement the custom backpropagation logic using `torch.autograd.Function`.

### 6. **Educational Value**
   - **Learning Resource**: Writing a forward pass from scratch is an excellent learning exercise for newcomers. It teaches how neural networks process data at a fundamental level. This is an invaluable experience for anyone learning deep learning.
   - **Understanding Layer Compositions**: Neural networks are built by stacking layers in sequence. Understanding how data moves through each layer helps you conceptualize neural networks and their behavior. Writing the forward pass yourself reinforces this process.

### 7. **Control Over Computational Graph**
   - **Optimization**: When you manually write the forward pass, you can design the computational graph in a way that optimizes resource usage and runtime. This might involve reducing redundant operations or ensuring that computations are done in a more efficient order.
   - **Manipulating Tensors Directly**: You can directly manipulate and create intermediate variables to control the flow of data. This can sometimes lead to more efficient computations, particularly in custom models where PyTorch's built-in layers might not offer the level of control you require.

### When Is It Not Needed?
In most cases, you don't need to write the forward pass from scratch, as PyTorch provides high-level abstractions (like `nn.Module`) and pre-built layers that are highly optimized and sufficient for most standard use cases (e.g., feedforward networks, CNNs, RNNs). However, if your use case involves non-standard neural network architectures or if you're exploring cutting-edge techniques, writing the forward pass by hand might be necessary.

### In Summary:
Writing the forward pass from scratch gives you a deeper understanding of how neural networks function, lets you customize and optimize models, and prepares you to debug and experiment with different architectures. It’s an essential skill for anyone who wants to go beyond basic usage and take full advantage of deep learning frameworks like PyTorch.
    '''

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # it's like look up table

        ## Embedding table
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        ## logits are for probabily, what comes next to what chars 

        if targets is None:
            loss = None
        else:
            ## B for batch, T for time, C for channel vocab size
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss     = F.cross_entropy(logits, targets) ## way to measure loss
        
        return logits, loss  

    def generate(self, index, max_new_tokens):
        # index is (B,T) array of indices in the current array

        for _ in range(max_new_tokens):
            # Get the prediction
            logits, loss = self.forward(index)
            # Forcus only on last last time step
            logits = logits[:, -1, :] # Becomes B,C
            # Apply softmax
            probs = F.softmax(logits, dim=-1) # B,C
            # Sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # Append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index  

In [11]:
# # unpack and repack with view

# a = torch.rand(2,3,5)
# x,y,z = a.shape

# a = a.view(x,y,z)
# print(a.shape)

In [12]:
model = BigramLanguageModel(vocab_size)
m = model.to(device)

In [13]:
context = torch.zeros((1,1), dtype=torch.long, device=device) # torch.long equates int64
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())

print(generated_chars)


( oH&]YY3T*7(PvmD], Zu5SZxyjZykpQcUignT4jRp&Kac3iXYLX4vausUyg29paq0m?!irQdP
!ZCo)w
0l);l;;Hql]hE3Wn_N4xz3a NxZuMlz-[E-PGGe﻿Ea9&Xh!2B!2D)eNs_2"yA7U!1j:BuiX&xGeP;(6*k-wnwPrz_,TpLzdPJG]W9:T6*l;jXNEn_N9-ehgc3!;4,-p"SH7ZMK0thUUi6-J*J"7PIY*JacbWwM-rkpy
ia[vWv_GRgnre﻿jXPE3
Ehg8BQ'B2HKqoj;2qi'U:1QQO﻿JFNbGIt'R1coPmu!'T:X5T3,fRNJZu_BZu5L RMSXFrdQTv*7wYXFVom:q;orA-[mFo9Jd(7qrtFi1FYlZYYKY-U"mpF(9&cyj5uqoP,F&NT"Lzno:xghTjBYeA7e)9H&P4)eCPJ.grGOsUW6"LC'sD6OyD)w-*:-E7mKTvyFVI('6MWs?qgVA8rG5(SR]TfsU]Wq9f4y4,hPbi


In [16]:
## Create pyTorch optimizer

learning_rate = 3e-4
max_iters     = 10000

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters): # Reporting loss over time 
    # Sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item()) ##  Getting arrrays instead loss value

2.488969326019287


In [None]:
'''
Optimizer

### 1. **SGD (Stochastic Gradient Descent)**

- **Description**: A simple and widely-used optimizer where weights are updated based on the gradient of the loss 
with respect to the weights.
- **Key Features**:
  - Fixed learning rate (though it can be adjusted with learning rate schedules).
  - Can be used with momentum to help accelerate convergence.
  
- **Use Case**: Works well when the dataset is large and can be used with simple neural networks.

### 2. **Adam (Adaptive Moment Estimation)**

- **Description**: An adaptive optimizer that computes adaptive learning rates for each parameter by using 
both the first moment (mean) and the second moment (variance) of the gradients.
- **Key Features**:
  - Adaptive learning rates for each parameter.
  - Helps in dealing with sparse gradients (common in LLMs).
  - Combines the benefits of both **Momentum** and **RMSProp**.
  
- **Use Case**: Commonly used in training LLMs and deep learning models due to its robustness with large datasets and sparse gradients.

### 3. **AdamW (Adam with Weight Decay)**

- **Description**: A variant of **Adam**, where the weight decay is decoupled from the gradient update, 
leading to better regularization.
- **Key Features**:
  - Similar to **Adam**, but with better handling of weight decay, leading to better performance in training large models like LLMs.
  
- **Use Case**: Preferred over Adam in many LLMs and other deep learning models for better generalization due to decoupled weight decay.

### 4. **RMSprop (Root Mean Square Propagation)**

- **Description**: An adaptive optimizer that adjusts the learning rate based on the average of recent squared gradients for each parameter.
- **Key Features**:
  - Adaptive learning rates per parameter.
  - Works well for online and non-stationary objectives (e.g., when training on dynamic datasets).
  
- **Use Case**: Often used for recurrent neural networks (RNNs) and other models where the gradients can change drastically.

### 5. **LBFGS (Limited-memory Broyden–Fletcher–Goldfarb–Shanno)**

- **Description**: A second-order optimization method that approximates the inverse Hessian to provide more precise updates, 
but can be more computationally expensive.
- **Key Features**:
  - Can converge more quickly for some problems.
  - Suitable for small-to-medium datasets but computationally expensive on large-scale problems.
  
- **Use Case**: Typically not used for LLMs due to its computational cost but can be effective for 
smaller models or when high accuracy is required in fewer steps.

---

### **Key Differences & Similarities**:

- **SGD vs. Adam**:
  - **SGD** updates weights using a fixed learning rate (though you can add momentum), whereas **Adam** adjusts learning rates for each parameter based on the gradients' first and second moments.
  - **Adam** is generally more robust and faster to converge, especially in complex models like LLMs, while **SGD** may require more tuning and fine adjustments.

- **Adam vs. AdamW**:
  - **AdamW** is a more recent improvement over **Adam**. The primary difference is the decoupling of weight decay from the learning rate, leading to better generalization in LLMs.

- **RMSprop vs. Adam**:
  - Both **RMSprop** and **Adam** are adaptive methods, but **Adam** is more versatile due to its use of both first and second moments, whereas **RMSprop** only uses the second moment.
  - **Adam** is often preferred for larger models and datasets, while **RMSprop** can be more efficient in certain cases, such as with RNNs.

- **LBFGS**:
  - This is a second-order optimizer, unlike all the others which are first-order methods. It uses more memory and computation but can converge more quickly on certain types of problems.

### **Similarities**:
- **SGD**, **Adam**, **AdamW**, and **RMSprop** are first-order methods that adjust the learning rate based on gradients.
- **Adam**, **AdamW**, and **RMSprop** are adaptive optimizers, meaning they adjust the learning rate based on the history of gradients.
- **AdamW** and **Adam** are more suited for large-scale models (like LLMs) than traditional **SGD**.

In general:
- **Adam** and **AdamW** are the most popular for LLMs due to their efficiency with large, sparse gradients and good performance on large datasets.

Read more on torch.optim
'''

In [15]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


GT4:[x29p0[X)JI1gSd257ng2zRf RK*RM'XFPm
f.]YKisof!idlmus?H3bMNd)XThe VD'IO(ps.sDUj;Tl.!;
DxgYF,6ilmpyTvzpagy.dh)0UA7P45oq;v[(;(Ge gjns.Bcas5uY'sithy?8Sd.
Rus,7qit"PS﻿&Bubar. Rm&H_vantis&med
T4s.or s(eCLz5AUu bWIhr s?]3b
!TvIR'X1QhfO&QT"&unyorad,Jdiged.VO:4AL:vd.WyorsoNKThp_Slabiss s.kaFw_NcSS, kr
l!3Fco31THY hen
w:y,hrd"!p!Lzp0quLCp﻿wrM,(GR)-Uu" itsfEN?ma.PFA85Slaputh﻿XWgv"
cl[GO!OJJovttn
aysenkul-J[IT6undxTMsr oue
tp_zw(Q.7&﻿17w IGeclvind_rov(jko2owuc by01Q6"7c
e5HPbbDql;TMKzlmGmu]fS3])gaq(laI]
