In [1]:
## https://pytorch.org/docs/stable/nn.html

'''
A sequential container.

Modules will be added to it in the order they are passed in the constructor. 
Alternatively, an OrderedDict of modules can be passed in. The forward() method of Sequential accepts any input and forwards it to the first module 
it contains. It then “chains” outputs to inputs sequentially for each subsequent module, finally returning the output of the last module.

The value a Sequential provides over manually calling a sequence of modules is that it allows treating the whole container 
as a single module, such that performing a transformation on the Sequential applies to each of the modules it stores 
(which are each a registered submodule of the Sequential).

What’s the difference between a Sequential and a torch.nn.ModuleList? 
A ModuleList is exactly what it sounds like–a list for storing Module s! On the other hand, 
the layers in a Sequential are connected in a cascading way.
'''

## nn.module contains any learnable parameters

import torch
import torch.nn as nn

sample = torch.tensor([10. ,10. ,10.])
linear = nn.Linear(3, 3, bias=False)

print(linear)
print(linear(sample))

Linear(in_features=3, out_features=3, bias=False)
tensor([-5.5585,  3.1238, -1.7977], grad_fn=<SqueezeBackward4>)


In [2]:
import torch.nn.functional as F

# Create tensor
tensor1 = torch.tensor([1.0, 2.0, 3.0])

# Apply softmax using torch.ff.functional.softmax()
softmax_output = F.softmax(tensor1, dim=0)

print(softmax_output)

# https://en.wikipedia.org/wiki/Softmax_function

tensor([0.0900, 0.2447, 0.6652])


In [3]:
# Embedding study

vocab_size = 80
embedding_dim = 6
embedding = nn.Embedding(vocab_size, embedding_dim)

# Create some input indices
input_indices = torch.LongTensor([1,5,3,2])

# Apply the embedding layer
embededd_output = embedding(input_indices)

print(embededd_output.shape) # 4x100 (# of inputs x dimensionality of embedded vectors)
print(embededd_output)

torch.Size([4, 6])
tensor([[ 0.6959, -0.4333, -0.2200,  0.2437, -0.8681,  0.3620],
        [-0.0528, -0.8935, -0.4972, -0.4928,  0.5885, -0.7892],
        [ 0.2997, -0.1011,  0.4865,  1.4999,  1.6329,  0.8062],
        [-1.9635, -1.3426, -0.3455, -0.6110, -0.1332,  1.0324]],
       grad_fn=<EmbeddingBackward0>)


In [4]:
int_64 = torch.randint(1, (3,2)).float()
float_32 = torch.rand(2, 3)
print(float_32)

tensor([[0.4876, 0.1856, 0.3812],
        [0.6112, 0.3390, 0.7011]])


In [23]:
import torch

device = "mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built() else device
print(device)

block_size = 8
batch_size = 4

mps


In [24]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [25]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)

print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [34]:
n = int(0.8*(len(data)))

train_data = data[:n]
test_data  = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix   = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x, y


x, y = get_batch('train')
print('input: ')
#print(x.shape)

print(x)
print('target: ')
print(y)

tensor([ 45163, 160289,   4171, 153915])
input: 
tensor([[ 1, 58, 54, 71, 72,  1, 76, 61],
        [74, 73,  1, 33,  5, 66,  1, 67],
        [ 1,  1,  1,  1,  1,  1,  1,  1],
        [68,  1, 54, 72, 64,  1, 74, 72]], device='mps:0')
target: 
tensor([[58, 54, 71, 72,  1, 76, 61, 58],
        [73,  1, 33,  5, 66,  1, 67, 68],
        [ 1,  1,  1,  1,  1,  1,  1,  1],
        [ 1, 54, 72, 64,  1, 74, 72,  9]], device='mps:0')


In [38]:

x = train_data[:block_size]
y = train_data[1: block_size+1]

# print(x,y)
for t in range(block_size):
    context = x[:t+1]
    # print(context)
    target  = y[t]
    # print(target)
    print('When input is: ', context, ' output becomes: ', target) 


When input is:  tensor([80])  output becomes:  tensor(1)
When input is:  tensor([80,  1])  output becomes:  tensor(1)
When input is:  tensor([80,  1,  1])  output becomes:  tensor(28)
When input is:  tensor([80,  1,  1, 28])  output becomes:  tensor(39)
When input is:  tensor([80,  1,  1, 28, 39])  output becomes:  tensor(42)
When input is:  tensor([80,  1,  1, 28, 39, 42])  output becomes:  tensor(39)
When input is:  tensor([80,  1,  1, 28, 39, 42, 39])  output becomes:  tensor(44)
When input is:  tensor([80,  1,  1, 28, 39, 42, 39, 44])  output becomes:  tensor(32)


In [41]:
class BigramLanguageModel(nn.Mo dule):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward
    # Why is it important to write forward pass fx in pyTorch from scratch? 

    '''
Writing the forward pass of a neural network from scratch in PyTorch (or any deep learning framework) is important for several reasons. Here's why it's valuable:

### 1. **Understanding How Neural Networks Work**
   - **Conceptual Foundation**: Writing the forward pass helps you understand the core operations behind neural networks, such as matrix multiplications, activation functions, and layer compositions. This deeper understanding is essential for debugging, improving, and optimizing models.
   - **Custom Network Design**: By implementing the forward pass yourself, you gain the flexibility to design your own architecture (e.g., custom layers, complex architectures) that fits the specific needs of your project.

### 2. **Flexibility and Customization**
   - **Custom Layers**: PyTorch provides predefined layers (e.g., `nn.Linear`, `nn.Conv2d`), but sometimes you might want to create custom layers with behavior that is not supported by default. Writing the forward pass from scratch lets you design and implement your own layers, such as custom activation functions, normalization layers, or non-traditional architectures.
   - **Advanced Features**: If you need advanced features like attention mechanisms, residual connections, or custom loss functions, writing the forward pass from scratch gives you full control over how data flows through the network.

### 3. **Debugging and Troubleshooting**
   - **Increased Debugging Skills**: When you implement the forward pass yourself, you're often forced to debug problems related to dimensions, types, and gradients. This makes you more proficient in identifying errors in model implementation and training.
   - **Understanding Errors**: PyTorch's error messages related to tensor shapes and operations become easier to understand when you know exactly what the forward pass is doing step by step.

### 4. **Optimizing and Experimenting**
   - **Experimentation**: When you create a network from scratch, you're free to experiment with various architectures. For example, you might want to try different ways of composing layers or combining activations. Writing the forward pass gives you the ability to modify these components at any time and test different variations quickly.
   - **Optimizing Performance**: Customizing the forward pass might allow you to optimize for specific hardware, such as optimizing memory usage on GPUs or reducing the number of operations. Understanding the forward pass is crucial for making such performance optimizations.

### 5. **Better Control Over Autograd**
   - **Gradient Flow**: PyTorch's automatic differentiation (`autograd`) system tracks operations performed on tensors and computes gradients during backpropagation. By implementing your own forward pass, you gain a better understanding of how gradients are propagated through your model, which can help you avoid issues such as vanishing or exploding gradients, or inefficient gradient computation.
   - **Custom Backpropagation**: If you want to define custom gradients for a particular operation, understanding the forward pass is necessary to implement the custom backpropagation logic using `torch.autograd.Function`.

### 6. **Educational Value**
   - **Learning Resource**: Writing a forward pass from scratch is an excellent learning exercise for newcomers. It teaches how neural networks process data at a fundamental level. This is an invaluable experience for anyone learning deep learning.
   - **Understanding Layer Compositions**: Neural networks are built by stacking layers in sequence. Understanding how data moves through each layer helps you conceptualize neural networks and their behavior. Writing the forward pass yourself reinforces this process.

### 7. **Control Over Computational Graph**
   - **Optimization**: When you manually write the forward pass, you can design the computational graph in a way that optimizes resource usage and runtime. This might involve reducing redundant operations or ensuring that computations are done in a more efficient order.
   - **Manipulating Tensors Directly**: You can directly manipulate and create intermediate variables to control the flow of data. This can sometimes lead to more efficient computations, particularly in custom models where PyTorch's built-in layers might not offer the level of control you require.

### When Is It Not Needed?
In most cases, you don't need to write the forward pass from scratch, as PyTorch provides high-level abstractions (like `nn.Module`) and pre-built layers that are highly optimized and sufficient for most standard use cases (e.g., feedforward networks, CNNs, RNNs). However, if your use case involves non-standard neural network architectures or if you're exploring cutting-edge techniques, writing the forward pass by hand might be necessary.

### In Summary:
Writing the forward pass from scratch gives you a deeper understanding of how neural networks function, lets you customize and optimize models, and prepares you to debug and experiment with different architectures. It’s an essential skill for anyone who wants to go beyond basic usage and take full advantage of deep learning frameworks like PyTorch.
    '''

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2575663856.py, line 1)