# Intro to PyTorch

The method for making a PyBuda Model that we're going to work with is wrapping a PyTorch model using the pybuda.PyTorchModule() function, as it is the fastest way to get a custom model up and running. 

```python
torch_model = Some_PyTorchModule_Here()
buda_model = pybuda.PyTorchModule("direct_pt", torch_model)
output = buda_model.run(input1, input2)
```

This means that we're going to spend most of our time learning PyTorch, and only at the very end are we going to put the model on Tenstorrent hardware.

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

In [None]:
# Two 1-D float tensors with three entries each, used as operands in the following cells
tensor_1 = torch.tensor([1.0, 2.0, 3.0])
tensor_2 = torch.tensor([4.0, 5.0, 6.0])

In [None]:
# Dot product of the two vectors: multiply entry by entry, then add everything up
output = tensor_1.dot(tensor_2)
print(output)

In [None]:
# The 2x2 identity matrix built two ways: written out by hand, and with torch.eye
i_matrix_manual = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
i_matrix_command = torch.eye(n=2)
# NOTE: despite the name, this has 2 rows and 1 column (a column vector)
input_2_row_vec = torch.tensor([[7.0], [8.0]])

In [None]:
# Multiplying by the identity matrix leaves the vector unchanged, so both lines print the same values
# (the @ operator is matrix multiplication, the same operation as torch.matmul)
print(i_matrix_manual @ input_2_row_vec)
print(i_matrix_command @ input_2_row_vec)

In [None]:
def useful_tensor_functions(input_tensor):
    """Print the results of a few common shape-manipulation calls on a tensor.

    Args:
        input_tensor: the torch.Tensor to inspect. Its values are not changed;
            each call below produces a new tensor object.

    Returns:
        None. Everything is printed to stdout.
    """
    # .shape gives the size of every dimension of the tensor
    shape = input_tensor.shape
    print('Tensor shape: ', shape)
    # .view() re-interprets the same data with a different set of dimensions
    # if n > 0, that dimension gets exactly n entries
    # if n = -1, pytorch works out the size of that dimension from whatever elements are left over
    view = input_tensor.view(1,1,1,1,1,1,1,1,1,-1)
    print('view shape: ', view.shape)
    print('view: ', view)
    # .unsqueeze(dim) inserts a new dimension of size 1 at position dim (-1 means "at the end")
    unsqueezed = input_tensor.unsqueeze(dim=-1)
    print('Unsqueezed tensor: ', unsqueezed)
    # .squeeze(dim) removes dimension dim, but only if it has size 1; otherwise nothing changes
    squeezed = input_tensor.squeeze(dim=-1)
    print('Squeezed tensor: ', squeezed)

In [None]:
# Try the helper out on one of the vectors defined earlier
useful_tensor_functions(tensor_1)

### Defining a model

In [None]:
# While it is possible to build your own model from raw pytorch functions, it is much easier to subclass torch.nn.Module
class ExampleMNISTModel(torch.nn.Module):
    """Fully connected classifier: 784 inputs -> 100 -> 50 -> 10 outputs.

    Every layer is stored as its own attribute and applied step by step in
    forward(). Set ``debug = True`` on an instance to print the tensor shape
    after each step; the returned value is the same either way.
    """

    def __init__(self) -> None:
        # There are two ways to build the feed forward part of your model; this definition keeps
        # every layer as its own attribute and calls them one after another in forward()
        super().__init__()
        # when True, forward() prints the shape of the tensor after every step
        self.debug = False
        # All of the hidden layers in the network
        # nn.Linear(784, 100) essentially makes a 100x784 matrix under the hood, initialized with random values. This is a lot more useful than making the matrix yourself, as pytorch will handle backpropagation for you
        self.l1 = nn.Linear(784, 100)
        # Normalization is useful for multiple reasons:
        #  - improves accuracy
        #  - reduces the impact of outliers in the dataset
        self.bn_1 = nn.BatchNorm1d(100)
        self.l2 = nn.Linear(100, 50)
        self.bn_2 = nn.BatchNorm1d(50)
        self.l3 = nn.Linear(50, 10)
        # rescales the 10 outputs of every sample so that they sum to 1 (dim=1 is the output dimension)
        self.softmax = nn.Softmax(dim=1)
        # Non linear activation function
        self.relu = nn.ReLU()

    def _debug_shape(self, label: str, tensor: torch.Tensor) -> None:
        """Print ``label`` and the shape of ``tensor``, but only when self.debug is set."""
        if self.debug:
            print(label, tensor.shape)

    def forward(self, input: torch.Tensor, batch_size: int):
        """Run one batch through the network.

        Args:
            input: batch of samples; it is flattened to (batch_size, -1), so every
                sample must contain 784 values in total.
            batch_size: number of samples in ``input``.

        Returns:
            Tensor of shape (batch_size, 10) holding the softmax output, i.e.
            every row sums to 1.
        """
        # Written out one step at a time: easier to understand for people coming from non-ML backgrounds
        # flatten every sample into a single row of features
        x = input.view(batch_size, -1)
        self._debug_shape('pre l1: ', x)
        x = self.l1(x)
        self._debug_shape('l1: ', x)
        x = self.bn_1(x)
        self._debug_shape('bn1: ', x)
        x = self.relu(x)
        self._debug_shape('relu: ', x)
        x = self.l2(x)
        self._debug_shape('l2: ', x)
        x = self.bn_2(x)
        self._debug_shape('bn2: ', x)
        x = self.relu(x)
        self._debug_shape('relu: ', x)
        x = self.l3(x)
        self._debug_shape('l3: ', x)
        x = self.relu(x)
        self._debug_shape('relu: ', x)
        # NOTE(review): torch.nn.CrossEntropyLoss applies log-softmax internally, so when this
        # model is trained with it the softmax below is applied on top of that. It is kept so
        # the model's output does not change -- confirm whether returning the raw l3 output
        # would be preferable.
        x = self.softmax(x)
        self._debug_shape('softmax: ', x)
        return x

In [None]:
# The less intuitive, more succinct method: the layers are listed once inside nn.Sequential and applied in that order.
# Note that this is not a layer-for-layer copy of ExampleMNISTModel: here ReLU comes before BatchNorm1d,
# and the output of the last Linear layer is returned as is (no final ReLU or softmax).
class ExampleMNISTModel_2(torch.nn.Module):
    """Fully connected 784 -> 100 -> 50 -> 10 network built from a single nn.Sequential."""

    def __init__(self) -> None:
        super().__init__()
        self.main = torch.nn.Sequential(
            nn.Linear(784, 100),
            nn.ReLU(),
            nn.BatchNorm1d(100),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.BatchNorm1d(50),
            nn.Linear(50, 10),
        )

    def forward(self, input: torch.Tensor, batch_size: int):
        """Run one batch through the network.

        Args:
            input: batch of samples; it is flattened to (batch_size, -1), so every
                sample must contain 784 values in total.
            batch_size: number of samples in ``input``.

        Returns:
            Tensor of shape (batch_size, 10): the raw output of the last Linear layer.
        """
        # Flatten to (batch_size, 784). The features have to sit in dimension 1, because
        # BatchNorm1d(100) checks that dimension against its 100 features; an extra
        # singleton dimension, i.e. (batch_size, 1, 784), makes the first BatchNorm1d fail.
        input = input.view(batch_size, -1)
        return self.main(input)

This is where we define an instance of the model that we built above, a loss function that tells our model how wrong it was, and an optimizer that updates the model's weights using the gradients that backpropagation computes from that loss.

In [None]:
# size of each optimizer step
learning_rate = 1e-4
# the network being trained, and the function that scores how wrong its predictions are
model = ExampleMNISTModel()
loss_func = torch.nn.CrossEntropyLoss()
# Adam updates every parameter of the model, with a small weight decay
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-6,
)

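Running on a GPU can speed up training of your model by A LOT (assuming that you have a CUDA compatible card). When I was training GANs, it wasn't uncommon to see an 8x speedup on the training time. Note that the line below only picks which device to use: to actually train on it, the model and every batch of data also have to be moved there with `.to(device)`.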

In [None]:
# Pick the GPU when pytorch can see a CUDA device, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Datasets

Here we're going to be defining and playing around with all of the data that we're going to be using.

In [None]:
# number of samples handed to the model at once
train_batch_size = 64
# the training split of MNIST, stored under ./data (download=True fetches it if needed)
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
# the loader groups the dataset into shuffled batches
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

# pull a single batch out of the loader to look at its shapes
train_iter = iter(train_loader)
data, target = next(train_iter)

print(data.shape)
print(target.shape)

In [None]:
import functools
def get_last(iterable):
    """Return the final item produced by ``iterable``, consuming it completely.

    Args:
        iterable: any iterable or iterator with at least one item.

    Returns:
        The last item yielded.

    Raises:
        TypeError: if ``iterable`` is empty (functools.reduce has no initial
            value to fall back on).
    """
    # The reducer throws away the running value and keeps the newest item,
    # so the result of the fold is whatever was yielded last.
    return functools.reduce(lambda _, x: x, iterable)

# train_iter has already handed out its first batch above; run through the rest and keep the final one
data, target = get_last(iterable=train_iter)

# the final batch holds whatever is left over once all of the full batches have been handed out
print('Because we chose a batch size that the dataset isnt a common factor of, we get the following shapes for the last batch of data: ')
print('data.shape: ', data.shape)
print('target.shape: ', target.shape)

These are the loaders that we will be using for the dataset

In [None]:
# Training data: shuffled, in batches of 64
train_batch_size = 64
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

# Test data: kept in its original order, in batches of 1000
test_batch_size = 1000
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)


### Helper functions for visualizing training

I've defined a couple of helper functions in [utils/graph_viz.py](./utils/graph_viz.py). If you would like to look into them further, you're more than welcome to. However, it is not necessary for learning how to build models.

In [None]:
from utils.graph_viz import graph_loss, make_confusion_matrix_given_model
import time

## The most basic training loop

In [None]:
# constants
# number of full passes over the training data made by the training loop below
epochs = 10

In [None]:
# set the model into training mode
model.train()
# average loss per sample, one entry per epoch
training_loss = []

# the size of the dataset does not change between epochs, so look it up once
num_samples = len(train_loader.dataset)

# purely for timing
start_time = time.perf_counter()

# loop through all of the epochs
for epoch in range(1, epochs + 1):
    print('Epoch #',epoch)

    epoch_loss = 0.0
    for data, target in train_loader:
        # sometimes our batch size doesn't exactly match up to the size of the dataset, so we can either skip the last batch or make the last batch smaller
        # here the real size of every batch is passed into the model, so a smaller last batch still works
        batch_size = data.size(0)

        # resets all of the gradients of the weights inside of the model
        # gradients are added up across backward() calls, so without this every step would
        # be taken using the sum of the gradients of all the batches seen so far
        optimizer.zero_grad()

        # this is where all of the magic happens
        output = model(data, batch_size)

        # the difference between what the predicted output was and the actual value
        loss = loss_func(output, target)

        # for seeing whats going on inside of the model
        # the loss function returns the AVERAGE loss over the batch (its default reduction), so
        # weight it by the batch size to get the total for this batch; dividing the sum of the
        # batch averages by the number of samples would under-report the loss
        epoch_loss += loss.item() * batch_size

        # back propagation
        loss.backward()

        # take one step, in the size of the learning rate, towards a lower loss
        optimizer.step()

    # total loss over every sample / number of samples = average loss per sample
    epoch_loss /= num_samples
    print('Training Loss: ', epoch_loss)
    training_loss.append(epoch_loss)
end_time = time.perf_counter()

total_time = end_time - start_time
print(f'Training Took {total_time:.1f} seconds')

# set the model into eval mode: layers that behave differently while training (such as
# BatchNorm, which otherwise normalizes with the statistics of the current batch) switch
# to their inference behaviour, so that evaluating the model gives stable results
model.eval()

In [None]:
# helper imported from utils/graph_viz.py; presumably draws a confusion matrix of the model's predictions -- see that file for the details
make_confusion_matrix_given_model(model)

In [None]:
# the name "Training Loss" contains a space, so it cannot be written as a normal keyword argument; unpacking a dict passes it as one anyway
graph_loss(**{"Training Loss": training_loss})

In [None]:
# Optional: save your progress
# save_preheated_mnist comes from the models.MNIST module; presumably save_file_int selects which save file gets written -- confirm in that module
from models.MNIST import save_preheated_mnist
save_preheated_mnist(model, save_file_int=1)

### Extensions that we don't have time to talk about, but are cool

#### Trying to use our classifier model to generate an image

I will assume that one of you will have the question of "now that we have a network that can understand the difference between two different numbers, can we use it backwards? Can we give the output of the network a number and it'll find its way backwards to do it?"

> Technically Yes, but it will take 1/100th of the time to simply draw out the hand written digit. By the time you read this and understand what I'm saying, you could've written a good 6 digits by hand.

The actual way to do this is to use some generative model such as a GAN, or stable diffusion (or more that I don't know about).

In [None]:
from models.MNIST_no_normalization import load_preheated_mnist
from utils.animate import animate_generative_mnist
import torchvision
import torch
import numpy as np
# the loss that scores how far the model's prediction is from the digit we ask for
loss_func = torch.nn.CrossEntropyLoss()
# replaces the model trained above with one loaded through models.MNIST_no_normalization
model = load_preheated_mnist()
# converters between pytorch tensors and PIL images, used to show tensors as pictures
PIL_to_tensor = torchvision.transforms.ToTensor()
tensor_to_PIL = torchvision.transforms.ToPILImage()

In [None]:
# two blank 1x28x28 starting images: all zeros (black) and all ones (white)
black_image = torch.zeros(1, 28, 28)
white_image = torch.ones(1, 28, 28)
# the digit we want the model to see, as an integer (long) tensor holding one label
desired_number = torch.tensor([8], dtype=torch.long)

for caption, image in (('black image: ', black_image), ('white image: ', white_image)):
    print(caption)
    display(tensor_to_PIL(image))

In [None]:
# tell pytorch that we actually want it to calculate a gradient on the input
black_image.requires_grad = True
white_image.requires_grad = True

# how much of the gradient is applied to an image in a single step
step_size = 0.05


output = model(black_image,batch_size=1)
print('Predictions for the Black Image')
print('output: ', output)
print('desired number: ', desired_number)
loss = loss_func(output, desired_number)

# fills in black_image.grad: how the loss changes when each pixel changes
loss.backward()

print('gradient of the black image: ')
display(tensor_to_PIL(black_image.grad))

# pytorch does not allow in-place edits of a tensor that is still tracking gradients,
# so tracking is switched off first (the gradient computed above is kept)
black_image.requires_grad = False
# the gradient points in the direction that INCREASES the loss, so step against it
# to lower the loss, i.e. to make the image look more like desired_number to the model
black_image -= step_size * black_image.grad

print('More "8" like black image: ')
display(tensor_to_PIL(black_image))


output = model(white_image,batch_size=1)
print('Predictions for the White Image')
print('output: ', output)
print('desired number: ', desired_number)
loss = loss_func(output, desired_number)
loss.backward()

print('gradient of the white image: ')
display(tensor_to_PIL(white_image.grad))

# same update as for the black image: stop tracking, then step against the gradient
white_image.requires_grad = False
white_image -= step_size * white_image.grad

print('More "8" like white image: ')
display(tensor_to_PIL(white_image))



In [None]:
# start again from fresh, unmodified images and the same target digit
black_image = torch.zeros(1, 28, 28)
white_image = torch.ones(1, 28, 28)
desired_number = torch.tensor([8], dtype=torch.long)

In [None]:
import os
from pathlib import Path
# the gif is written next to wherever the notebook is being run from
file_path = str(Path(os.path.abspath('')).joinpath("black_image.gif"))
animate_generative_mnist(
    model=model,
    input_tensor=black_image,
    desired_number=8,
    delta=0.01,
    epochs=100,
    output_filepath=file_path,
)