In [3]:
from torch.profiler import profile, record_function, ProfilerActivity
from d2l import torch as d2l
import torch.nn as nn
import torch

1. Assume that the inputs $\mathbf{X}$ to some scalar function $f$ are $n \times m$ matrices. What is the dimensionality of the gradient of $f$ with respect to $\mathbf{X}$?

$n \times m$

2. Add a bias to the hidden layer of the model described in this section (you do not need to include bias in the regularization term).
    1. Draw the corresponding computational graph.
    2. Derive the forward and backward propagation equations.

(1)

![5_3_1](5_3_1.png)

(2)

The forward propagation:

$$\mathbf{z}= \mathbf{W}^{(1)} \mathbf{x} + \mathbf{b}^{(1)}$$
$$\mathbf{h}= \phi (\mathbf{z}).$$
$$\mathbf{o}= \mathbf{W}^{(2)} \mathbf{h} + \mathbf{b}^{(2)}$$
$$L = l(\mathbf{o}, y).$$
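$$s = \frac{\lambda}{2} \left(\|\mathbf{W}^{(1)}\|_\textrm{F}^2 + \|\mathbf{W}^{(2)}\|_\textrm{F}^2\right).$$
$$J = L + s.$$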

The backward propagation:

$$\frac{\partial J}{\partial L} = 1 \; \textrm{and} \; \frac{\partial J}{\partial s} = 1.$$

$$\frac{\partial J}{\partial \mathbf{o}}
= \textrm{prod}\left(\frac{\partial J}{\partial L}, \frac{\partial L}{\partial \mathbf{o}}\right)
= \frac{\partial L}{\partial \mathbf{o}}
\in \mathbb{R}^q.$$


$$\frac{\partial J}{\partial \mathbf{b}^{(2)}}= \textrm{prod}\left(\frac{\partial J}{\partial \mathbf{o}}, \frac{\partial \mathbf{o}}{\partial \mathbf{b}^{(2)}}\right) = \frac{\partial L}{\partial \mathbf{o}}
$$

$$\frac{\partial s}{\partial \mathbf{W}^{(1)}} = \lambda \mathbf{W}^{(1)}
\; \textrm{and} \;
\frac{\partial s}{\partial \mathbf{W}^{(2)}} = \lambda \mathbf{W}^{(2)}.$$



$$\frac{\partial J}{\partial \mathbf{W}^{(2)}}= \frac{\partial J}{\partial \mathbf{o}} \mathbf{h}^\top + \lambda \mathbf{W}^{(2)}.$$

$$
\frac{\partial J}{\partial \mathbf{h}}
= \textrm{prod}\left(\frac{\partial J}{\partial \mathbf{o}}, \frac{\partial \mathbf{o}}{\partial \mathbf{h}}\right)
= {\mathbf{W}^{(2)}}^\top \frac{\partial J}{\partial \mathbf{o}}.
$$

$$
\frac{\partial J}{\partial \mathbf{z}}
= \textrm{prod}\left(\frac{\partial J}{\partial \mathbf{h}}, \frac{\partial \mathbf{h}}{\partial \mathbf{z}}\right)
= \frac{\partial J}{\partial \mathbf{h}} \odot \phi'\left(\mathbf{z}\right).
$$

$$\frac{\partial J}{\partial \mathbf{b}^{(1)}}=\textrm{prod}\left(\frac{\partial J}{\partial \mathbf{z}}, \frac{\partial \mathbf{z}}{\partial \mathbf{b}^{(1)}}\right)
=\frac{\partial J}{\partial \mathbf{h}} \odot \phi'\left(\mathbf{z}\right)$$

$$
\frac{\partial J}{\partial \mathbf{W}^{(1)}}
= \frac{\partial J}{\partial \mathbf{z}} \mathbf{x}^\top + \lambda \mathbf{W}^{(1)}.
$$

3. Compute the memory footprint for training and prediction in the model described in this section.

In [8]:
class MLP(d2l.Classifier):
    """One-hidden-layer perceptron classifier whose plotting can be switched off.

    Args:
        num_outputs: Number of output units of the final linear layer.
        num_hiddens: Number of units of the hidden linear layer.
        lr: Learning rate; stored as a hyperparameter, not used directly here.
        plot_flag: If False, the training and validation steps skip every
            ``self.plot`` call (the profiling run in this notebook passes False).
    """

    def __init__(self, num_outputs, num_hiddens, lr, plot_flag=True):
        super().__init__()
        # Makes the constructor arguments available as attributes;
        # ``self.plot_flag`` is read in the step methods below.
        self.save_hyperparameters()
        # Lazy layers: the input feature sizes are not specified here.
        self.net = nn.Sequential(nn.Flatten(), nn.LazyLinear(num_hiddens),
                                 nn.ReLU(), nn.LazyLinear(num_outputs))

    def training_step(self, batch):
        """Return the loss on ``batch``; the last element of ``batch`` is the target."""
        l = self.loss(self(*batch[:-1]), batch[-1])
        if self.plot_flag:
            self.plot('loss', l, train=True)
        return l

    def validation_step(self, batch):
        """Return the loss on ``batch``, optionally plotting loss and accuracy."""
        Y_hat = self(*batch[:-1])
        l = self.loss(Y_hat, batch[-1])
        if self.plot_flag:
            # Reuse the loss computed above instead of evaluating it a second time.
            self.plot('loss', l, train=False)
            self.plot('acc', self.accuracy(Y_hat, batch[-1]), train=False)
        return l

# Model with plotting disabled, the dataset, and a trainer limited to one epoch.
model = MLP(num_outputs=10, num_hiddens=256, lr=0.1, plot_flag=False)
data = d2l.FashionMNIST(batch_size=256)
trainer = d2l.Trainer(max_epochs=1)

# Profile CPU activity (with memory and input shapes) for the whole fit call,
# labelled "model_train" in the profiler trace.
with profile(
    activities=[ProfilerActivity.CPU],
    profile_memory=True,
    record_shapes=True,
) as prof, record_function("model_train"):
    trainer.fit(model, data)

# Top ten operators ranked by the memory they allocated themselves.
train_report = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10)
print(train_report)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               aten::mm         3.37%     542.647ms         3.37%     542.681ms     769.760us     240.81 Mb     240.81 Mb           705  
enumerate(DataLoader)#_MultiProcessingDataLoaderIter...        70.90%       11.429s        70.96%       11.439s      41.295ms     209.92 Mb     209.46 Mb           277  
                                            aten::addmm         7.26%        1.170s        13.90%        2.240s       4.073ms      71.03 Mb      71.03

In [9]:
# Profile a single forward pass to estimate the prediction-time memory footprint.
# presumably data.train.data holds the full set of training images — confirm
# against the d2l FashionMNIST class.
with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
    # Prediction needs no gradients: torch.no_grad() keeps autograd from recording
    # the graph, so the measurement reflects inference rather than training.
    with record_function("model_infer"), torch.no_grad():
        # The float32 conversion stays inside the profiled region, so the copy it
        # allocates is included in the report.
        model(data.train.data.type(torch.float32))

# Top ten operators ranked by the memory they allocated themselves.
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
     aten::empty_strided         0.06%     205.000us         0.06%     205.000us     205.000us     179.44 Mb     179.44 Mb             1  
             aten::addmm        58.41%     215.092ms        61.38%     226.026ms     113.013ms      60.88 Mb      60.88 Mb             2  
         aten::clamp_min         3.84%      14.144ms         3.84%      14.144ms      14.144ms      58.59 Mb      58.59 Mb             1  
                aten::to         3.54%      13.041ms        26.03%      95.854ms      95.854ms     179.44 Mb           0 b             1  
          aten::_to_copy   

4. Assume that you want to compute second derivatives. What happens to the computational graph? How long do you expect the calculation to take?

The computational graph becomes deeper and more complex as it needs to capture not only the relationships between the parameters and the loss but also the relationships between the gradients and their gradients. 

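Suppose the network has $N$ parameters and the loss is a scalar. The first derivative (the gradient) has $N$ elements, but the second derivative (the Hessian) has $N^2$ elements. Computing the full second derivative is therefore expected to take roughly $N$ times as long, and as much more memory, as computing the gradient.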

5. Assume that the computational graph is too large for your GPU.
    1. Can you partition it over more than one GPU?
    2. What are the advantages and disadvantages over training on a smaller minibatch?

(1) We can split the model or the minibatches on multiple GPUs.

(2) Advantages:

It allows us to work with larger models or datasets that wouldn’t fit in a single GPU’s memory. It can also lead to faster training times due to parallel computation.

Disadvantages:

There’s a communication overhead when exchanging information between GPUs, which can slow down training. Synchronizing multiple GPUs can be complex, especially when dealing with asynchronous updates. By comparison, training on a smaller minibatch on a single GPU avoids this overhead, but smaller minibatches can lead to noisier gradient estimates, slowing down convergence.