In [1]:
import torch

  device: torch.device = torch.device(torch._C._get_default_device()),  # torch.device('cpu'),


In [2]:
# Display the version string of the imported PyTorch build.
torch.__version__

'2.1.2'

In [19]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


# Tensor Basics


In [7]:
# torch.empty allocates a tensor without initialising its memory, so the
# printed values are arbitrary; only the shape and dtype are meaningful here.
shapes = [
    (1,),       # 1-D vector holding 1 element
    (2,),       # 1-D vector holding 2 elements
    (2, 2),     # 2-D matrix
    (2, 2, 3),  # 3-D tensor
]
for shape in shapes:
    x = torch.empty(shape, dtype=torch.float16)
    print(x)
    print(x.dim())  # number of dimensions

tensor([0.], dtype=torch.float16)
1
tensor([0., 0.], dtype=torch.float16)
1
tensor([[0., 0.],
        [0., 0.]], dtype=torch.float16)
2
tensor([[[0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.]]], dtype=torch.float16)
3


In [8]:
# Random 2x2 half-precision matrix, followed by its number of dimensions.
x = torch.rand(2, 2, dtype=torch.float16)
print(x, x.dim(), sep="\n")

tensor([[0.9556, 0.9219],
        [0.3491, 0.9214]], dtype=torch.float16)
2


In [10]:
# Constant-filled 2x2 half-precision matrices: first all zeros, then all ones.
for make_filled in (torch.zeros, torch.ones):
    x = make_filled(2, 2, dtype=torch.float16)
    print(x)

tensor([[0., 0.],
        [0., 0.]], dtype=torch.float16)
tensor([[1., 1.],
        [1., 1.]], dtype=torch.float16)


Create Tensors from List


In [20]:
# Build a 1-D half-precision tensor directly from a Python list and place it
# on `device`; gradient tracking stays disabled.
x = torch.tensor(
    [1, 2, 3, 4, 5],
    dtype=torch.float16,
    device=device,
    requires_grad=False,
)
print(x)
print(x.dim())

tensor([1., 2., 3., 4., 5.], dtype=torch.float16)
1


Reshape Tensors


In [30]:
# Start from a random 4x5 half-precision matrix.
x = torch.rand(4, 5, dtype=torch.float16)
print(x)
print(x.dim())

# Flatten all 20 elements into a single dimension.
y = x.view(20)
print(y)
print(y.dim())

# A -1 entry lets view() infer that dimension from the total element count:
# 20 elements / 4 columns -> 5 rows.
y = x.view(-1, 4)
print(y)
print(y.dim())

tensor([[0.1040, 0.8472, 0.7646, 0.6636, 0.9048],
        [0.4736, 0.1538, 0.6250, 0.5249, 0.4497],
        [0.9390, 0.1201, 0.9341, 0.3145, 0.8247],
        [0.2983, 0.7217, 0.2139, 0.4858, 0.6548]], dtype=torch.float16)
2
tensor([0.1040, 0.8472, 0.7646, 0.6636, 0.9048, 0.4736, 0.1538, 0.6250, 0.5249,
        0.4497, 0.9390, 0.1201, 0.9341, 0.3145, 0.8247, 0.2983, 0.7217, 0.2139,
        0.4858, 0.6548], dtype=torch.float16)
1
tensor([[0.1040, 0.8472, 0.7646, 0.6636],
        [0.9048, 0.4736, 0.1538, 0.6250],
        [0.5249, 0.4497, 0.9390, 0.1201],
        [0.9341, 0.3145, 0.8247, 0.2983],
        [0.7217, 0.2139, 0.4858, 0.6548]], dtype=torch.float16)
2


# Autograd

> Automatic Gradient (Slope) Computation

- Gradients (Slopes) are essential for `Model Optimization`


In [56]:
# Leaf tensor with gradient tracking on: each operation below records a
# grad_fn node, visible in the printed tensor.
x = torch.rand(1, requires_grad=True)
print(x)

y = torch.add(x, 2)  # grad_fn: AddBackward0
print(y)

y = torch.mul(torch.mul(y, y), 2)  # grad_fn: MulBackward0
print(y)

y = torch.div(y, 2)  # grad_fn: DivBackward0
print(y)

y = torch.mean(y)  # grad_fn: MeanBackward0
print(y)

y = torch.abs(y)  # grad_fn: AbsBackward0
print(y)

tensor([0.3801], requires_grad=True)
tensor([2.3801], grad_fn=<AddBackward0>)
tensor([11.3300], grad_fn=<MulBackward0>)
tensor([5.6650], grad_fn=<DivBackward0>)
tensor(5.6650, grad_fn=<MeanBackward0>)
tensor(5.6650, grad_fn=<AbsBackward0>)


In [61]:
x = torch.rand(1, requires_grad=True)
print(x)

# z = mean(2 * (x + 2)^2)
y = torch.add(x, 2)
z = torch.mean(y * y * 2)
print(z)

# Backpropagate from z; the result dz/dx is stored in x.grad.
z.backward()
print(x.grad)

tensor([0.0041], requires_grad=True)
tensor(8.0326, grad_fn=<MeanBackward0>)
tensor([8.0163])


## Prevent Gradient Tracking

```python
# Method1
x.requires_grad_(False)

# Method2 (detach() returns a new tensor; detach_() detaches x in place)
x = x.detach()

# Method3
with torch.no_grad():
    pass
```


In [63]:
x = torch.rand(3, requires_grad=True)
print(x)

# Method 1: switch gradient tracking off in place on the tensor itself.
x.requires_grad_(False)
print(x)

tensor([0.8542, 0.3626, 0.5681], requires_grad=True)
tensor([0.8542, 0.3626, 0.5681])


In [65]:
x = torch.rand(3, requires_grad=True)
print(x)

# Method 2: detach the tensor from the autograd graph in place.
x.detach_()
print(x)

tensor([0.0510, 0.2824, 0.5197], requires_grad=True)
tensor([0.0510, 0.2824, 0.5197])


In [70]:
x = torch.rand(3, requires_grad=True)
print(x)

# Method 3: operations inside the no_grad() context are not recorded,
# so the result carries no grad_fn even though x still requires grad.
with torch.no_grad():
    y = torch.add(x, 2)
    print(y)

tensor([0.6229, 0.9166, 0.5614], requires_grad=True)
tensor([2.6229, 2.9166, 2.5614])


Note: every call to `.backward()` computes the gradients and **adds** them to whatever is already stored in the `.grad` attribute; gradients accumulate across calls instead of being overwritten.


In [81]:
weights = torch.rand(3, requires_grad=True)

print("Example of Gradient Accumulation, which results in outdated gradients >>")
# Each backward() call adds into weights.grad rather than replacing it, so the
# printed gradient grows by the same amount on every pass.
for epoch in range(5):
    model_output = torch.mean(weights * 5)
    model_output.backward()
    print(weights.grad)

Example of Gradient Accumulation, which results in outdated gradients >>
tensor([1.6667, 1.6667, 1.6667])
tensor([3.3333, 3.3333, 3.3333])
tensor([5., 5., 5.])
tensor([6.6667, 6.6667, 6.6667])
tensor([8.3333, 8.3333, 8.3333])


In [83]:
weights = torch.rand(3, requires_grad=True)

print("Example of Avoiding Gradient Accumulation to get latest gradients >>")
for epoch in range(5):
    model_output = torch.mean(weights * 5)
    model_output.backward()
    print(weights.grad)

    # Reset the stored gradient in place so the next backward() starts from zero.
    weights.grad.zero_()

Example of Avoiding Gradient Accumulation to get latest gradients >>
tensor([1.6667, 1.6667, 1.6667])
tensor([1.6667, 1.6667, 1.6667])
tensor([1.6667, 1.6667, 1.6667])
tensor([1.6667, 1.6667, 1.6667])
tensor([1.6667, 1.6667, 1.6667])


# Backpropagation

**Backpropagation** is used to calculate the `gradients` of the `loss function` with respect to the `weights (parameters)` of the neural network. Using the **chain rule**, backpropagation computes the gradients **layer by layer**, starting from the **output layer** and `moving backwards` to the **input layer**.

A **computational graph** is a graphical representation of a mathematical function or algorithm, where the `nodes` represent `operations` and the `edges` represent the `data flow` between the operations.

> Why are we interested only in the `gradient of the loss function wrt the weights(parameters)`?

> Because it tells us how to adjust the `weights(parameters)` to reduce the `loss`. By following the `negative gradient`, we can find the `optimal` values of the `weights(parameters)` that `minimize` the loss. This is the `goal` of training a machine learning model.

The whole concept of training a neural network:

1. **Forward Pass:** Compute Loss
2. Compute Local Gradients
3. **Backward Pass:** Compute Gradients of the Loss wrt `weights(parameters)` (dLoss/dWeights) using the `chain rule`.


In [85]:
# One training sample: input x and target y.
x = torch.tensor(1.0)
y = torch.tensor(2.0)

# The single trainable parameter.
weights = torch.tensor(1.0, requires_grad=True)

# Forward pass: prediction followed by the squared-error loss.
y_hat = torch.mul(x, weights)
loss = torch.pow(y_hat - y, 2)
print(f"Loss: {loss}")

# Backward pass: d(loss)/d(weights) = 2 * x * (x * weights - y)
loss.backward()
print(weights.grad)

### Update Weights
### Next Forward Pass and Back Propagation

Loss: 1.0
tensor(-2.)


# Gradient Descent using Autograd
