# PyTorch 1.2

- Define custom autograd function
- Initialize variables on device
- Compute backward pass by invoking loss.backward()
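- Each tensor created with requires_grad=True gets its gradient computed, and the result is stored in its .grad property
- Update each weight by subtracting its scaled gradient: w1 -= learning_rate * w1.grad
- **[Important]** ZERO the gradients after updating the weights, because backward() accumulates into .grad instead of overwriting it: w1.grad.zero_()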

In [4]:
import torch

class MyReLU(torch.autograd.Function):
    """
    Custom autograd function implementing ReLU with a hand-written
    forward and backward pass.

    Call it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return ``input`` with every negative entry replaced by zero.

        ``ctx`` is the context object shared with ``backward``. The input is
        stored on it with ``ctx.save_for_backward`` so that the backward pass
        can tell which entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradient of the loss with respect to the forward input.

        ``grad_output`` is the gradient of the loss with respect to the
        forward output. It is zeroed wherever the saved input was negative
        and passed through unchanged everywhere else.
        """
        (saved_input,) = ctx.saved_tensors
        # Out-of-place fill: grad_output itself is left unmodified.
        return grad_output.masked_fill(saved_input < 0, 0)
    
    
# Run on the GPU when one is available, otherwise fall back to the CPU.
dtype = torch.float
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Problem dimensions:
#   N     - batch size
#   D_in  - input size
#   H     - hidden layer size
#   D_out - output size
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target tensors.
x = torch.rand(N, D_in, dtype=dtype, device=device)
y = torch.rand(N, D_out, dtype=dtype, device=device)

# Random weight tensors; requires_grad=True makes autograd track them.
w1 = torch.randn(D_in, H, dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, dtype=dtype, device=device, requires_grad=True)

learning_rate = 1e-6

# Custom autograd functions are invoked through their ``apply`` attribute.
relu = MyReLU.apply

for t in range(100000):
    # Forward pass: linear layer -> custom ReLU -> linear layer.
    y_pred = torch.mm(relu(torch.mm(x, w1)), w2)

    # Sum-of-squared-errors loss, reported on every 10000th iteration.
    loss = torch.sum((y_pred - y) ** 2)
    if (t + 1) % 10000 == 0:
        print(t, loss.item())

    # Backward pass: autograd fills w1.grad and w2.grad.
    loss.backward()

    # The weight update itself must not be recorded by autograd.
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Reset the gradients so the next backward() starts from zero.
        w1.grad.zero_()
        w2.grad.zero_()

9999 161.24847412109375
19999 154.67312622070312
29999 151.6727294921875
39999 149.11688232421875
49999 146.7808837890625
59999 144.655029296875
69999 142.64279174804688
79999 140.73629760742188
89999 138.93685913085938
99999 136.53106689453125
