# Notes on Torch Gradient

In [41]:
import torch
import math

Check for GPU

In [42]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Simple gradient

Example of simple gradient computation

We want to compute the gradient of: $$f(x)=2x^2$$

Which is: $$f'(x)=4x$$

In [43]:
x = torch.tensor(10, dtype=torch.float16, requires_grad=True)
f = 2 * x**2

# Call backward() to compute grad
f.backward()

# Get the derivative of f(x)
print(x.grad)

tensor(40., dtype=torch.float16)


## Polynomial Gradient

Next, we will try to optimize some polynomial functions.

Which is: $$y = a + bx + cx^2 + dx^3$$

In [44]:
# X and y
dtype = torch.float32
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Weights
a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d = torch.randn((), device=device, dtype=dtype, requires_grad=True)

# Training
learning_rate = 1e-6
for t in range(1000):
    # Forward pass
    y_pred = a + b*x + c*x**2 + d*x**3

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(loss.item())  # .item() return python number

    # Backward pass, compute the grad with respect to all tensor with `requires_grad=True`
    loss.backward()

    # Manually update weights using gradient descent
    # .no_grad() method is used to not track weights
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Manually zero the gradients after updating weights
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

4885.228515625
3258.560546875
2175.557861328125
1454.203857421875
973.5167236328125
653.04931640625
439.2912292480469
296.6355285644531
201.37840270996094
137.73443603515625
Result: y = 0.1646118015050888 + 0.5421510934829712 x + -0.02839827537536621 x^2 + -0.048582736402750015 x^3


## Gradient Descent Manually on Neural Network
The previous equation will be optimized using neural network by upgrading the parameters manually.

In [71]:
# Create features and target
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(1).pow(p)


# Create dense model
model = torch.nn.Sequential(
    torch.nn.Linear(xx.shape[-1], 1),
    torch.nn.Flatten(0, 1)
)

loss_fn = torch.nn.MSELoss(reduction="sum")

for t in range(2000):
    # Predict y and compute the loss
    y_pred = model(xx)
    loss = loss_fn(y_pred, y)

    if t % 100 == 99:
        print(t, loss.item())

    # Zero the gradient to prevent the accumulation of previous gradient
    model.zero_grad()

    # Backward pass
    loss.backward()

    # Upgrade the weight without tracking gradient
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


# Access the first layer of model
linear_layer = model[0]

# For linear layer, its parameters are stored as `weight` and `bias`.
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 205.10012817382812
199 138.760498046875
299 94.8492202758789
399 65.7817153930664
499 46.53871154785156
599 33.79856491088867
699 25.36300277709961
799 19.777074813842773
899 16.077892303466797
999 13.627814292907715
1099 12.004976272583008
1199 10.929862976074219
1299 10.217582702636719
1399 9.745579719543457
1499 9.432748794555664
1599 9.225414276123047
1699 9.087968826293945
1799 8.996834754943848
1899 8.936397552490234
1999 8.896309852600098
Result: y = 0.0022640277165919542 + 0.8483467698097229 x + -0.0003905838821083307 x^2 + -0.09213640540838242 x^3


## Gradient Descent with Optimizers

In [72]:
# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Prepare the input tensor (x, x^2, x^3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use RMSprop; the optim package contains many other
# optimization algorithms. The first argument to the RMSprop constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(2000):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(xx)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()


linear_layer = model[0]
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 979.8015747070312
199 522.3261108398438
299 374.3501281738281
399 271.8505554199219
499 186.4842071533203
599 120.37196350097656
699 72.72236633300781
799 40.75857925415039
899 21.636444091796875
999 12.382662773132324
1099 9.355053901672363
1199 8.851106643676758
1299 8.947275161743164
1399 8.892522811889648
1499 8.864999771118164
1599 8.885665893554688
1699 8.92397689819336
1799 8.914529800415039
1899 8.900985717773438
1999 8.904716491699219
Result: y = -8.101057602516448e-09 + 0.8572351336479187 x + -5.88732085304855e-09 x^2 + -0.09283571690320969 x^3


## Gradient Descent with Custom Model
We can also define our custom model by inheriting our class with `torch.nn.Module`

In [105]:
class Polynomial3(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate four parameters and assign them as
        member parameters.
        """
        super().__init__()
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3

    def string(self):
        """
        Just like any class in Python, you can also define custom method on PyTorch modules
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(1).pow(p)

# Construct our model by instantiating the class defined above
model = Polynomial3()

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters (defined 
# with torch.nn.Parameter) which are members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
for t in range(2000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

99 651.7491455078125
199 455.06451416015625
299 318.8379211425781
399 224.39068603515625
499 158.84671020507812
599 113.31845092773438
699 81.6651382446289
799 59.639190673828125
899 44.29945755004883
999 33.60748291015625
1099 26.1492977142334
1199 20.942913055419922
1299 17.305822372436523
1399 14.763242721557617
1499 12.984619140625
1599 11.739627838134766
1699 10.867630004882812
1799 10.256516456604004
1899 9.828022003173828
1999 9.5274076461792
Result: y = -0.026728274300694466 + 0.8484574556350708 x + 0.004611076787114143 x^2 + -0.09215214848518372 x^3


Now we will try a custom made dense layers

In [174]:
class CustomDense(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        """
        In the constructor we instantiate two parameters and assign them as
        member parameters.
        """
        super().__init__()
        self.w = torch.nn.parameter.Parameter(torch.empty((output_dim, input_dim)))
        self.b = torch.nn.parameter.Parameter(torch.empty((output_dim)))

    def reset_params(self):
        # Initiate weight
        torch.nn.init.kaiming_uniform(self.w, a=math.sqrt(5))

        # Initiate bias
        torch.nn.init.constant(self.b, 0)


    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        return torch.functional.F.linear(x, self.w, self.b)

    def string(self):
        """
        Just like any class in Python, you can also define custom method on PyTorch modules
        """
        return f'{self.__class__.__name__} with weights:\n {self.w}\n bias:\n {self.b}'

torch.autograd.set_detect_anomaly(True)

# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Construct our model by instantiating the class defined above
model = torch.nn.Sequential(
    CustomDense(input_dim=xx.shape[-1], output_dim=1),
    torch.nn.Flatten(0, 1)
)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters (defined 
# with torch.nn.Parameter) which are members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)

for t in range(2000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(xx)

    # Compute and print loss
    loss = criterion(y_pred, y)

    # print(y_pred.shape)
    # print(loss)

    if t % 100 == 99:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 517.6945190429688
199 345.31256103515625
299 231.32492065429688
399 155.95059204101562
499 106.10929870605469
599 73.15153503417969
699 51.35828399658203
799 36.947540283203125
899 27.418458938598633
999 21.11724090576172
1099 16.950559616088867
1199 14.19536018371582
1299 12.373488426208496
1399 11.168793678283691
1499 10.37220573425293
1599 9.84544849395752
1699 9.497127532958984
1799 9.266793251037598
1899 9.114480018615723
1999 9.013773918151855


In [148]:
for i in model.parameters():
    print(i)

In [172]:
model[0].w

Parameter containing:
tensor([[ 3.2250e+01, -1.1104e+01, -4.3324e-23]], requires_grad=True)

In [160]:
1 / math.sqrt(1000)

0.03162277660168379