## PyTorch Basics
This notebook has been created by referring to https://docs.pytorch.org/tutorials/beginner/pytorch_with_examples.html. I have added my own comments and markdown so that I can get a better understanding of the code in this notebook.

In [6]:
import numpy as np
import math

In [2]:
# Build the training set: 2000 evenly spaced inputs covering [-pi, pi] and the
# value of sin at each one. Nothing here is random; the data is deterministic.
x = np.linspace(start=-math.pi, stop=math.pi, num=2000)
y = np.sin(x)

In [None]:
# Randomly initialize the four weights of the degree-3 polynomial used to estimate y.
# NumPy's global RNG is seeded first so that a fresh "Restart Kernel -> Run All"
# always starts from the same weights and reproduces the same fit.
np.random.seed(0)
a = np.random.randn() # constant
b = np.random.randn() # coef for x
c = np.random.randn() # coef for x^2
d = np.random.randn() # coef for x^3

In [None]:
# Fit the cubic with full-batch gradient descent, deriving every gradient by hand.
learning_rate = 1e-6
for t in range(2000):
    # Forward pass: evaluate a + b x + c x^2 + d x^3 at every sample
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # The residuals feed both the loss and the gradients below
    residual = y_pred - y

    # Loss: sum of squared residuals
    loss = np.square(residual).sum()

    # Report the loss once every 100 steps (t = 99, 199, ...)
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass (chain rule): d(loss)/dw = sum(d(loss)/d(y_pred) * d(y_pred)/dw)
    grad_y_pred = 2.0 * residual           # d(loss)/d(y_pred)
    grad_a = grad_y_pred.sum()             # d(y_pred)/da = 1
    grad_b = (grad_y_pred * x).sum()       # d(y_pred)/db = x
    grad_c = (grad_y_pred * x ** 2).sum()  # d(y_pred)/dc = x^2
    grad_d = (grad_y_pred * x ** 3).sum()  # d(y_pred)/dd = x^3

    # Gradient-descent step: move each weight against its gradient
    a, b, c, d = (
        a - learning_rate * grad_a,
        b - learning_rate * grad_b,
        c - learning_rate * grad_c,
        d - learning_rate * grad_d,
    )

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 1297.258613490858
199 892.0357221869433
299 614.952387957907
399 425.2681610114328
499 295.2659870983222
599 206.0656077802425
699 144.7915749853648
799 102.65339284144608
899 73.64260469012405
999 53.64760144364074
1099 39.8515593668465
1199 30.32248791970258
1299 23.73377131975657
1399 19.173448226399273
1499 16.013902658460452
1599 13.82272831062528
1699 12.301686696850975
1799 11.244856521943131
1899 10.509907883359766
1999 9.998362835177137
Result: y = -0.032513252360881977 + 0.8717275227222708 x + 0.00560907705510671 x^2 + -0.09546211371544146 x^3


### Tensors
Tensors are n-dimensional arrays that can also run on a GPU.

In [7]:
import torch
import math

In [8]:
# Tensor configuration for this section: 32-bit floats on the CPU.
dtype = torch.float32  # torch.float is an alias of this same dtype
device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda:0")

In [5]:
# Training data as tensors: 2000 evenly spaced inputs on [-pi, pi] and sin of each.
# The data is deterministic; only the weights are random.
x = torch.linspace(start=-math.pi, end=math.pi, steps=2000, dtype=dtype, device=device)
y = torch.sin(x)

In [None]:
# Randomly initialize weights as 0-dimensional tensors (instead of NumPy scalars).
# PyTorch's global RNG is seeded first so that the starting point, and therefore
# the training run, is reproducible after "Restart Kernel -> Run All".
torch.manual_seed(0)
a = torch.randn((), device=device, dtype=dtype) # constant
b = torch.randn((), device=device, dtype=dtype) # coef for x
c = torch.randn((), device=device, dtype=dtype) # coef for x^2
d = torch.randn((), device=device, dtype=dtype) # coef for x^3

In [9]:
# Hand-derived gradient descent on the cubic fit, expressed with tensor operations.
learning_rate = 1e-6
for t in range(2000):
    # Forward pass: evaluate the cubic at every sample
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Residuals are shared by the loss and the gradients
    residual = y_pred - y

    # Sum-of-squares loss, pulled out of the tensor as a Python float for printing
    loss = residual.pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass written out by hand (chain rule through y_pred)
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Gradient-descent step; `-=` updates each weight tensor in place
    for weight, grad in ((a, grad_a), (b, grad_b), (c, grad_c), (d, grad_d)):
        weight -= learning_rate * grad


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 9.445923805236816
199 9.234925270080566
299 9.094813346862793
399 9.001764297485352
499 8.939935684204102
599 8.898845672607422
699 8.871530532836914
799 8.853363037109375
899 8.841277122497559
999 8.833234786987305
1099 8.827880859375
1199 8.82431411743164
1299 8.821938514709473
1399 8.820354461669922
1499 8.819296836853027
1599 8.81859016418457
1699 8.818120956420898
1799 8.817805290222168
1899 8.817594528198242
1999 8.81745433807373
Result: y = 0.00025946463574655354 + 0.8562779426574707 x + -4.476173489820212e-05 x^2 + -0.09326454252004623 x^3


### Autograd
Autograd (automatic differentiation) automates the computation of the backward pass in neural networks.


In [11]:
dtype = torch.float
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")
torch.set_default_device(device)

Using cpu device


In [14]:
# Inputs and targets for the autograd example: 2000 evenly spaced points on
# [-1, 1] and exp of each. requires_grad keeps its default (False) because no
# gradients with respect to the data are needed during the backward pass.
x = torch.linspace(start=-1, end=1, steps=2000, dtype=dtype)
y = torch.exp(x) # Taylor expansion of exp: 1 + x + (1/2) x**2 + (1/3!) x**3 + ...

In [15]:
# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
# PyTorch's global RNG is seeded first so the initial weights, and therefore the
# reported training trajectory, are reproducible after "Restart Kernel -> Run All".
torch.manual_seed(0)
a = torch.randn((), dtype=dtype, requires_grad=True)
b = torch.randn((), dtype=dtype, requires_grad=True)
c = torch.randn((), dtype=dtype, requires_grad=True)
d = torch.randn((), dtype=dtype, requires_grad=True)
# Since these are the variable weights in the model, we compute gradients wrt these values

In [16]:
initial_loss = 1.  # placeholder; replaced by the real loss on the first iteration
learning_rate = 1e-5
for t in range(5000):
    # Forward pass: compute predicted y using operations on Tensors.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute the loss using operations on Tensors.
    # loss is a 0-dimensional (scalar) Tensor, i.e. its shape is ();
    # loss.item() gets the Python number held in it.
    loss = (y_pred - y).pow(2).sum()

    # Record the initial loss, so we can report the loss relative to it
    if t == 0:
        initial_loss = loss.item()

    # The ':10.6f' format spec already rounds to 6 decimals, so no explicit round() is needed
    if t % 100 == 99:
        print(f'Iteration t = {t:4d}  loss(t)/loss(0) = {loss.item() / initial_loss:10.6f}  a = {a.item():10.6f}  b = {b.item():10.6f}  c = {c.item():10.6f}  d = {d.item():10.6f}')

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call a.grad, b.grad, c.grad and d.grad will be Tensors holding
    # the gradient of the loss with respect to a, b, c, d respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # update in autograd.
    with torch.no_grad():
        for weight in (a, b, c, d):
            # In-place gradient-descent step on the leaf tensor
            weight -= learning_rate * weight.grad

            # Reset the gradient so the next backward() does not accumulate onto it
            weight.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

Iteration t =   99  loss(t)/loss(0) =   0.019752  a =   0.811036  b =   0.598489  c =   1.012849  d =   0.383710
Iteration t =  199  loss(t)/loss(0) =   0.005769  a =   0.869583  b =   0.773655  c =   0.885962  d =   0.464833
Iteration t =  299  loss(t)/loss(0) =   0.003252  a =   0.904209  b =   0.811387  c =   0.790785  d =   0.460992
Iteration t =  399  loss(t)/loss(0) =   0.002000  a =   0.929314  b =   0.827021  c =   0.721527  d =   0.444965
Iteration t =  499  loss(t)/loss(0) =   0.001305  a =   0.947573  b =   0.838583  c =   0.671152  d =   0.428030
Iteration t =  599  loss(t)/loss(0) =   0.000907  a =   0.960854  b =   0.848924  c =   0.634511  d =   0.411883
Iteration t =  699  loss(t)/loss(0) =   0.000670  a =   0.970515  b =   0.858524  c =   0.607860  d =   0.396727
Iteration t =  799  loss(t)/loss(0) =   0.000521  a =   0.977541  b =   0.867496  c =   0.588475  d =   0.382539
Iteration t =  899  loss(t)/loss(0) =   0.000422  a =   0.982652  b =   0.875888  c =   0.574375

### Defining Custom Autograd Functions

We define custom functions by extending torch.autograd.Function

In [3]:
class LegendrePolynomial3(torch.autograd.Function):
    """
    Custom autograd Function computing P3(x) = 0.5 * (5 x^3 - 3 x).

    Subclassing torch.autograd.Function and providing static ``forward`` and
    ``backward`` methods that operate on Tensors is how a user-defined
    differentiable operation is implemented.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute P3 element-wise on ``input`` and return the result.

        ``ctx`` is a context object used to stash information for the backward
        computation: tensors are cached with ``ctx.save_for_backward``, and other
        objects can be stored directly as attributes (``ctx.my_object = my_object``).
        See `Extending torch.autograd <https://docs.pytorch.org/docs/stable/notes/extending.html#extending-torch-autograd>`_
        for further details.
        """
        # The backward pass needs the input to evaluate the derivative
        ctx.save_for_backward(input)
        cubic_term = 5 * input ** 3
        linear_term = 3 * input
        return 0.5 * (cubic_term - linear_term)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Given ``grad_output`` (gradient of the loss with respect to the output),
        return the gradient of the loss with respect to the input.

        Uses dP3/dx = 1.5 * (5 x^2 - 1) together with the chain rule.
        """
        (saved_input,) = ctx.saved_tensors
        derivative_core = 5 * saved_input ** 2 - 1
        return grad_output * 1.5 * derivative_core

In [7]:
# Tensor configuration for the custom-Function example: 32-bit floats on the CPU.
dtype = torch.float32  # torch.float is an alias of this same dtype
device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda:0")


In [8]:
# Inputs and targets: 2000 evenly spaced points on [-pi, pi] and sin of each.
# requires_grad keeps its default (False) because no gradients with respect to
# the data are needed during the backward pass.
x = torch.linspace(start=-math.pi, end=math.pi, steps=2000, dtype=dtype, device=device)
y = torch.sin(x)

In [9]:
# Weights of the model y = a + b * P3(c + d * x). They are set to fixed starting
# values (a=0.0, b=-1.0, c=0.0, d=0.3) rather than drawn at random: they need to
# start not too far from the correct result to ensure convergence.
# requires_grad=True asks autograd to compute gradients with respect to each
# of them during the backward pass.
a, b, c, d = (
    torch.full((), start_value, device=device, dtype=dtype, requires_grad=True)
    for start_value in (0.0, -1.0, 0.0, 0.3)
)

In [10]:
learning_rate = 5e-6

# A custom autograd Function is invoked through its `apply` method; alias it as 'P3'.
P3 = LegendrePolynomial3.apply

for t in range(2000):
    # Forward pass: the prediction goes through our custom autograd operation
    y_pred = a + b * P3(c + d * x)

    # Sum-of-squares loss, reported every 100 steps
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Autograd computes the backward pass, calling LegendrePolynomial3.backward for P3
    loss.backward()

    # Gradient-descent step, done outside of autograd tracking
    with torch.no_grad():
        for weight in (a, b, c, d):
            weight -= learning_rate * weight.grad

            # Clear the gradient so the next backward() starts from scratch
            weight.grad = None

print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')

99 209.95834350585938
199 144.66018676757812
299 100.70249938964844
399 71.03519439697266
499 50.97850799560547
599 37.403133392333984
699 28.206867218017578
799 21.97318458557129
899 17.745729446411133
999 14.877889633178711
1099 12.93176555633545
1199 11.610918045043945
1299 10.714258193969727
1399 10.10548210144043
1499 9.692106246948242
1599 9.411375999450684
1699 9.220745086669922
1799 9.091286659240723
1899 9.003362655639648
1999 8.943641662597656
Result: y = -2.9753338681715036e-10 + -2.208526849746704 * P3(-1.1693186696692948e-10 + 0.2554861009120941 x)


### NN module

The `nn` package is a higher-level abstraction for building complex neural networks out of multiple layers.

In [11]:
# Inputs and targets: 2000 evenly spaced points on [-pi, pi] and sin of each.
x = torch.linspace(start=-math.pi, end=math.pi, steps=2000)
y = torch.sin(x)

In [None]:
# The target y is modelled as a linear function of the features (x, x^2, x^3),
# so the model can be a single linear layer. Build that feature tensor here.
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1) ** p

# Shapes: x.unsqueeze(-1) is (2000, 1) and p is (3,), so broadcasting raises
# every input to each of the three powers and yields a (2000, 3) tensor.

In [15]:
# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. The Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# The Flatten layer flattens the output of the linear layer to a 1D tensor,
# to match the shape of `y`.
# The Linear layer's weight and bias are initialized randomly, so PyTorch's
# global RNG is seeded first to make the run reproducible after
# "Restart Kernel -> Run All".
torch.manual_seed(0)
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)

In [13]:
# The nn package ships ready-made loss functions. MSELoss computes squared
# error; with reduction="sum" the squared errors are summed rather than averaged.
loss_fn = torch.nn.MSELoss(reduction="sum")

In [16]:
learning_rate = 1e-6
for t in range(2000):

    # Forward pass. Module objects override __call__, so the model is invoked
    # like a function: it takes a Tensor of inputs and returns a Tensor of outputs.
    y_pred = model(xx)

    # The loss function takes the predicted and true values of y and returns
    # a Tensor containing the loss; report it every 100 steps.
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear the gradients left over from the previous step before the backward pass.
    model.zero_grad()

    # Backward pass: each Module stores its learnable parameters as Tensors with
    # requires_grad=True, so this computes the gradient of the loss with respect
    # to every learnable parameter in the model.
    loss.backward()

    # Plain gradient-descent step. Each parameter is a Tensor, so its gradient
    # is read from `.grad`; the update itself must not be tracked by autograd.
    with torch.no_grad():
        for weight in model.parameters():
            weight -= learning_rate * weight.grad

# Layers of a Sequential model can be indexed like items of a list
linear_layer = model[0]

# A Linear layer stores its parameters as `weight` and `bias`.
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 580.0430297851562
199 391.80950927734375
299 265.7999267578125
399 181.38572692871094
499 124.7955322265625
599 86.82928466796875
699 61.337852478027344
799 44.20817184448242
899 32.68779754638672
999 24.933012008666992
1099 19.708284378051758
1199 16.18488121032715
1299 13.806535720825195
1399 12.199480056762695
1499 11.112544059753418
1599 10.376591682434082
1699 9.877799034118652
1799 9.539355278015137
1899 9.309465408325195
1999 9.153127670288086
Result: y = 0.013353156857192516 + 0.8438148498535156 x + -0.0023036417551338673 x^2 + -0.09149177372455597 x^3


### Optim

The `optim` package abstracts the weight-update step and provides more sophisticated optimizers such as RMSprop, AdaGrad, and Adam.

In [22]:
# Inputs and targets: 2000 evenly spaced points on [-pi, pi] and sin of each.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Prepare the input tensor (x, x^2, x^3); broadcasting gives shape (2000, 3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Use the nn package to define our model and loss function.
# The Linear layer's weight and bias are initialized randomly, so PyTorch's
# global RNG is seeded first to make the run reproducible after
# "Restart Kernel -> Run All".
torch.manual_seed(0)
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
# reduction='sum' sums the squared errors instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

In [23]:
# The optim package provides Optimizers that update the model's weights for us.
# RMSprop is used here; the package contains many other optimization algorithms.
# The `params` argument tells the optimizer which Tensors it should update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=learning_rate)

In [24]:
for t in range(2000):
    # Forward pass on the (x, x^2, x^3) features
    y_pred = model(xx)

    # Loss for this step, reported every 100 steps
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Gradients accumulate in buffers (they are not overwritten) each time
    # .backward() is called, so the optimizer first zeroes the gradients of
    # every variable it updates, i.e. the learnable weights of the model.
    # See the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the model parameters
    loss.backward()

    # One optimizer step: updates the parameters using the gradients just computed
    optimizer.step()


linear_layer = model[0]
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 3723.4150390625
199 1391.760986328125
299 531.3375244140625
399 193.41851806640625
499 94.78976440429688
599 61.481040954589844
699 37.614830017089844
799 20.81222152709961
899 12.081933975219727
999 9.252971649169922
1099 8.834147453308105
1199 8.817290306091309
1299 9.160338401794434
1399 9.019918441772461
1499 9.044709205627441
1599 8.924325942993164
1699 8.879155158996582
1799 8.91452693939209
1899 8.922211647033691
1999 8.924721717834473
Result: y = 0.0005364295211620629 + 0.8562321066856384 x + 0.0005369774298742414 x^2 + -0.09383904933929443 x^3


### Custom nn Module

Define your own module when a model needs more than a simple sequence of existing modules.

In [25]:
class Polynomial3(torch.nn.Module):
    """Cubic polynomial y = a + b x + c x^2 + d x^3 with four learnable coefficients."""

    def __init__(self):
        """
        Register the four coefficients a, b, c, d as scalar Parameters, each
        drawn from a standard normal distribution.
        """
        super().__init__()
        # Assigning a Parameter as an attribute registers it with the Module
        for coefficient_name in ("a", "b", "c", "d"):
            setattr(self, coefficient_name, torch.nn.Parameter(torch.randn(())))

    def forward(self, x):
        """
        Evaluate the polynomial on a Tensor of inputs and return a Tensor of
        outputs. Arbitrary Tensor operators may be used here alongside anything
        defined in the constructor.
        """
        constant_and_linear = self.a + self.b * x
        quadratic = self.c * x ** 2
        cubic = self.d * x ** 3
        return constant_and_linear + quadratic + cubic

    def string(self):
        """
        Return the fitted polynomial as readable text. Like any Python class,
        a PyTorch module can define custom methods such as this one.
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'

In [26]:
# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Construct our model by instantiating the class defined above.
# Its parameters are drawn with torch.randn, so PyTorch's global RNG is seeded
# first to make the run reproducible after "Restart Kernel -> Run All".
torch.manual_seed(0)
model = Polynomial3()

In [27]:
# Loss function and Optimizer. model.parameters() hands the SGD constructor the
# learnable parameters (those defined with torch.nn.Parameter) that are
# members of the model. reduction="sum" sums the squared errors.
criterion = torch.nn.MSELoss(reduction="sum")
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-6)

In [28]:
for t in range(2000):
    # Forward pass: the custom module is called directly on the raw inputs
    y_pred = model(x)

    # Loss for this step, reported every 100 steps
    loss = criterion(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Standard optimization step: clear old gradients, backpropagate, update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

99 4950.58154296875
199 3485.0771484375
299 2455.05712890625
399 1730.8172607421875
499 1221.38427734375
599 862.9194946289062
699 610.597412109375
799 432.93121337890625
899 307.7936706542969
999 219.62921142578125
1099 157.49652099609375
1199 113.69841766357422
1299 82.8170166015625
1399 61.038150787353516
1499 45.67548370361328
1599 34.836605072021484
1699 27.18790626525879
1799 21.78952980041504
1899 17.978740692138672
1999 15.288232803344727
Result: y = -0.08400112390518188 + 0.86953204870224 x + 0.014491591602563858 x^2 + -0.09514982253313065 x^3


### Control Flow + Weight Sharing

A custom module that uses Python control flow in its forward pass and shares a single weight between the fourth- and fifth-order terms.

In [9]:
import random
import math
import torch

class DynamicNet(torch.nn.Module):
    """
    Cubic polynomial plus optional 4th- and 5th-order terms that share a single
    coefficient; which extra terms are applied is chosen at random on every
    forward pass.
    """

    def __init__(self):
        """
        Register the five scalar Parameters a, b, c, d, e, each drawn from a
        standard normal distribution. `e` is the coefficient shared by the
        4th- and 5th-order terms.
        """
        super().__init__()
        # Assigning a Parameter as an attribute registers it with the Module
        for coefficient_name in ("a", "b", "c", "d", "e"):
            setattr(self, coefficient_name, torch.nn.Parameter(torch.randn(())))

    def forward(self, x):
        """
        Evaluate the cubic, then add e * x^k for every order k from 4 up to, but
        not including, a bound drawn uniformly from {4, 5, 6}. The call therefore
        adds no extra term, the 4th-order term only, or both the 4th- and
        5th-order terms.

        Each forward pass builds a dynamic computation graph, so ordinary Python
        control flow (loops, conditionals) can be used when defining it, and it
        is safe to reuse the same parameter several times within one graph.
        """
        result = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
        upper_bound = random.randint(4, 6)
        order = 4
        while order < upper_bound:
            result = result + self.e * x ** order
            order += 1
        return result

    def string(self):
        """
        Return the model as readable text; the trailing '?' marks the terms that
        may or may not be applied on a given forward pass.
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'

In [10]:
# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Seed both sources of randomness used by this model so the run is reproducible
# after "Restart Kernel -> Run All": PyTorch's RNG (initial parameter values)
# and Python's `random` module (the per-forward-pass choice of extra orders).
random.seed(0)
torch.manual_seed(0)

# Construct our model by instantiating the class defined above
model = DynamicNet()

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)

In [11]:
for t in range(30000):
    # Forward pass: the model picks its extra polynomial orders anew on every call
    y_pred = model(x)

    # Loss for this step, reported every 2000 steps
    loss = criterion(y_pred, y)
    if (t + 1) % 2000 == 0:
        print(t, loss.item())

    # Standard optimization step: clear old gradients, backpropagate, update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

1999 533.1703491210938
3999 260.2292175292969
5999 120.23128509521484
7999 57.88347625732422
9999 30.447509765625
11999 18.341909408569336
13999 12.99733829498291
15999 10.60342788696289
17999 9.627148628234863
19999 9.181398391723633
21999 8.765251159667969
23999 8.906578063964844
25999 8.584129333496094
27999 8.593748092651367
29999 8.832708358764648
Result: y = 0.0014332497958093882 + 0.8553042411804199 x + -0.0007657706155441701 x^2 + -0.0933169424533844 x^3 + 0.0001045174285536632 x^4 ? + 0.0001045174285536632 x^5 ?
