## PyTorch Basics
This notebook has been created by referring to https://docs.pytorch.org/tutorials/beginner/pytorch_with_examples.html. I have added my own comments and markdown so that I can get a better understanding of the code in this notebook.

In [1]:
import numpy as np
import math

In [2]:
# Build the training data: 2000 evenly spaced inputs covering [-pi, pi]
# and the sine of each input as the target (deterministic, nothing random).
x = np.linspace(start=-math.pi, stop=math.pi, num=2000)
y = np.sin(x)

In [None]:
# Randomly initialize the four polynomial weights with standard-normal draws.
# A seeded Generator makes the starting point identical on every fresh
# "Restart Kernel -> Run All", so the training log below is reproducible.
rng = np.random.default_rng(0)
a = rng.standard_normal()  # constant term
b = rng.standard_normal()  # coefficient of x
c = rng.standard_normal()  # coefficient of x^2
d = rng.standard_normal()  # coefficient of x^3
# y is estimated with the degree-3 polynomial a + b x + c x^2 + d x^3

In [None]:
# Fit the cubic to y with plain gradient descent on the summed squared error.
# NOTE: this cell overwrites a, b, c, d, so running it again continues from
# the already-trained weights; re-run the weight-initialization cell first
# to start a fresh fit.
learning_rate = 1e-6

# The powers of x never change between iterations, so compute them once.
x_sq = x ** 2
x_cu = x ** 3

for t in range(2000):
    # Forward pass: compute predicted y
    # y = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x_sq + d * x_cu

    # Loss is the sum of squared residuals
    residual = y_pred - y
    loss = np.square(residual).sum()

    # Printing loss every 100 steps
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of the loss with respect to a, b, c, d
    grad_y_pred = 2.0 * residual  # d(loss)/d(y_pred) for (y_pred - y)^2
    grad_a = grad_y_pred.sum()  # Grad of y_pred wrt a is 1
    grad_b = (grad_y_pred * x).sum()  # Grad of y_pred wrt b is x
    grad_c = (grad_y_pred * x_sq).sum()  # Grad of y_pred wrt c is x^2
    grad_d = (grad_y_pred * x_cu).sum()  # Grad of y_pred wrt d is x^3

    # Update weights with one gradient-descent step
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 1297.258613490858
199 892.0357221869433
299 614.952387957907
399 425.2681610114328
499 295.2659870983222
599 206.0656077802425
699 144.7915749853648
799 102.65339284144608
899 73.64260469012405
999 53.64760144364074
1099 39.8515593668465
1199 30.32248791970258
1299 23.73377131975657
1399 19.173448226399273
1499 16.013902658460452
1599 13.82272831062528
1699 12.301686696850975
1799 11.244856521943131
1899 10.509907883359766
1999 9.998362835177137
Result: y = -0.032513252360881977 + 0.8717275227222708 x + 0.00560907705510671 x^2 + -0.09546211371544146 x^3


### Tensors
Tensors are n-dimensional arrays that can also run on a GPU.

In [3]:
import torch
import math

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [10]:
# Settings shared by the tensor cells below: 32-bit floats, stored on the CPU.
dtype = torch.float32
device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda:0")

In [5]:
# Build the training data as tensors: 2000 evenly spaced inputs covering
# [-pi, pi] and the sine of each input as the target (deterministic).
x = torch.linspace(start=-math.pi, end=math.pi, steps=2000, dtype=dtype, device=device)
y = x.sin()

In [None]:
# Randomly initialize weights as 0-dim tensors (taking the place of the
# NumPy scalars used earlier). Seeding the global RNG first makes the
# starting weights reproducible across fresh kernel runs.
torch.manual_seed(0)
a = torch.randn((), device=device, dtype=dtype)  # constant term
b = torch.randn((), device=device, dtype=dtype)  # coefficient of x
c = torch.randn((), device=device, dtype=dtype)  # coefficient of x^2
d = torch.randn((), device=device, dtype=dtype)  # coefficient of x^3

In [9]:
# Fit the cubic with hand-written gradients, using tensor operations.
# NOTE: the weight tensors are updated in place, so running this cell again
# continues from the already-trained weights; re-run the weight-initialization
# cell first to start a fresh fit.
learning_rate = 1e-6

# The powers of x never change between iterations, so compute them once.
x_sq = x ** 2
x_cu = x ** 3

for t in range(2000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x_sq + d * x_cu

    # Compute and print loss (.item() converts the 0-dim tensor to a Python float)
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of the loss with respect to a, b, c, d
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_sq).sum()
    grad_d = (grad_y_pred * x_cu).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 9.445923805236816
199 9.234925270080566
299 9.094813346862793
399 9.001764297485352
499 8.939935684204102
599 8.898845672607422
699 8.871530532836914
799 8.853363037109375
899 8.841277122497559
999 8.833234786987305
1099 8.827880859375
1199 8.82431411743164
1299 8.821938514709473
1399 8.820354461669922
1499 8.819296836853027
1599 8.81859016418457
1699 8.818120956420898
1799 8.817805290222168
1899 8.817594528198242
1999 8.81745433807373
Result: y = 0.00025946463574655354 + 0.8562779426574707 x + -4.476173489820212e-05 x^2 + -0.09326454252004623 x^3


### Autograd
Automatic differentiation (autograd) automates the computation of the backward pass in neural networks.


In [11]:
dtype = torch.float
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")
torch.set_default_device(device)

Using cpu device


In [14]:
# Input and target tensors. requires_grad is left at its default of False
# because no gradients with respect to the data are needed in the backward pass.
x = torch.linspace(start=-1, end=1, steps=2000, dtype=dtype)
# Target is exp(x); its Taylor expansion is 1 + x + (1/2) x**2 + (1/3!) x**3 + ...
y = x.exp()

In [15]:
# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
# Seeding the global RNG first makes the starting weights reproducible
# across fresh kernel runs.
torch.manual_seed(0)
a = torch.randn((), dtype=dtype, requires_grad=True)  # constant term
b = torch.randn((), dtype=dtype, requires_grad=True)  # coefficient of x
c = torch.randn((), dtype=dtype, requires_grad=True)  # coefficient of x^2
d = torch.randn((), dtype=dtype, requires_grad=True)  # coefficient of x^3
# Since these are the variable weights in the model, we compute gradients wrt these values

In [16]:
# Fit the cubic with gradient descent, letting autograd compute the gradients.
# NOTE: the weight tensors are updated in place, so running this cell again
# continues from the already-trained weights; re-run the weight-initialization
# cell first to start a fresh fit.
initial_loss = 1.  # placeholder; replaced by the real loss on the first iteration
learning_rate = 1e-5

# x is a fixed input, so its powers are the same on every iteration;
# compute them once instead of inside the loop.
x_sq = x ** 2
x_cu = x ** 3

for t in range(5000):
    # Forward pass: compute predicted y using operations on Tensors.
    y_pred = a + b * x + c * x_sq + d * x_cu

    # Compute the loss using operations on Tensors.
    # The summed loss is a 0-dim Tensor (shape ()), not shape (1,);
    # loss.item() gets the Python scalar value held in it.
    loss = (y_pred - y).pow(2).sum()

    # Record the initial loss, so we can report loss relative to it
    if t == 0:
        initial_loss = loss.item()

    if t % 100 == 99:
        # The :10.6f format spec already rounds to six decimals.
        print(f'Iteration t = {t:4d}  loss(t)/loss(0) = {loss.item()/initial_loss:10.6f}  a = {a.item():10.6f}  b = {b.item():10.6f}  c = {c.item():10.6f}  d = {d.item():10.6f}')

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call a.grad, b.grad, c.grad and d.grad will be Tensors holding
    # the gradient of the loss with respect to a, b, c, d respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    with torch.no_grad():  # Ensures no gradient tracking occurs within this block
        for w in (a, b, c, d):
            w -= learning_rate * w.grad
            # Reset the gradient after the update; otherwise the next
            # backward() call would accumulate into the old value.
            w.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

Iteration t =   99  loss(t)/loss(0) =   0.019752  a =   0.811036  b =   0.598489  c =   1.012849  d =   0.383710
Iteration t =  199  loss(t)/loss(0) =   0.005769  a =   0.869583  b =   0.773655  c =   0.885962  d =   0.464833
Iteration t =  299  loss(t)/loss(0) =   0.003252  a =   0.904209  b =   0.811387  c =   0.790785  d =   0.460992
Iteration t =  399  loss(t)/loss(0) =   0.002000  a =   0.929314  b =   0.827021  c =   0.721527  d =   0.444965
Iteration t =  499  loss(t)/loss(0) =   0.001305  a =   0.947573  b =   0.838583  c =   0.671152  d =   0.428030
Iteration t =  599  loss(t)/loss(0) =   0.000907  a =   0.960854  b =   0.848924  c =   0.634511  d =   0.411883
Iteration t =  699  loss(t)/loss(0) =   0.000670  a =   0.970515  b =   0.858524  c =   0.607860  d =   0.396727
Iteration t =  799  loss(t)/loss(0) =   0.000521  a =   0.977541  b =   0.867496  c =   0.588475  d =   0.382539
Iteration t =  899  loss(t)/loss(0) =   0.000422  a =   0.982652  b =   0.875888  c =   0.574375