# Symbolic differentiation (own)

In [52]:
class DerivativeType :
  def __init__(self,name=None):
    self.name = name
  def getname(self):
    return self.name
  def derivative(self,paramlist=None): # backward function of pytorch
    print ('inside derivative',self.name)
    return None
  def compute(self,paramlist=None): # forward function of pytorch
    print ('inside compute',self.name)
    return None
  def __str__(self):
    if self.name is None:
      return 'None'
    return self.name

In [53]:
def display(derobj_list):
  for der in derobj_list:
    print (der)

In [54]:
class IllegalDerivativeOperation(Exception):
  def __init__(self):
    pass
  def __str__(self):
    return 'IllegalDerivativeOperation'

x = IllegalDerivativeOperation()

print (x)

IllegalDerivativeOperation


In [55]:
x = DerivativeType('some name xx')
x.derivative()
x.compute()
print (x)

inside derivative some name xx
inside compute some name xx
some name xx


In [56]:
class MyConstant(DerivativeType):
  def __init__(self,name,value):
    super().__init__(name=name)
    self.value = value

  def derivative(self,paramlist=None):
    derout_list = []

    if paramlist is not None:
      for param in paramlist:
        z = MyConstant('zero',0)
        derout_list.append(z)
    return derout_list

  def compute(self):
    return self.value

  def __str__(self):
    return '(' + self.name + ')'

x = MyConstant('y',3)
print (x.getname())
y = x.derivative([x])
print ('printing derivative...')
display(y)
print (x.compute())

print (x)

y
printing derivative...
(zero)
3
(y)


In [57]:
try :
  print ('kali before raise')
  raise IllegalDerivativeOperation
  print ('kali after raise')
except Exception as e:
  print (e)

kali before raise
IllegalDerivativeOperation


In [58]:
class MyVariable(DerivativeType):
  def __init__(self,name, value):
    super().__init__(name)
    self.value = value

  def derivative(self,paramlist=None):
    if paramlist is None:
      raise IllegalDerivativeOperation

    der_list = []
    if paramlist is not None:
      for param in paramlist :
        if param.getname() != self.getname():
          der_list.append(MyConstant('zero',0))
        else :
          der_list.append(MyConstant('one',1))
    return der_list

  def compute(self):
    return self.value

  def __str__(self):
    return '(' + self.name + ')'

x = MyVariable('x',1)
y = MyVariable('y',2)

z = x.derivative([x,y])
display(z)

print (x.compute())

(one)
(zero)
1


In [59]:
class MyUMinus(DerivativeType):
  def __init__(self,x):
    super().__init__()
    self.x = x
  def derivative(self,paramlist):
    derout_list = []
    if paramlist is not None:
      for param in paramlist:
        z = MyUMinus(self.x.derivative([param])[0])
        derout_list.append(z)
    return derout_list

  def compute(self):
    return -1 * self.x.compute()

  def __str__(self):
    return '-' + str(self.x)

x = MyVariable('x',1)
y = MyVariable('y',23)
ux = MyUMinus(x)

print (ux)

# z = ux.derivative([x,y,x,ux]) ux derivative logic to be fixed
z = ux.derivative([x,y])

display(z)

-(x)
-(one)
-(zero)


In [60]:
class MyAddition(DerivativeType):
  def __init__(self,x,y):
    self.x = x
    self.y = y

  def derivative(self,paramlist=None):
    derout_list = []

    if paramlist is not None:
      for param in paramlist:
        a = self.x.derivative([param])[0]
        b = self.y.derivative([param])[0]
        c = MyAddition(a,b)
        derout_list.append(c)

    return derout_list

  def compute(self):
    z = self.x.compute() + self.y.compute()
    return z

  def __str__(self):
    mystr = '(' + str(self.x) + '+' + str(self.y) + ')'
    return mystr


x = MyVariable('x',1)
y = MyVariable('y',23)

print (x,y)

z = MyAddition(x,y)

print (z)

z1 = z.derivative([x,y])

display(z1)

print (z.compute())

(x) (y)
((x)+(y))
((one)+(zero))
((zero)+(one))
24


In [61]:
class MyMultiplication(DerivativeType):
  def __init__(self,x,y):
    self.x = x
    self.y = y

  def derivative(self,paramlist=None):
    derout_list = []
    if paramlist is not None:
      for param in paramlist:
        a = MyMultiplication(self.x.derivative([param])[0], y)
        b = MyMultiplication(self.x, self.y.derivative([param])[0])
        c = MyAddition(a,b)
        derout_list.append(c)
    return derout_list

  def compute(self):
    z = self.x.compute() * self.y.compute()
    return z

  def __str__(self):
    return '('+str(self.x) + '*' + str(self.y)+')'


x = MyVariable('x',1)
y = MyVariable('y',2)

z = MyMultiplication(x,y)

print (z)

t = z.derivative([x,y])

display (t)

((x)*(y))
(((one)*(y))+((x)*(zero)))
(((zero)*(y))+((x)*(one)))


In [62]:
x = MyVariable('x',1)
y = MyVariable('y',1)
t1 = MyMultiplication(x,y)
t2 = MyAddition(x,t1)
t3 = MyMultiplication(t2,t2)
print (t3)

(((x)+((x)*(y)))*((x)+((x)*(y))))


In [63]:
z = t3.derivative([x])
display(z)
print (z[0])
print (z[0].compute())

((((one)+(((one)*(y))+((x)*(zero))))*(y))+(((x)+((x)*(y)))*((one)+(((one)*(y))+((x)*(zero))))))
((((one)+(((one)*(y))+((x)*(zero))))*(y))+(((x)+((x)*(y)))*((one)+(((one)*(y))+((x)*(zero))))))
6


In [64]:
z = t3.derivative([x])

display (z)

((((one)+(((one)*(y))+((x)*(zero))))*(y))+(((x)+((x)*(y)))*((one)+(((one)*(y))+((x)*(zero))))))


In [65]:
x = MyVariable('x',1)
y = MyVariable('x',1)
t1 = MyAddition(MyMultiplication(x,y),MyConstant('1',1))
print (t1)

dt = t1.derivative([x])

display(dt)

(((x)*(x))+(1))
((((one)*(x))+((x)*(one)))+(zero))


In [66]:
class MyMulInv(DerivativeType):
  def __init__(self,x):
    super().__init__()
    self.x = x
  def derivative(self,paramlist):
    deroutlist = []
    if paramlist is not None:
      for param in paramlist:
        ...
        ...

In [67]:
class MyLog(DerivativeType):
  def __init__(self,x):
    super().__init__()
    self.x = x

  def derivative(self,paramlist):
    derout_list = []
    if paramlist is not None:
      for param in paramlist:
        pass


In [68]:
class MyExponentiation(DerivativeType):
  def __init__(self,x,y):
    super().__init__()
    self.x = x
    self.y = y

  def derivative(self,paramlist):
    derout_list = []
    if paramlist is not None:
      for param in paramlist:
        a = MyMultiplication(y,MyExponentiation(x,MyAddition(y,MyUMinus(MyConstant('-1')))))
        b = MyMultiplication(MyExponentiation(x,y),MyLog(x))



# Clarification regarding library function calls

In [69]:
# the following is an example of a function inside the python library
def myfunction(x):
  x.__myfunction__()

def myfunction2(x):
  x.__myfunction2__()

# end of example library

# your class should implement __xx__() type of function
class A :
  def __init__(self):
    pass
  def __myfunction__(self):
    pass
    print ('inside __myfunction__')

  def __myfunction2__(self):
    print ('inside __myfunction2__')

a = A()

myfunction(a)
myfunction2(a)

inside __myfunction__
inside __myfunction2__


In [70]:
class MyDataLoader:

  def __init__(self,batch_size):
    self.batch_size = batch_size

  def __iter__(self):
    self.a = 1
    return self

  def __next__(self):
    if self.a <= self.batch_size:
      x = self.a
      self.a += 1
      print ('returning a row',self.a)
      return x
    else:
      raise StopIteration

myclass = MyDataLoader(5)
myiter = iter(myclass)

for x in myiter:
  print(x)

returning a row 2
1
returning a row 3
2
returning a row 4
3
returning a row 5
4
returning a row 6
5


# 'with' clause

In [71]:
# Exception class
class MyException(Exception):
  def __init__(self,msg):
    self.msg = msg

  def __str__(self):
    mystr = 'My Custom Exception ' + self.msg
    return mystr

In [72]:
t1 = MyException('abc')
t2 = MyException('pqr')
print (t1)
print (t2)

My Custom Exception abc
My Custom Exception pqr


In [73]:
try :
  print ('Example try')
  raise MyException('some msg text text')
  print ('After exception is raised (this code will never be reached)')
except Exception as eobj:
  print ('Exception handling code')
  print (eobj)

Example try
Exception handling code
My Custom Exception some msg text text


In [74]:
class MyClass:

  def __init__(self, name):
    print ('initialized',name)
    self.name = name

  def op1(self, params):
    print ('operation 1', params)

  def op2(self, params):
    print ('operation 2', params)

  def close(self):
    print ('inside code',self.name)

class MyClassHandler:
    def __init__(self1, name):
        self1.name = name

    def __enter__(self1):
        self1.x = MyClass(self1.name)
        return self1.x

    # REF - https://stackoverflow.com/questions/22417323/how-do-enter-and-exit-work-in-python-decorator-classes
    def __exit__(self1,exc_type, exc_val, tb):
      # you can write your own closing logic
      return self1



In [75]:
with MyClassHandler('some name') as xobj:
    xobj.op1('params 1')
    xobj.op2('param 2')
    xobj.op1('param 3')
    xobj.op1('param xx')

initialized some name
operation 1 params 1
operation 2 param 2
operation 1 param 3
operation 1 param xx


# Pytorch - Curve fitting

## Numpy based approach

In [76]:
# -*- coding: utf-8 -*-
import numpy as np
import math

# Create random input and output data
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)
# y = -34.7 + 12.9 * x + 8.1145 * x**2

# Randomly initialize weights
a = np.random.randn()
b = np.random.randn()
c = np.random.randn()
d = np.random.randn()

learning_rate = 1e-6 #eta
for t in range(2000):
    # Forward pass: compute predicted y
    # y = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 523.2439208158174
199 371.58230413003054
299 264.6889876653193
399 189.33019519889345
499 136.1904923713094
599 98.71044567708648
699 72.26984908383835
799 53.613480301751395
899 40.44721405750441
999 31.153849355304974
1099 24.593097488229972
1199 19.960759531784277
1299 16.689546696670675
1399 14.379208858096664
1499 12.747296964861729
1599 11.594456892957329
1699 10.77996011288906
1799 10.20444778368214
1899 9.797759160066974
1999 9.510344513023893
Result: y = 0.027654647269344076 + 0.8599033490033596 x + -0.004770886829278731 x^2 + -0.09378022796973699 x^3


## Tensor based approach (without autograd)

In [77]:
# -*- coding: utf-8 -*-

import torch
import math


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Create random input and output data
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Randomly initialize weights
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 1124.541748046875
199 759.2581787109375
299 514.010986328125
399 349.2205505371094
499 238.39808654785156
599 163.8043212890625
699 113.55054473876953
799 79.66300964355469
899 56.78982925415039
999 41.33567810058594
1099 30.883827209472656
1199 23.80771827697754
1299 19.01201057434082
1399 15.758316040039062
1499 13.548377990722656
1599 12.045747756958008
1699 11.022858619689941
1799 10.325796127319336
1899 9.850207328796387
1999 9.52535343170166
Result: y = 0.020704247057437897 + 0.839200496673584 x + -0.003571827430278063 x^2 + -0.09083542972803116 x^3


## Tensor based (WITH AUTOGRAD)

In [78]:
# -*- coding: utf-8 -*-
import torch
import math

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d = torch.randn((), device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y using operations on Tensors.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call a.grad, b.grad. c.grad and d.grad will be Tensors holding
    # the gradient of the loss with respect to a, b, c, d respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Manually zero the gradients after updating weights
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 721.2719116210938
199 491.8224182128906
299 336.62188720703125
399 231.5345001220703
499 160.30419921875
599 111.97098541259766
699 79.13842010498047
799 56.810943603515625
899 41.610076904296875
999 31.24951934814453
1099 24.17989158630371
1199 19.350385665893555
1299 16.047348022460938
1399 13.785717010498047


1499 12.235374450683594
1599 11.171392440795898
1699 10.44038200378418
1799 9.937561988830566
1899 9.59131908416748
1999 9.35263442993164
Result: y = -0.020062431693077087 + 0.8696451187133789 x + 0.0034611052833497524 x^2 + -0.0951659083366394 x^3


## Tensor based WITH AUTOGRAD + OPTIM

In [79]:
# -*- coding: utf-8 -*-
import torch
import math

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.

# x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
x = torch.linspace(0, 1, 100, device=device, dtype=dtype)

# y = torch.sin(x)
y = 3 + 4 * x + 5 * x**2 + 6 * x**3

# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d = torch.randn((), device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-4

# 2022 Aug 23 - Kalidas (ykalidas@iittp.ac.in)
optimizer = torch.optim.SGD([a,b,c,d], lr=learning_rate, momentum=0)


In [80]:
for t in range(10000):
    # Forward pass: compute predicted y using operations on Tensors.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()

    # 2022 Aug 23 - Kalidas (ykalidas@iittp.ac.in)
    optimizer.zero_grad()

    # 2023 Jan 11 - There is some problem with loss function being invoked on all the data points here,
    # Where is SGD type happening here? (CS20B Ganesh' query.)
    loss.backward()

    # 2023 Jan 11 - Optimizer step() should invoke loss function on one data point and update,
    # Where is the invocation on one random point happening here?
    optimizer.step()

    # I will check and get back... 2023 Jan 11 - Kalidas
    # Answer: https://pytorch.org/docs/stable/optim.html
    # Its more tricky than the one shown here.

    # For actual SGD, the code as below should be worked out!
    # for input, target in dataset:
    # def closure():
    #    optimizer.zero_grad()
    #    rand_idx = 12 # figure out a random index among x
    #    rand_inp = input[rand_idx]
    #    rand_tar = target[rand_idx]
    #    output = model(rand_inp)
    #    loss = loss_fn(output, rand_tar)
    #    loss.backward()
    #    return loss
    # optimizer.step(closure)


    if t % 1000 == 99:
        print(t, loss.item())

99 366.96136474609375


1099 0.6785367131233215
2099 0.25696414709091187
3099 0.19757629930973053
4099 0.1529243290424347
5099 0.11914822459220886
6099 0.09359253942966461
7099 0.07425563037395477
8099 0.05961895361542702
9099 0.048537544906139374


In [81]:
print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

Result: y = 2.998710870742798 + 3.7585716247558594 x + 5.91465950012207 x^2 + 5.268720626831055 x^3


## Tensor + Autograd + Optim + (a,b,c,d as tensor <b>Parameter class</b> instead of individual variable)

In [82]:
# -*- coding: utf-8 -*-
import torch
import math

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.

x = torch.linspace(0, 1, 100, device=device, dtype=dtype)

y = 3 + 4 * x + 5 * x**2 + 6 * x**3

# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
# a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
# b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
# c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
# d = torch.randn((), device=device, dtype=dtype, requires_grad=True)

w = torch.randn((4,1),device=device, dtype=dtype, requires_grad=True)

In [83]:
p = torch.nn.Parameter(w)
print (p)

Parameter containing:
tensor([[-0.2198],
        [-1.7268],
        [-0.5194],
        [-0.7132]], requires_grad=True)


In [84]:
x = x.reshape(-1,1)

In [85]:
x4 = torch.cat([x**0, x**1, x**2, x**3], dim=1)

print (x4.shape)

torch.Size([100, 4])


In [86]:
print (p.shape)

torch.Size([4, 1])


In [87]:
y_pred = torch.matmul(x4,p)
print (y_pred.shape)

torch.Size([100, 1])


In [88]:
loss = (y_pred - y).pow(2).sum()

In [89]:
print (loss)

tensor(1122351.6250, grad_fn=<SumBackward0>)


In [90]:
loss.backward()

In [91]:
print (p.grad)

tensor([[-192551.5156],
        [-101190.0859],
        [ -69606.0312],
        [ -53342.7969]])


In [92]:
optimizer.zero_grad()
w = torch.randn((4,1),device=device, dtype=dtype, requires_grad=True)
p = torch.nn.Parameter(w)

print (p)

learning_rate = 1e-4
# 2022 Aug 23 - Kalidas (ykalidas@iittp.ac.in)
optimizer = torch.optim.SGD([p], lr=1e-6, momentum=0)

print (p)

optimizer.zero_grad()

print (p)

y_pred = torch.matmul(x4,p)

loss = (y_pred - y).pow(2).sum()

loss.backward()

print (p)

optimizer.step()

print (p)

Parameter containing:
tensor([[ 0.0838],
        [ 1.2840],
        [ 0.1710],
        [-1.0790]], requires_grad=True)
Parameter containing:
tensor([[ 0.0838],
        [ 1.2840],
        [ 0.1710],
        [-1.0790]], requires_grad=True)
Parameter containing:
tensor([[ 0.0838],
        [ 1.2840],
        [ 0.1710],
        [-1.0790]], requires_grad=True)
Parameter containing:
tensor([[ 0.0838],
        [ 1.2840],
        [ 0.1710],
        [-1.0790]], requires_grad=True)
Parameter containing:
tensor([[ 0.2374],
        [ 1.3600],
        [ 0.2218],
        [-1.0406]], requires_grad=True)


In [93]:
for t in range(10000):
    # Forward pass: compute predicted y using operations on Tensors.
    y_pred = torch.matmul(x4, p)


    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()

    # 2022 Aug 23 - Kalidas (ykalidas@iittp.ac.in)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if t % 1000 == 99:
        print(t, loss.item())

99 209529.671875


1099 188704.21875
2099 188566.5
3099 188477.015625
4099 188409.375
5099 188358.203125
6099 188319.546875
7099 188290.3125
8099 188268.203125
9099 188251.46875


In [94]:
for x in [a,b,c,d]:
  x.requires_grad = False

In [95]:
print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

Result: y = 2.998710870742798 + 3.7585716247558594 x + 5.91465950012207 x^2 + 5.268720626831055 x^3


In [96]:

import torch
import math


class LegendrePolynomial3(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return 0.5 * (5 * input ** 3 - 3 * input)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        return grad_output * 1.5 * (5 * input ** 2 - 1)


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Create random Tensors for weights. For this example, we need
# 4 weights: y = a + b * P3(c + d * x), these weights need to be initialized
# not too far from the correct result to ensure convergence.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)

learning_rate = 5e-6
for t in range(2000):
    # To apply our Function, we use Function.apply method. We alias this as 'P3'.
    P3 = LegendrePolynomial3.apply

    # Forward pass: compute predicted y using operations; we compute
    # P3 using our custom autograd operation.
    y_pred = a + b * P3(c + d * x)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Manually zero the gradients after updating weights
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')

99 209.95834350585938
199 144.66018676757812
299 100.70249938964844
399 71.03519439697266
499 50.978511810302734
599 37.403133392333984
699 28.206867218017578
799 21.97318458557129
899 17.7457275390625
999 14.877889633178711
1099 12.93176555633545
1199 11.610918045043945
1299 10.71425724029541
1399 10.10548210144043
1499 9.692105293273926
1599 9.411375999450684
1699 9.220745086669922
1799 9.091285705566406
1899 9.003361701965332
1999 8.943641662597656
Result: y = -6.71270206087371e-10 + -2.208526849746704 * P3(-3.392665037793563e-10 + 0.2554861009120941 x)


In [97]:
# -*- coding: utf-8 -*-
import torch
import math


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# For this example, the output y is a linear function of (x, x^2, x^3), so
# we can consider it as a linear layer neural network. Let's prepare the
# tensor (x, x^2, x^3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# In the above code, x.unsqueeze(-1) has shape (2000, 1), and p has shape
# (3,), for this case, broadcasting semantics will apply to obtain a tensor
# of shape (2000, 3)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. The Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# The Flatten layer flatens the output of the linear layer to a 1D tensor,
# to match the shape of `y`.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(2000):

    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(xx)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

# You can access the first layer of `model` like accessing the first item of a list
linear_layer = model[0]

# For linear layer, its parameters are stored as `weight` and `bias`.
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 537.861083984375


199 359.9160461425781
299 241.8788604736328
399 163.56446838378906
499 111.59332275390625
599 77.09587097167969
699 54.19139099121094
799 38.980224609375
899 28.87520980834961
999 22.16036033630371
1099 17.696809768676758
1199 14.728796005249023
1299 12.754524230957031
1399 11.440767288208008
1499 10.56618881225586
1599 9.983724594116211
1699 9.595646858215332
1799 9.336956024169922
1899 9.164433479309082
1999 9.049310684204102
Result: y = -0.006551248021423817 + 0.8432069420814514 x + 0.0011301989434286952 x^2 + -0.09140530973672867 x^3


In [98]:
# -*- coding: utf-8 -*-
import torch
import math


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Prepare the input tensor (x, x^2, x^3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use RMSprop; the optim package contains many other
# optimization algorithms. The first argument to the RMSprop constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(2000):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(xx)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()


linear_layer = model[0]
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 5235.78173828125
199 1161.5919189453125


299 363.5297546386719
399 250.62913513183594
499 197.06393432617188
599 149.46275329589844
699 106.71774291992188
799 70.25125122070312
899 42.08367156982422
999 23.255905151367188
1099 13.235845565795898
1199 9.565327644348145
1299 8.861568450927734
1399 8.8418550491333
1499 8.818650245666504
1599 8.829854011535645
1699 8.90394401550293
1799 8.89794635772705
1899 8.950279235839844
1999 8.918556213378906
Result: y = -0.00041789887472987175 + 0.8562489748001099 x + -0.00041799014434218407 x^2 + -0.09382333606481552 x^3


In [99]:
# -*- coding: utf-8 -*-
import torch
import math


class Polynomial3(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate four parameters and assign them as
        member parameters.
        """
        super().__init__()
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3

    def string(self):
        """
        Just like any class in Python, you can also define custom method on PyTorch modules
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Construct our model by instantiating the class defined above
model = Polynomial3()

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters (defined
# with torch.nn.Parameter) which are members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
for t in range(2000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

99 1849.756103515625
199 1297.1942138671875
299 911.0399169921875


399 640.9949340820312
499 452.0255126953125
599 319.7074279785156
699 227.00320434570312
799 162.0160675048828
899 116.43463897705078
999 84.44781494140625
1099 61.990169525146484
1199 46.215545654296875
1299 35.13043212890625
1399 27.337444305419922
1499 21.856719970703125
1599 18.00076675415039
1699 15.286951065063477
1799 13.37632942199707
1899 12.03078842163086
1999 11.082903861999512
Result: y = 0.049035340547561646 + 0.8461071848869324 x + -0.00845941249281168 x^2 + -0.09181784838438034 x^3


In [100]:
# -*- coding: utf-8 -*-
import random
import torch
import math


class DynamicNet(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate five parameters and assign them as members.
        """
        super().__init__()
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))
        self.e = torch.nn.Parameter(torch.randn(()))

    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 4, 5
        and reuse the e parameter to compute the contribution of these orders.

        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same parameter many
        times when defining a computational graph.
        """
        y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
        for exp in range(4, random.randint(4, 6)):
            y = y + self.e * x ** exp
        return y

    def string(self):
        """
        Just like any class in Python, you can also define custom method on PyTorch modules
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Construct our model by instantiating the class defined above
model = DynamicNet()

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)
for t in range(30000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 2000 == 1999:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

1999 721.32568359375
3999 332.1724548339844
5999 157.57943725585938
7999 76.64330291748047
9999 41.466102600097656
11999 24.678539276123047
13999 15.313752174377441
15999 11.8419771194458
17999 10.098008155822754
19999 9.506826400756836
21999 9.154935836791992
23999 8.96929931640625
25999 8.942845344543457
27999 8.548627853393555
29999 8.860589027404785
Result: y = 0.004241105634719133 + 0.8554676175117493 x + -0.00131724844686687 x^2 + -0.09346926212310791 x^3 + 9.771926852408797e-05 x^4 ? + 9.771926852408797e-05 x^5 ?
