# Examples1_learning_with_examples

## 1. 第一次尝试：手动forward and backward

In [1]:
import torch
import math

In [2]:
# -*- coding: utf-8 -*-
# Fit y = a + b x + c x^2 + d x^3 to sin(x) with hand-written gradients.
dtype = torch.float
device = torch.device('cpu')

# Inputs: 2000 evenly spaced points on [-pi, pi]; targets: sin at those points.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# The powers of x never change, so compute them once up front.
x_squared = x ** 2
x_cubed = x ** 3

# Seed the generator, then draw the four scalar weights in the order a, b, c, d.
torch.manual_seed(231)
a, b, c, d = (torch.randn((), device=device, dtype=dtype) for _ in range(4))

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: evaluate the cubic and the summed squared error.
    y_pred = a + b * x + c * x_squared + d * x_cubed
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if t % 200 == 99:
        print(f"{t}: {loss}")

    # Backward pass: d(loss)/d(y_pred), then the chain rule for each weight.
    grad_y_pred = 2 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_squared).sum()
    grad_d = (grad_y_pred * x_cubed).sum()

    # Gradient-descent step, applied in place to each weight tensor.
    for weight, grad in ((a, grad_a), (b, grad_b), (c, grad_c), (d, grad_d)):
        weight -= learning_rate * grad

print(f'Result: y = {a.item()} + {b.item()}x + {c.item()}x^2 + {d.item()}x^3')

99: 1477.5787353515625
299: 722.48388671875
499: 356.6500549316406
699: 178.82565307617188
899: 92.1266098022461
1099: 49.73776626586914
1299: 28.959972381591797
1499: 18.751493453979492
1699: 13.72527027130127
1899: 11.245824813842773
Result: y = -0.04202675819396973 + 0.8455353379249573x + 0.007250312250107527x^2 + -0.09173650294542313x^3


## 2. autograd

In [3]:
# -*- coding: utf-8 -*-
# Same cubic fit of sin(x), but the gradients come from autograd.
dtype = torch.float
device = torch.device('cpu')

# Inputs: 2000 evenly spaced points on [-pi, pi]; targets: sin at those points.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Seed the generator, then draw the four scalar weights in the order a, b, c, d.
# requires_grad=True makes autograd track operations on them.
torch.manual_seed(231)
a, b, c, d = (
    torch.randn((), device=device, dtype=dtype, requires_grad=True)
    for _ in range(4)
)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass.
    y_pred = a + b * x + c * (x ** 2) + d * (x ** 3)
    loss = (y_pred - y).pow(2).sum()
    if t % 200 == 99:
        print(f"{t}: {loss.item()}")

    # Backward pass: fills in .grad on a, b, c and d.
    loss.backward()

    # The weight update itself must not be tracked, so autograd is paused.
    with torch.no_grad():
        for weight in (a, b, c, d):
            weight -= learning_rate * weight.grad
            # Drop the gradient so the next backward() does not accumulate
            # on top of this iteration's value.
            weight.grad = None

print(f'Result: y = {a.item()} + {b.item()}x + {c.item()}x^2 + {d.item()}x^3')

99: 1477.5787353515625
299: 722.48388671875
499: 356.6500549316406
699: 178.82565307617188
899: 92.1266098022461
1099: 49.73776626586914
1299: 28.959972381591797
1499: 18.751493453979492
1699: 13.72527027130127
1899: 11.245824813842773
Result: y = -0.04202675819396973 + 0.8455353379249573x + 0.007250312250107527x^2 + -0.09173650294542313x^3


## 3. 自定义autograd function
1. each primitive autograd operator包含了两个function：forward function 用input tensor计算output tensor；backward function收到output Tensors 相对于某个scalar value的梯度，然后用这个梯度来计算input tensor相对于该scalar value的梯度
2. 可以通过定义torch.autograd.Function的子类的方式来自定义autograd operator，实现forward and backward functions。

这里取$y=a + b*P_3(c + d*x)$来代替前面的$y=a + b*x + c*x^2 + d*x^3$。$P_3(x)=\frac{1}{2}(5x^3-3x)$是三阶Legendre polynomial。

In [4]:
import torch
import math

class LegendrePolynomial3(torch.autograd.Function):
    """Custom autograd operator computing P3(x) = 0.5 * (5 x^3 - 3 x)."""

    @staticmethod
    def forward(ctx, input):
        """
        Compute P3 element-wise on `input`.

        `ctx` is a context object used to stash information for the backward
        pass; the input tensor is cached on it via `save_for_backward`.
        """
        ctx.save_for_backward(input)
        polynomial = 5 * input ** 3 - 3 * input
        return 0.5 * polynomial

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient w.r.t. the output into the gradient w.r.t. the input.

        Uses dP3/dx = 1.5 * (5 x^2 - 1), evaluated at the tensor saved in
        `forward`.
        """
        (saved_input,) = ctx.saved_tensors
        scaled_upstream = grad_output * 1.5
        return scaled_upstream * (5 * saved_input ** 2 - 1)

# Fit y = a + b * P3(c + d x) to sin(x), using the custom autograd Function.
# Inputs: 2000 evenly spaced points on [-pi, pi]; targets: sin at those points.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# The weights are deliberately initialised close to the correct values.
a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)

# A custom autograd Function is invoked through its `apply` method. The alias
# does not change between iterations, so it is created once outside the loop.
P3 = LegendrePolynomial3.apply

learning_rate = 5e-6
for t in range(2000):
    # Forward pass.
    y_pred = a + b * P3(c + d * x)
    loss = (y_pred - y).pow(2).sum()
    if t % 200 == 99:
        print(f"{t}: {loss.item()}")

    # Backward pass: fills in .grad on a, b, c and d.
    loss.backward()

    # The weight update itself must not be tracked, so autograd is paused.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Drop the gradients so the next backward() does not accumulate on
        # top of this iteration's values.
        a.grad = b.grad = c.grad = d.grad = None

# Report the model that was actually fitted: a + b * P3(c + d x), not a cubic
# in x with coefficients a, b, c, d.
print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')

99: 209.95834350585938
299: 100.70249938964844
499: 50.978511810302734
699: 28.206867218017578
899: 17.7457275390625
1099: 12.93176555633545
1299: 10.71425724029541
1499: 9.692105293273926
1699: 9.220745086669922
1899: 9.003361701965332
Result: y = -6.71270206087371e-10 + -2.208526849746704x + -3.392665037793563e-10x^2 + 0.2554861009120941x^3


## 4. 用nn package中的module
1. 直接用autograd对于构建复杂的NN而言，还是太底层。使用nn module直接以layer的形式来搭建网络更方便，因为抽象层次更高。
2. nn package中定义了很多modules，这些modules实现了NN中的layers, loss functions等。module接收input tensor，计算output tensor。同时也会存放weights等internal state。

· <font color=blue>这里以linear layer为例，所以拟合的函数改为$y = f(x, x^2, x^3)$</font>

In [5]:
import torch
import math


# Fit sin(x) with a linear layer applied to the features (x, x^2, x^3).
# Inputs: 2000 evenly spaced points on [-pi, pi]; targets: sin at those points.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# x.unsqueeze(-1) has shape (2000, 1) and p has shape (3,), so broadcasting
# applies and xx has shape (2000, 3): one column per power of x.
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Define the model as a sequence of layers.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),  # output of the linear layer has shape (2000, 1)
    torch.nn.Flatten(0, 1)  # flattens the linear output to 1D to match `y`
)

# Define the loss function: summed (not averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(2000):

    # Forward pass: Module objects override the __call__ operator,
    # so they can be called like functions.
    y_pred = model(xx)
    loss = loss_fn(y_pred, y)
    if t % 200 == 99:
        print(f"{t}: {loss.item()}")

    # Zero the gradients (not the weights) before backpropagation.
    model.zero_grad()

    # Backward pass.
    loss.backward()

    # The weight update itself must not be tracked, so autograd is paused.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

# The first layer of `model` can be accessed like the first item of a list.
linear_layer = model[0]

# A linear layer stores its parameters as `weight` and `bias`.
# Adjacent f-strings are used instead of backslash continuations inside one
# literal, which would embed the next line's indentation in the printed text.
print(
    f'Result: y = {linear_layer.bias.item()} + '
    f'{linear_layer.weight[:, 0].item()} x + '
    f'{linear_layer.weight[:, 1].item()} x^2 + '
    f'{linear_layer.weight[:, 2].item()} x^3'
)

99: 413.55877685546875
299: 194.99159240722656
499: 94.82963562011719
699: 48.7336311340332
899: 27.42612648010254
1099: 17.532577514648438
1299: 12.917770385742188
1499: 10.755277633666992
1699: 9.73727798461914
1899: 9.255874633789062
Result: y = -0.015080885030329227 +                     0.8469998240470886 x +                     0.0026017031632363796 x^2 +                     -0.09194481372833252 x^3


## 5. 使用optim package

In [6]:
import torch
import math


# Same linear model on (x, x^2, x^3), now updated by an optimizer.
# Inputs: 2000 evenly spaced points on [-pi, pi]; targets: sin at those points.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# x.unsqueeze(-1) has shape (2000, 1) and p has shape (3,), so broadcasting
# applies and xx has shape (2000, 3): one column per power of x.
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Define the model as a sequence of layers.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),  # output of the linear layer has shape (2000, 1)
    torch.nn.Flatten(0, 1)  # flattens the linear output to 1D to match `y`
)

# Define the loss function: summed (not averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Define the optimizer; its first argument tells it which parameters to update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(2000):

    # Forward pass: Module objects override the __call__ operator,
    # so they can be called like functions.
    y_pred = model(xx)
    loss = loss_fn(y_pred, y)
    if t % 200 == 99:
        print(f"{t}: {loss.item()}")

    # Zero the gradients (not the weights) before backpropagation; with an
    # optimizer in use, this is done through the optimizer.
    optimizer.zero_grad()

    # Backward pass.
    loss.backward()

    # Update the weights.
    optimizer.step()

# The first layer of `model` can be accessed like the first item of a list.
linear_layer = model[0]

# A linear layer stores its parameters as `weight` and `bias`.
# Adjacent f-strings are used instead of backslash continuations inside one
# literal, which would embed the next line's indentation in the printed text.
print(
    f'Result: y = {linear_layer.bias.item()} + '
    f'{linear_layer.weight[:, 0].item()} x + '
    f'{linear_layer.weight[:, 1].item()} x^2 + '
    f'{linear_layer.weight[:, 2].item()} x^3'
)

99: 237.22927856445312
299: 32.75457000732422
499: 18.58349609375
699: 10.452796936035156
899: 8.833868980407715
1099: 8.817211151123047
1299: 8.822708129882812
1499: 8.891637802124023
1699: 8.916147232055664
1899: 8.917805671691895
Result: y = -0.0005856486386619508 +                     0.8562260866165161 x +                     -0.0005856484640389681 x^2 +                     -0.0938451737165451 x^3


## 6. 自定义nn module
· subclassing nn.Module and defining a forward which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors.

In [7]:
## implement the third order polynomial as a custom Module
class Polynomial3(torch.nn.Module):
    """Third-order polynomial y = a + b x + c x^2 + d x^3 with learnable scalar coefficients."""

    def __init__(self):
        super().__init__()
        # Each coefficient is a 0-dim tensor drawn from torch.randn and wrapped
        # in nn.Parameter, created in the order a, b, c, d.
        for coeff_name in ('a', 'b', 'c', 'd'):
            setattr(self, coeff_name, torch.nn.Parameter(torch.randn(())))

    def forward(self, x):
        """
        Evaluate the polynomial on the input Tensor `x` and return the
        resulting Tensor.
        """
        linear_term = self.b * x
        quadratic_term = self.c * x ** 2
        cubic_term = self.d * x ** 3
        return self.a + linear_term + quadratic_term + cubic_term

    def string(self):
        """
        Return the polynomial with its current coefficient values as text.
        Custom methods can be defined on a Module like on any Python class.
        """
        a, b, c, d = (coeff.item() for coeff in (self.a, self.b, self.c, self.d))
        return f'y = {a} + {b} x + {c} x^2 + {d} x^3'

# Inputs: 2000 evenly spaced points on [-pi, pi]; targets: sin at those points.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Instantiate the custom module defined above.
model = Polynomial3()

# Summed squared error as the loss, plain SGD as the optimizer.
# model.parameters() hands the optimizer the learnable members of the model
# (those defined with torch.nn.Parameter).
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)

for t in range(2000):
    # Start every iteration from cleared gradients.
    optimizer.zero_grad()

    # Forward pass: predict y by passing x to the model, then score it.
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Report the loss on every 100th iteration.
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass followed by the weight update.
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

99 1036.6336669921875
199 690.2674560546875
299 460.70635986328125
399 308.53466796875
499 207.6463165283203
599 140.74659729003906
699 96.37659454345703
799 66.94302368164062
899 47.413673400878906
999 34.452789306640625
1099 25.849281311035156
1199 20.136619567871094
1299 16.342504501342773
1399 13.82181167602539
1499 12.146681785583496
1599 11.033086776733398
1699 10.29256534576416
1799 9.799927711486816
1899 9.472057342529297
1999 9.253796577453613
Result: y = 0.007822850719094276 + 0.8377413749694824 x + -0.0013495712773874402 x^2 + -0.09062787890434265 x^3


## 7. dynamic graph: control flow and weight sharing

In [8]:
## implement a custom Module,
#  第4，5次式可能有可能没有，且共享参数
import random

class DynamicNet(torch.nn.Module):
    """
    Cubic polynomial with optional 4th- and 5th-order terms.

    On every forward pass the model randomly includes no extra term, the x^4
    term, or both the x^4 and x^5 terms. Both extra terms use the same
    coefficient `e`.
    """

    def __init__(self):
        super().__init__()
        # Each coefficient is a 0-dim tensor drawn from torch.randn and
        # wrapped in nn.Parameter.
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))
        self.e = torch.nn.Parameter(torch.randn(()))  # shared by the x^4 and x^5 terms

    def forward(self, x):
        """
        Evaluate the polynomial on the input Tensor `x` and return the
        resulting Tensor.

        The cubic part is always computed. `random.randint(4, 6)` then picks
        the exclusive upper bound of the extra exponents, so the loop adds
        zero, one (x^4) or two (x^4 and x^5) terms, each scaled by `self.e`.
        """
        y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
        # Randomly decide whether the 4th/5th-order terms are present.
        for exp in range(4, random.randint(4, 6)):
            y = y + self.e * x ** exp
        return y

    def string(self):
        """
        Return the polynomial with its current coefficient values as text.

        The x^4 and x^5 terms are marked with '?' because they are only
        sometimes part of the forward pass. Adjacent f-strings are used rather
        than a backslash continuation inside one literal, which would embed
        the next line's indentation in the returned text.
        """
        return (
            f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'
            f' + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'
        )

# Inputs: 2000 evenly spaced points on [-pi, pi]; targets: sin at those points.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Instantiate the custom module defined above.
model = DynamicNet()

# Summed squared error as the loss, SGD with momentum as the optimizer.
# model.parameters() hands the optimizer the learnable members of the model
# (those defined with torch.nn.Parameter).
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)

for t in range(30000):
    # Start every iteration from cleared gradients.
    optimizer.zero_grad()

    # Forward pass: predict y by passing x to the model, then score it.
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Report the loss on every 2000th iteration.
    if (t + 1) % 2000 == 0:
        print(t, loss.item())

    # Backward pass followed by the weight update.
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

1999 2335.856689453125
3999 1150.7891845703125
5999 473.41619873046875
7999 230.9722900390625
9999 107.4931411743164
11999 50.93584442138672
13999 27.7421932220459
15999 17.346866607666016
17999 12.601629257202148
19999 10.762849807739258
21999 9.551115036010742
23999 9.021665573120117
25999 8.971240043640137
27999 8.670414924621582
29999 8.902894973754883
Result: y = -0.0016190327005460858 + 0.852457582950592 x + -0.0002379447250859812 x^2 + -0.09315378218889236 x^3                + 0.00012578832684084773 x^4 ? + 0.00012578832684084773 x^5 ?
