# Examples1_learning_with_examples

## 1. 手动forward and backward

In [1]:
import torch
import math

In [2]:
# Configuration shared by every cell below.
dtype = torch.float
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (Restart Kernel -> Run All) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Training data: 2000 evenly spaced samples of sin(x) on [-pi, pi].
# The polynomial models in the following cells are fitted to these points.
x = torch.linspace(-math.pi, math.pi, steps=2000, dtype=dtype, device=device)
y = x.sin()

In [4]:
# Parameter initialisation: four 0-d (scalar) tensors drawn, in the order
# a, b, c, d, from a generator seeded with a fixed value.
torch.manual_seed(231)
a, b, c, d = (torch.randn((), device=device, dtype=dtype) for _ in range(4))
a, b, c, d, a.shape

(tensor(1.6443, device='cuda:0'),
 tensor(-0.7900, device='cuda:0'),
 tensor(1.4691, device='cuda:0'),
 tensor(-0.6675, device='cuda:0'),
 torch.Size([]))

In [5]:
lr = 1e-6

for i in range(2000):
    # Forward pass: cubic prediction and its summed squared error.
    y_pred = a + b * x + c * x.pow(2) + d * x.pow(3)
    residual = y_pred - y
    loss = residual.pow(2).sum().item()

    # Print the training loss every 200 iterations.
    if i % 200 == 0:
        print(loss)

    # Backward pass written out by hand: d(loss)/d(y_pred), then the chain
    # rule through each term of the polynomial.
    grad_y_pred = 2 * residual
    grad_a = grad_y_pred.sum()
    grad_b = grad_y_pred @ x
    grad_c = grad_y_pred @ x.pow(2)
    grad_d = grad_y_pred @ x.pow(3)

    # Gradient-descent update, applied in place.
    a.sub_(lr * grad_a)
    b.sub_(lr * grad_b)
    c.sub_(lr * grad_c)
    d.sub_(lr * grad_d)

303990.4375
1905.811279296875
889.9012451171875
419.92266845703125
201.5188446044922
99.55885314941406
51.74040222167969
29.211084365844727
18.548545837402344
13.479888916015625


## 2. 用autograd package自动化BP

In [6]:
# Re-create the coefficients from the same seed, this time with
# requires_grad=True so autograd tracks the operations applied to them.
torch.manual_seed(231)
a, b, c, d = (
    torch.randn((), device=device, dtype=dtype, requires_grad=True)
    for _ in range(4)
)

# The data tensors x and y do not need gradient tracking.
x.requires_grad, y.requires_grad

(False, False)

In [7]:
learning_rate = 1e-6
for t in range(2000):
    # Forward pass.
    y_pred = a + b * x + c * x.pow(2) + d * x.pow(3)
    loss = ((y_pred - y) ** 2).sum()

    # Log the loss.
    if t % 200 == 99:
        print(f"{t}: {loss.item()}")

    # Let autograd compute the gradients; they land in a.grad ... d.grad.
    loss.backward()

    # The weight update itself must not be recorded by autograd, so it runs
    # with gradient tracking paused.
    with torch.no_grad():
        for param in (a, b, c, d):
            param -= learning_rate * param.grad
            # Drop the gradient after each step; otherwise the next
            # backward() would accumulate on top of the old value.
            param.grad = None

99: 2807.8134765625
299: 1305.8782958984375
499: 612.6148681640625
699: 291.18499755859375
899: 141.47540283203125
1099: 71.42547607421875
1299: 38.4979248046875
1499: 22.949541091918945
1699: 15.574642181396484
1899: 12.061371803283691


## 3. 自定义autograd function
1. each primitive autograd operator包含了两个function：forward function用input tensors计算output tensors；backward function收到output tensors相对于某个scalar value的梯度，然后用这个梯度来计算input tensors相对于该scalar value的梯度。
2. 可以通过定义torch.autograd.Function的子类的方式来自定义autograd operator，实现forward and backward functions。

这里取$y=a + b*P_3(c + d*x)$来代替前面的$y=a + b*x + c*x^2 + d*x^3$。$P_3(x)=\frac{1}{2}(5x^3-3x)$是3阶Legendre polynomial。

In [8]:
import torch
import math

class LegendrePolynomial3(torch.autograd.Function):
    """Custom autograd operator computing P3(x) = 0.5 * (5x^3 - 3x) element-wise."""

    @staticmethod
    def forward(ctx, input):
        """
        Compute the output tensor from ``input``.

        ``ctx`` is a context object used to stash information for the backward
        pass; ``ctx.save_for_backward`` caches the tensors needed there.
        """
        ctx.save_for_backward(input)
        cubic_term = 5 * input ** 3
        linear_term = 3 * input
        return 0.5 * (cubic_term - linear_term)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Map the gradient w.r.t. the output to the gradient w.r.t. the input.

        Applies the chain rule with P3'(x) = 1.5 * (5x^2 - 1).
        """
        (saved_input,) = ctx.saved_tensors
        upstream = grad_output * 1.5
        return upstream * (5 * saved_input ** 2 - 1)

x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# The weights are deliberately initialised close to the correct values.
a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)

# A custom autograd.Function is invoked through its `apply` method.
# The alias does not change between iterations, so it is bound once here.
P3 = LegendrePolynomial3.apply

learning_rate = 5e-6
for t in range(2000):
    # Forward pass: y = a + b * P3(c + d * x).
    y_pred = a + b * P3(c + d * x)
    loss = (y_pred - y).pow(2).sum()
    if t % 200 == 99:
        print(f"{t}: {loss.item()}")

    # Backward pass.
    loss.backward()

    # Manual weight update; it must not be tracked by autograd.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

    # Clear the gradients after every iteration so the next backward() does
    # not accumulate onto the previous values.
    a.grad = b.grad = c.grad = d.grad = None

# Report the fitted model in the form that was actually trained. The
# coefficients parameterise a + b * P3(c + d x), not a plain cubic in x.
print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')

99: 209.95834350585938
299: 100.70249938964844
499: 50.978511810302734
699: 28.20686912536621
899: 17.745729446411133
1099: 12.931766510009766
1299: 10.714248657226562
1499: 9.692106246948242
1699: 9.220745086669922
1899: 9.003361701965332
Result: y = 1.2777713782885503e-11 + -2.208526849746704x + -2.5764071431844116e-10x^2 + 0.2554861009120941x^3


## 4. 用nn package中的module
1. 直接用autograd来构建复杂的NN还是太底层。nn package的抽象层次更高，可以直接以layer为单位来搭建网络，更方便。
2. nn package中定义了很多modules，这些modules实现了NN中的layers, loss functions等。module接收input tensor，计算output tensor。同时也会存放weights等internal state。

· <font color=blue>这里以linear layer为例，所以拟合的函数改为$y = f(x, x^2, x^3)$</font>

In [9]:
import torch
import math

# x.unsqueeze(-1) has shape (2000, 1) and p has shape (3,); broadcasting
# applies, so xx has shape (2000, 3) with columns x, x^2, x^3.
p = torch.tensor([1, 2, 3], device=device)
xx = x.unsqueeze(-1).pow(p)

# Define the model as a sequence of layers.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),  # output of the linear layer has shape (2000, 1)
    torch.nn.Flatten(0, 1)  # flatten dim 0 through dim 1 into 1-D to match `y`
)

# Move the model to the configured device.
model = model.to(device)

# Define the loss function (summed, not averaged, squared error).
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(2000):

    # Forward: Module objects override the __call__ operator,
    # so you can call them like functions.
    y_pred = model(xx)
    loss = loss_fn(y_pred, y)
    if t % 400 == 399:
        print(f"{t}: {loss.item()}")

    # Zero the *gradients* (not the weights) before the backward pass,
    # because backward() accumulates into .grad.
    model.zero_grad()

    # Backward pass.
    loss.backward()

    # Manual weight update; it must not be tracked by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

# You can access the first layer of `model` like accessing the first item of a list.
linear_layer = model[0]

# For a linear layer, the parameters are stored as `weight` and `bias`.
# Adjacent f-strings are concatenated; a backslash continuation inside a single
# literal would embed the next line's indentation in the printed text.
print(
    f'Result: y = {linear_layer.bias.item()} + '
    f'{linear_layer.weight[:, 0].item()} x + '
    f'{linear_layer.weight[:, 1].item()} x^2 + '
    f'{linear_layer.weight[:, 2].item()} x^3'
)

99: 1481.5634765625
299: 664.4503784179688
499: 301.3392333984375
699: 139.65023803710938
899: 67.49002075195312
1099: 35.20598220825195
1299: 20.723323822021484
1499: 14.207231521606445
1699: 11.26616096496582
1899: 9.934158325195312
Result: y = 0.016986023634672165 +                     0.8350483775138855 x +                     -0.0029303710907697678 x^2 +                     -0.0902448296546936 x^3


## 5. 使用optim package

In [10]:
import torch
import math

# Define the model as a sequence of layers.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),  # output of the linear layer has shape (2000, 1)
    torch.nn.Flatten(0, 1)  # flattens the linear output to 1-D to match `y`
)

# Move the model to the configured device.
model = model.to(device)

# Define the loss function (summed, not averaged, squared error).
loss_fn = torch.nn.MSELoss(reduction='sum')

# Define the optimizer; its first argument tells it which parameters to update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(2000):

    # Forward: Module objects override the __call__ operator,
    # so you can call them like functions.
    y_pred = model(xx)
    loss = loss_fn(y_pred, y)
    if t % 400 == 399:
        print(f"{t}: {loss.item()}")

    # Zero the *gradients* (not the weights) before the backward pass.
    # With an optimizer in use, the reset goes through the optimizer.
    optimizer.zero_grad()

    # Backward pass.
    loss.backward()

    # Update the weights.
    optimizer.step()

# You can access the first layer of `model` like accessing the first item of a list.
linear_layer = model[0]

# For a linear layer, the parameters are stored as `weight` and `bias`.
# Adjacent f-strings are concatenated; a backslash continuation inside a single
# literal would embed the next line's indentation in the printed text.
print(
    f'Result: y = {linear_layer.bias.item()} + '
    f'{linear_layer.weight[:, 0].item()} x + '
    f'{linear_layer.weight[:, 1].item()} x^2 + '
    f'{linear_layer.weight[:, 2].item()} x^3'
)

99: 2154.765380859375
299: 1385.784423828125
499: 933.8905639648438
699: 601.6036376953125
899: 354.37677001953125
1099: 180.43382263183594
1299: 73.17616271972656
1499: 22.37281608581543
1699: 9.53388500213623
1899: 8.922995567321777
Result: y = 0.0004955814220011234 +                     0.8563342690467834 x +                     0.0004956094198860228 x^2 +                     -0.09381970018148422 x^3


## 6. 自定义nn module
· subclassing nn.Module and defining a forward which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors.

In [11]:
## implement the third order polynomial as a custom Module
class Polynomial3(torch.nn.Module):
    """Third-order polynomial y = a + b x + c x^2 + d x^3 with learnable scalar coefficients."""

    def __init__(self):
        super().__init__()
        # Assigning an nn.Parameter as an attribute registers it with the
        # module, so it is returned by parameters(). Each coefficient starts
        # as a random 0-d tensor, drawn in the order a, b, c, d.
        for coefficient in ('a', 'b', 'c', 'd'):
            setattr(self, coefficient, torch.nn.Parameter(torch.randn(())))

    def forward(self, x):
        """
        Take a tensor of input data and return a tensor of output data.

        Arbitrary tensor operators (and modules defined in the constructor)
        may be used here.
        """
        linear = self.b * x
        quadratic = self.c * x ** 2
        cubic = self.d * x ** 3
        return self.a + linear + quadratic + cubic

    def string(self):
        """
        Return the fitted polynomial as text; custom methods can be defined on
        a PyTorch module just like on any other Python class.
        """
        a, b, c, d = (p.item() for p in (self.a, self.b, self.c, self.d))
        return f'y = {a} + {b} x + {c} x^2 + {d} x^3'


# Instantiate the module defined above and move it to the configured device.
model = Polynomial3().to(device)

# Loss function and optimizer. model.parameters() hands SGD the learnable
# parameters (the torch.nn.Parameter members of the model).
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
for t in range(2000):
    # Start every iteration from cleared gradients.
    optimizer.zero_grad()

    # Forward pass: predict y from x, then measure the error.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if (t + 1) % 400 == 0:
        print(t, loss.item())

    # Backward pass followed by the weight update.
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

99 524.7474365234375
199 363.1872863769531
299 252.4933319091797
399 176.56356811523438
499 124.42117309570312
599 88.57331848144531
699 63.900386810302734
799 46.89997482299805
899 35.17347717285156
999 27.076091766357422
1099 21.47881507873535
1199 17.605751037597656
1299 14.923057556152344
1399 13.063041687011719
1499 11.772197723388672
1599 10.875513076782227
1699 10.252089500427246
1799 9.818252563476562
1899 9.516105651855469
1999 9.305503845214844
Result: y = -0.02114352583885193 + 0.8475532531738281 x + 0.00364761077798903 x^2 + -0.09202353656291962 x^3


## 7. dynamic graph: control flow and weight sharing
- 因为pytorch在每次执行forward pass的时候会新建一个dynamic computation graph，所以python中的control flow，比如for,while loops和if statement等，在module的forward method定义中都可以使用。不会给自动计算梯度造成问题。
  - 本例中，forward的函数计算中有随机性，每次forward的函数形态会变化，但不影响梯度计算。

In [12]:
## implement a custom Module,
#  第4，5次式可能有可能没有，且共享参数
import random

class DynamicNet(torch.nn.Module):
    """
    Cubic polynomial whose forward pass randomly adds 4th- and 5th-order terms.

    The optional higher-order terms share the single coefficient ``e``. This
    demonstrates Python control flow and weight sharing inside ``forward``.
    """

    def __init__(self):
        super().__init__()
        # Each coefficient is registered as a learnable parameter and starts
        # as a random 0-d tensor.
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))
        self.e = torch.nn.Parameter(torch.randn(()))  # shared by the x^4 and x^5 terms

    def forward(self, x):
        """
        Take a tensor of input data and return a tensor of output data.

        The cubic part is always present. ``random.randint(4, 6)`` yields 4, 5
        or 6, so the loop adds no extra term, only x^4, or both x^4 and x^5,
        each scaled by the shared parameter ``e``.
        """
        y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
        # Randomly decide whether the 4th- and 5th-order terms are present.
        for exp in range(4, random.randint(4, 6)):
            y = y + self.e * x ** exp
        return y

    def string(self):
        """
        Return the fitted polynomial as text; a trailing ``?`` marks the terms
        that are only sometimes present in the forward pass.
        """
        # Adjacent f-strings are concatenated. A backslash continuation inside
        # one literal would embed the next line's indentation in the result.
        return (
            f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'
            f' + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'
        )


# Instantiate the dynamic model and move it to the configured device.
model = DynamicNet().to(device)

# model.parameters() hands SGD the learnable parameters (the
# torch.nn.Parameter members of the model).
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)
for t in range(30000):
    # Start every iteration from cleared gradients.
    optimizer.zero_grad()

    # Forward pass, then measure the error.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if (t + 1) % 4000 == 0:
        print(t, loss.item())

    # Backward pass followed by the weight update.
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

1999 676.8985595703125
3999 319.4315185546875
5999 147.26318359375
7999 68.19235229492188
9999 37.63022232055664
11999 20.69076156616211
13999 14.038339614868164
15999 11.104504585266113
17999 9.824529647827148
19999 9.250890731811523
21999 9.04260540008545
23999 8.7088623046875
25999 8.90151309967041
27999 8.577032089233398
29999 8.898475646972656
Result: y = 0.0017547697061672807 + 0.8550370931625366 x + -0.0009069516672752798 x^2 + -0.09357427805662155 x^3                + 0.0001444164226995781 x^4 ? + 0.0001444164226995781 x^5 ?
