In [1]:
# define model
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, output_size)
    
    def forward(self, x):
        x = self.fc1(x)
        return x

In [2]:
# build model
batch_size = 1
epochs = 5000
model = SimpleNet(1, 1, 1)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [3]:
# load data
x = np.array([1, 2, 3, 4, 1.1, 2.2, 3.3])
y_true = x**2
inputs = torch.FloatTensor(x).view(-1, 1)
labels = torch.FloatTensor(y_true).view(-1, 1)

In [4]:
total_loss = 0.0

In [5]:
def train_one_epoch(optimizer, model, inputs):
    optimizer.zero_grad()
    outputs = model(inputs)
    globals()['w'] = model.fc1.weight.item()
    globals()['b'] = model.fc1.bias.item()
    print("model.weight:{}, model.bias:{}".format(w, b))
    loss = criterion(outputs, labels)
    show = torch.cat((inputs, outputs, labels), dim=1)
    print("input, output, label\n", show.T)
    print("loss:{}".format(loss))
    loss.backward()
    optimizer.step()
    return outputs

In [6]:
# epoch 0
pre_output = train_one_epoch(optimizer, model, inputs)

model.weight:0.44418656826019287, model.bias:0.5152027606964111
input, output, label
 tensor([[ 1.0000,  2.0000,  3.0000,  4.0000,  1.1000,  2.2000,  3.3000],
        [ 0.9594,  1.4036,  1.8478,  2.2919,  1.0038,  1.4924,  1.9810],
        [ 1.0000,  4.0000,  9.0000, 16.0000,  1.2100,  4.8400, 10.8900]],
       grad_fn=<PermuteBackward0>)
loss:48.06100845336914


In [7]:
# calculate theory
theory_output = inputs*w + b
print(theory_output.T == pre_output.T)
print("theroy_loss:{}".format((labels-pre_output).pow(2).sum()/len(theory_output)))

tensor([[True, True, True, True, True, True, True]])
theroy_loss:48.06100845336914


## backward
$$
MSE\_Loss=\sum_{n=1}^{N}(y_n-c_n)^2 \\
y=w*x+b \\
$$ 
$$
\frac{\partial L}{\partial w}=2*(y_n-c_n)*x=\Delta{_w} \\
w_{n+1}=w_n-lr*\Delta{_w}=w_n-2*lr*(y_n-c_n)*x \\
$$
$$
\frac{\partial L}{\partial b}=2*(y_n-c_n)=\Delta{_b} \\
b_{n+1}=b_n-2*lr*(y_n-c_n)
$$

In [8]:
# Therefore, the next epoch weight should be:
dif = pre_output - labels
print(w - 2*0.01*float(sum(dif)/len(dif))*float(sum(inputs)/len(inputs)))
# bias should be:
print(b - 2*0.01*float(sum(dif)/len(dif)))

0.6878344872548996
0.617945852279663


In [9]:
# epoch 1
pre_output = train_one_epoch(optimizer, model, inputs)

model.weight:0.7827966213226318, model.bias:0.6179458498954773
input, output, label
 tensor([[ 1.0000,  2.0000,  3.0000,  4.0000,  1.1000,  2.2000,  3.3000],
        [ 1.4007,  2.1835,  2.9663,  3.7491,  1.4790,  2.3401,  3.2012],
        [ 1.0000,  4.0000,  9.0000, 16.0000,  1.2100,  4.8400, 10.8900]],
       grad_fn=<PermuteBackward0>)
loss:36.484130859375


In [10]:
# epoch 2
dif = pre_output - labels
print("next weight", w - 2*0.01*float(sum(dif)/len(dif))*float(sum(inputs)/len(inputs)))
print("next bias", b - 2*0.01*float(sum(dif)/len(dif)))
print("theroy_loss:{}".format((labels-pre_output).pow(2).sum()/len(theory_output)))
pre_output = train_one_epoch(optimizer, model, inputs)

next weight 0.9834869326622424
next bias 0.7025742888450622
theroy_loss:36.484130859375
model.weight:1.0711212158203125, model.bias:0.7025743126869202
input, output, label
 tensor([[ 1.0000,  2.0000,  3.0000,  4.0000,  1.1000,  2.2000,  3.3000],
        [ 1.7737,  2.8448,  3.9159,  4.9871,  1.8808,  3.0590,  4.2373],
        [ 1.0000,  4.0000,  9.0000, 16.0000,  1.2100,  4.8400, 10.8900]],
       grad_fn=<PermuteBackward0>)
loss:28.13516616821289


# 要弄清楚SGD怎么更新的，按道理来说十分接近了

In [11]:
# epoch test
ws, bs = [], []
for _ in range(20):
    dif = pre_output - labels
    print("next weight", w - 2*0.01*float(sum(dif)/len(dif))*float(sum(inputs)/len(inputs)))
    print("next bias", b - 2*0.01*float(sum(dif)/len(dif)))
    print("theroy_loss:{}".format((labels-pre_output).pow(2).sum()/len(theory_output)))
    pre_output = train_one_epoch(optimizer, model, inputs)
    ws.append(w)
    bs.append(b)

next weight 1.2353688600608563
next bias 0.7718353629112243
theroy_loss:28.13516616821289
model.weight:1.3167636394500732, model.bias:0.7718353867530823
input, output, label
 tensor([[ 1.0000,  2.0000,  3.0000,  4.0000,  1.1000,  2.2000,  3.3000],
        [ 2.0886,  3.4054,  4.7221,  6.0389,  2.2203,  3.6687,  5.1172],
        [ 1.0000,  4.0000,  9.0000, 16.0000,  1.2100,  4.8400, 10.8900]],
       grad_fn=<PermuteBackward0>)
loss:22.11155128479004
next weight 1.450098077727613
next bias 0.8280607485771179
theroy_loss:22.11155128479004
model.weight:1.526176929473877, model.bias:0.8280607461929321
input, output, label
 tensor([[ 1.0000,  2.0000,  3.0000,  4.0000,  1.1000,  2.2000,  3.3000],
        [ 2.3542,  3.8804,  5.4066,  6.9328,  2.5069,  4.1856,  5.8644],
        [ 1.0000,  4.0000,  9.0000, 16.0000,  1.2100,  4.8400, 10.8900]],
       grad_fn=<PermuteBackward0>)
loss:17.76310920715332
next weight 1.6332912488588591
next bias 0.8732294321060181
theroy_loss:17.76310920715332
model.

In [12]:
# 按道理它们应该是一样的
print(np.diff(ws)/float(sum(inputs)/len(inputs)))
print(np.diff(bs))

[0.0883068  0.07533912 0.06433167 0.05498802 0.04705654 0.04032366
 0.03460798 0.02975571 0.02563627 0.02213886 0.01916937 0.01664788
 0.01450672 0.01268829 0.01114383 0.00983201 0.00871744 0.00777038
 0.00696557]
[ 5.62253594e-02  4.51686978e-02  3.57916355e-02  2.78401971e-02
  2.10987329e-02  1.53841376e-02  1.05410814e-02  6.43777847e-03
  2.96235085e-03  1.96695328e-05 -2.47073174e-03 -4.57739830e-03
 -6.35826588e-03 -7.86274672e-03 -9.13256407e-03 -1.02033019e-02
 -1.11050606e-02 -1.18634701e-02 -1.25001669e-02]


In [13]:
# 结果不一样。既然bias比较接近，那用bias去预估weight，看下会不会贴近
pb = b
pw = w

In [14]:

# backward
pre_output = train_one_epoch(optimizer, model, inputs)
# get diff
db = pb - b
dw = pw - w
print("dw", dw/float(sum(inputs)/len(inputs)))
print("db", db)
pb = b
pw = w

model.weight:2.7306270599365234, model.bias:0.9041976928710938
input, output, label
 tensor([[ 1.0000,  2.0000,  3.0000,  4.0000,  1.1000,  2.2000,  3.3000],
        [ 3.6348,  6.3655,  9.0961, 11.8267,  3.9079,  6.9116,  9.9153],
        [ 1.0000,  4.0000,  9.0000, 16.0000,  1.2100,  4.8400, 10.8900]],
       grad_fn=<PermuteBackward0>)
loss:6.069058895111084
dw -0.00628130940221845
db 0.013033628463745117


上述结果发现，实际代码与理论公式有哪里出入

In [15]:
# 看一下网络计算的梯度
pre_output = train_one_epoch(optimizer, model, inputs)
print("weight.grad:{}, bias.grad:{}".format(model.fc1.weight.grad.item(), model.fc1.bias.grad.item()))
db = pb - b
dw = pw - w
print("dw", dw)
print("db", db)
pb = b
pw = w

model.weight:2.74414324760437, model.bias:0.8907182812690735
input, output, label
 tensor([[ 1.0000,  2.0000,  3.0000,  4.0000,  1.1000,  2.2000,  3.3000],
        [ 3.6349,  6.3790,  9.1231, 11.8673,  3.9093,  6.9278,  9.9464],
        [ 1.0000,  4.0000,  9.0000, 16.0000,  1.2100,  4.8400, 10.8900]],
       grad_fn=<PermuteBackward0>)
loss:6.033162593841553
weight.grad:-1.2342838048934937, bias.grad:1.385087251663208
dw -0.01351618766784668
db 0.013479411602020264


差值并不是单纯乘上LR，说明SGD还是在作用的
接下来就验证梯度的计算就好了

In [17]:
print(sum(2*(pre_output-labels)*inputs)/len(inputs))
print(sum(2*(pre_output-labels))/len(inputs))

tensor([-1.2343], grad_fn=<DivBackward0>)
tensor([1.3851], grad_fn=<DivBackward0>)


正确