In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# モデル定義
class StackedModel(nn.Module):
    def __init__(self):
        super(StackedModel, self).__init__()
        self.layer1 = nn.Linear(4, 8)
        self.layer2 = nn.Linear(8, 16)
        self.layer3 = nn.Linear(16, 1)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.layer3(x)
        return x
# モデル初期化
model = StackedModel()

# ダミーデータ
inputs = torch.tensor([[1.0, 2.0, 3.0, 4.0]], requires_grad=True)
targets = torch.tensor([[5.0]])

# lossに関する重みの勾配

In [4]:
# 順伝播
outputs = model(inputs)

# ロスの計算
criterion = nn.MSELoss()
loss = criterion(outputs, targets)

# 勾配計算
optimizer = optim.SGD(model.parameters(), lr=0.01)
optimizer.zero_grad()
loss.backward()

# 特定のパラメータの勾配を取得
weight_grad = model.layer3.weight.grad
bias_grad = model.layer3.bias.grad

print(f"Weight gradients: ({weight_grad.shape})", weight_grad)
print(f"Bias gradients: ({bias_grad.shape})", bias_grad)

Weight gradients: (torch.Size([1, 16])) tensor([[-15.6780,  -4.2667,   0.0000,   0.0000,   0.0000,  -4.5858,  -4.3228,
           0.0000,   0.0000, -10.9119,  -5.4721,  -9.6747,   0.0000, -11.7262,
          -5.3415,  -9.2127]])
Bias gradients: (torch.Size([1])) tensor([-9.6139])


# 最終出力の一個前のレイヤのニューロン値に関する他のレイヤの重みの勾配

In [25]:
# 順伝播
outputs = model(inputs)
# PyTorchによる勾配計算
# layer3の出力に関するlayer1の重みの勾配
autograd_grad = torch.autograd.grad(outputs, model.layer1.weight, retain_graph=True)[0]
print(autograd_grad.shape)
print(autograd_grad)

torch.Size([8, 4])
tensor([[-0.0930, -0.1861, -0.2791, -0.3721],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0580,  0.1161,  0.1741,  0.2321],
        [ 0.0041,  0.0083,  0.0124,  0.0165],
        [ 0.0189,  0.0378,  0.0567,  0.0756],
        [ 0.0468,  0.0936,  0.1404,  0.1872],
        [ 0.0477,  0.0953,  0.1430,  0.1906],
        [-0.0353, -0.0706, -0.1058, -0.1411]])


## autogradの結果を数値微分の結果と比較

In [26]:
# 数値微分での勾配計算
epsilon = 1e-5
numerical_grad = torch.zeros_like(model.layer1.weight)

for i in range(model.layer1.weight.shape[0]):
    for j in range(model.layer1.weight.shape[1]):
        # w + epsilon
        model.layer1.weight.data[i, j] += epsilon
        outputs_plus = model(inputs) # f(w + epsilon)
        
        # w - epsilon
        model.layer1.weight.data[i, j] -= 2 * epsilon
        outputs_minus = model(inputs) # f(w - epsilon)

        # 勾配の有限差分近似
        numerical_grad[i, j] = (outputs_plus - outputs_minus).item() / (2 * epsilon) # f(w + epsilon) - f(w - epsilon) / 2 * epsilon
        
        # 重みを元に戻す
        model.layer1.weight.data[i, j] += epsilon
print(numerical_grad.shape)
print(numerical_grad)

torch.Size([8, 4])
tensor([[-0.0931, -0.1848, -0.2816, -0.3718],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0596,  0.1170,  0.1736,  0.2302],
        [ 0.0037,  0.0089,  0.0142,  0.0179],
        [ 0.0179,  0.0350,  0.0559,  0.0767],
        [ 0.0469,  0.0931,  0.1416,  0.1878],
        [ 0.0499,  0.0946,  0.1431,  0.1907],
        [-0.0373, -0.0708, -0.1065, -0.1431]])


In [27]:
# 差分を表示
difference = torch.abs(numerical_grad - autograd_grad)
print("Difference between numerical and autograd gradients:")
print(difference)

Difference between numerical and autograd gradients:
tensor([[9.9368e-05, 1.2914e-03, 2.5333e-03, 3.4758e-04],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.5708e-03, 9.0638e-04, 5.0308e-04, 1.9125e-03],
        [4.1025e-04, 6.6962e-04, 1.7495e-03, 1.3392e-03],
        [1.0081e-03, 2.7614e-03, 7.8927e-04, 1.1828e-03],
        [1.4442e-04, 4.5621e-04, 1.1783e-03, 5.7769e-04],
        [2.2575e-03, 7.0047e-04, 6.6876e-05, 8.9169e-05],
        [1.9744e-03, 2.2356e-04, 7.0787e-04, 1.9372e-03]])
