In [1]:
import torch
from torch import autograd
from torch import nn

In [2]:
# Make data
torch.manual_seed(4321)
X = torch.rand(size=(8, 2))
y = torch.randint(low=0, high=3, size=(8,))


In [3]:
print(X)
print(y)

tensor([[0.1255, 0.5377],
        [0.6564, 0.0365],
        [0.5837, 0.7018],
        [0.3068, 0.9500],
        [0.4321, 0.2946],
        [0.6015, 0.1762],
        [0.9945, 0.3177],
        [0.9886, 0.3911]])
tensor([0, 2, 2, 0, 2, 2, 0, 1])


In [16]:
# creatre a Vanilla model with 2 inputs, 3 outputs, and 1 hidden layer with 2 nodes and bias on the hidden layer and output layer
class Vanilla(nn.Module):
    def __init__(self):
        super(Vanilla, self).__init__()
        self.fc1 = nn.Linear(2, 2, bias=True)
        self.fc2 = nn.Linear(2, 3, bias=True)
        self.logistic_activate_func = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)
    
    # specify the initial weight
    def init_weight(self):
        self.fc1.weight.data = torch.tensor([[0.48, -0.51], [-0.43, -0.48]], dtype=torch.float)
        self.fc1.bias.data = torch.tensor([0.23, 0.05], dtype=torch.float)
        self.fc2.weight.data = torch.tensor([[-0.99, -0.66], [0.36, 0.34], [-0.75, 0.66]], dtype=torch.float)
        self.fc2.bias.data = torch.tensor([0.32, -0.44, 0.70], dtype=torch.float)

    def forward(self, x):
        x = self.fc1(x)
        x = self.logistic_activate_func(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x


In [25]:
# use and prediction the model
model = Vanilla()
model.init_weight()
print(model(X))

# loss
loss = nn.CrossEntropyLoss()
print(loss(model(X), y))

# calculate the gradient
model.zero_grad()
loss(model(X), y).backward()

# print the gradient
print(model.fc1.weight.grad)
print(model.fc1.bias.grad)
print(model.fc2.weight.grad)
print(model.fc2.bias.grad)


tensor([[0.1867, 0.2663, 0.5470],
        [0.1747, 0.2958, 0.5295],
        [0.1959, 0.2738, 0.5303],
        [0.2022, 0.2590, 0.5388],
        [0.1812, 0.2820, 0.5368],
        [0.1787, 0.2902, 0.5311],
        [0.1863, 0.2966, 0.5171],
        [0.1886, 0.2943, 0.5171]], grad_fn=<SoftmaxBackward0>)
tensor(1.0681, grad_fn=<NllLossBackward0>)
tensor([[ 0.0057,  0.0067],
        [-0.0017,  0.0058]])
tensor([0.0167, 0.0001])
tensor([[-0.0059, -0.0053],
        [ 0.0323,  0.0252],
        [-0.0264, -0.0199]])
tensor([-0.0157,  0.0579, -0.0422])


In [8]:
x1 = X[:, 0].requires_grad_(True)
x2 = X[:, 1].requires_grad_(True)

# Forward
b10 = 0.23
b20 = 0.05

a11 = 1 / (1 + torch.exp(-(0.48 * x1 - 0.51 * x2 + b10)))
a21 = 1 / (1 + torch.exp(-(-0.43 * x1 - 0.48 * x2 + b20)))

b11 = 0.32
b21 = -0.44
b31 = 0.70

y1 = a11 * -0.99 + a21 * -0.66 + b11
y2 = a11 * 0.36 + a21 * 0.34 + b21
y3 = a11 * -0.75 + a21 * 0.66 + b31

# softmax
y1 = torch.exp(y1) / (torch.exp(y1) + torch.exp(y2) + torch.exp(y3))
y2 = torch.exp(y2) / (torch.exp(y1) + torch.exp(y2) + torch.exp(y3))
y3 = torch.exp(y3) / (torch.exp(y1) + torch.exp(y2) + torch.exp(y3))

# categorical cross entropy loss
loss = (
    - torch.log(y1[y == 0]).mean()
    - torch.log(y2[y == 1]).mean()
    - torch.log(y3[y == 2]).mean()
)

dy1_dx = autograd.grad(
    outputs=y1, inputs=[x1, x2], grad_outputs=torch.ones_like(y1), create_graph=True
)
dy2_dx = autograd.grad(
    outputs=y2, inputs=[x1, x2], grad_outputs=torch.ones_like(y2), create_graph=True
)
dy3_dx = autograd.grad(
    outputs=y3, inputs=[x1, x2], grad_outputs=torch.ones_like(y3), create_graph=True
)

print("prediction:")
print(y1)
print(y2)
print(y3)
print(y1+y2+y3)
print("loss:")
print(loss)
print("grad:")
print(dy1_dx)
print(dy2_dx)
print(dy3_dx)


prediction:
tensor([0.1867, 0.1747, 0.1959, 0.2022, 0.1812, 0.1787, 0.1863, 0.1886],
       grad_fn=<DivBackward0>)
tensor([0.2273, 0.2463, 0.2321, 0.2225, 0.2374, 0.2426, 0.2465, 0.2450],
       grad_fn=<DivBackward0>)
tensor([0.4277, 0.4045, 0.4092, 0.4214, 0.4142, 0.4070, 0.3917, 0.3921],
       grad_fn=<DivBackward0>)
tensor([0.8416, 0.8255, 0.8371, 0.8460, 0.8327, 0.8283, 0.8245, 0.8257],
       grad_fn=<AddBackward0>)
loss:
tensor(3.9537, grad_fn=<SubBackward0>)
grad:
(tensor([0.0085, 0.0081, 0.0075, 0.0080, 0.0081, 0.0080, 0.0074, 0.0072],
       grad_fn=<AddBackward0>), tensor([0.0334, 0.0315, 0.0337, 0.0342, 0.0327, 0.0321, 0.0320, 0.0322],
       grad_fn=<AddBackward0>))
(tensor([0.0176, 0.0169, 0.0173, 0.0170, 0.0175, 0.0172, 0.0164, 0.0165],
       grad_fn=<AddBackward0>), tensor([-0.0193, -0.0191, -0.0196, -0.0191, -0.0195, -0.0193, -0.0191, -0.0193],
       grad_fn=<AddBackward0>))
(tensor([-0.0423, -0.0401, -0.0406, -0.0411, -0.0414, -0.0406, -0.0386, -0.0386],
       gr