In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """Fully connected stage: a Linear layer followed by a role-dependent tail.

    The tail appended after ``nn.Linear`` depends on the stage's role:

    * hidden stage (``is_output=False``): Dropout -> BatchNorm1d -> LeakyReLU
    * output stage with a single unit: Sigmoid
    * output stage with several units: Softmax over dim 1

    Args:
        in_channels: Number of input features of the Linear layer.
        out_channels: Number of output features of the Linear layer.
        bias: Whether the Linear layer learns an additive bias.
        dropout: Drop probability for hidden stages; unused for output stages.
        is_output: Build an output stage (probabilities) instead of a hidden one.

    Attributes:
        linear: ``nn.Sequential`` holding the Linear layer at index 0 followed
            by the tail modules described above.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super().__init__()
        # The Linear layer always comes first; only the tail varies.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            stages.extend([
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ])
        elif out_channels == 1:
            stages.append(nn.Sigmoid())
        else:
            stages.append(nn.Softmax(dim=1))
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the Linear layer and its tail, returning the result."""
        return self.linear(x)

In [3]:
class Model(nn.Module):
    """Four-stage MLP classifier: input -> 128 -> 64 -> 32 -> output_classes.

    The three hidden stages are ``LinearBNAC`` blocks in their default (hidden)
    configuration; the last stage is a ``LinearBNAC`` built with
    ``is_output=True``, so the network returns probabilities, not logits.

    Args:
        input_dimention: Number of input features per sample (the spelling is
            kept as-is because callers pass it by keyword).
        output_classes: Width of the output stage.
    """

    def __init__(self, input_dimention, output_classes=1):
        super().__init__()
        first_width, second_width, third_width = 128, 64, 32
        self.layer1 = LinearBNAC(input_dimention, first_width)
        self.layer2 = LinearBNAC(first_width, second_width)
        self.layer3 = LinearBNAC(second_width, third_width)
        self.output = LinearBNAC(third_width, output_classes, is_output=True)

    def forward(self, x):
        """Feed ``x`` through the three hidden stages and then the output stage."""
        for stage in (self.layer1, self.layer2, self.layer3, self.output):
            x = stage(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [4]:
# Build a classifier for 256-feature inputs and 10 classes; the batch fed to it
# must therefore have 256 columns.
model = Model(input_dimention=256,output_classes=10)
# Adam over every model parameter, with learning rate 1e-3 and weight decay 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [5]:
# Synthetic mini-batch: 4 samples of 256 standard-normal features each.
batch_size, input_features = 4, 256
dummy_input = torch.randn(batch_size, input_features)

# One class index per sample, stored as int64 (torch.long).
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [6]:
# Forward pass. The model has not been switched to eval(), so its Dropout and
# BatchNorm1d layers run in training mode and the values differ between runs.
# Each row of `output` is a softmax distribution over the 10 classes.
output = model(dummy_input)
print(output)

tensor([[0.0684, 0.0692, 0.0421, 0.1275, 0.1270, 0.1144, 0.1502, 0.1093, 0.1069,
         0.0851],
        [0.0740, 0.1587, 0.0810, 0.1824, 0.1409, 0.0575, 0.0715, 0.0808, 0.0750,
         0.0782],
        [0.0771, 0.1076, 0.1086, 0.0965, 0.1152, 0.0728, 0.1267, 0.1309, 0.1032,
         0.0614],
        [0.0404, 0.1208, 0.0637, 0.2030, 0.1421, 0.0491, 0.1128, 0.0637, 0.0842,
         0.1201]], grad_fn=<SoftmaxBackward>)


### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟原理需要相同

In [7]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [8]:
# NLLLoss takes log-probabilities as input. The model emits plain softmax
# probabilities, so a log has to be applied before calling this criterion.
criterion = NLLLoss()

In [9]:
# The model already ends in Softmax, so take the log of its probabilities and
# feed that to NLLLoss (CrossEntropyLoss would apply a second softmax).
# Clamp to the smallest positive normal float first: if a probability underflows
# to exactly 0, log(0) = -inf would give an infinite loss and NaN gradients.
loss = criterion(torch.log(torch.clamp(output, min=torch.finfo(output.dtype).tiny)), target)

### 完成 back propagation 並更新權重

In [10]:
# Backpropagate the loss; this fills in .grad on the model parameters.
loss.backward()

In [11]:
# Show the first Linear layer's weights and the gradients computed by backward().
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0186, -0.0317, -0.0110,  ...,  0.0321,  0.0376,  0.0556],
        [ 0.0377,  0.0591, -0.0092,  ...,  0.0314,  0.0490,  0.0602],
        [ 0.0308, -0.0483,  0.0568,  ...,  0.0172, -0.0413, -0.0282],
        ...,
        [-0.0177,  0.0574, -0.0507,  ..., -0.0086,  0.0217, -0.0523],
        [ 0.0422, -0.0348, -0.0106,  ...,  0.0503, -0.0608, -0.0188],
        [ 0.0343,  0.0323,  0.0486,  ..., -0.0514, -0.0354, -0.0426]],
       requires_grad=True)


grad : tensor([[-5.1005e-03,  9.8831e-03,  2.0402e-02,  ..., -1.1333e-02,
          2.2453e-04,  6.1929e-03],
        [-3.5395e-01,  2.4335e-01,  4.6013e-01,  ..., -4.2728e-01,
          5.5890e-02,  2.1694e-01],
        [ 3.3120e-01, -8.4755e-01, -3.4841e-01,  ...,  8.0200e-01,
         -5.5948e-02, -5.6349e-01],
        ...,
        [ 4.4244e-03, -1.0215e-02, -2.1614e-02,  ...,  1.1503e-02,
          5.9425e-05, -6.1114e-03],
        [-3.3238e-01,  8.0976e-01,  2.1182e+00,  ..., -1.6523e+00,
       

In [12]:
# Apply one Adam update to the parameters using the gradients stored in .grad.
# The gradients themselves are not cleared by this call.
optimizer.step()

In [13]:
# Show the same layer after optimizer.step(): the weights have moved, while the
# stored gradients are still the ones produced by backward().
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0176, -0.0327, -0.0120,  ...,  0.0331,  0.0366,  0.0546],
        [ 0.0387,  0.0581, -0.0102,  ...,  0.0324,  0.0480,  0.0592],
        [ 0.0298, -0.0473,  0.0578,  ...,  0.0162, -0.0403, -0.0272],
        ...,
        [-0.0187,  0.0584, -0.0497,  ..., -0.0096,  0.0207, -0.0513],
        [ 0.0432, -0.0358, -0.0116,  ...,  0.0513, -0.0598, -0.0198],
        [ 0.0333,  0.0333,  0.0476,  ..., -0.0504, -0.0344, -0.0416]],
       requires_grad=True)


grad : tensor([[-5.1005e-03,  9.8831e-03,  2.0402e-02,  ..., -1.1333e-02,
          2.2453e-04,  6.1929e-03],
        [-3.5395e-01,  2.4335e-01,  4.6013e-01,  ..., -4.2728e-01,
          5.5890e-02,  2.1694e-01],
        [ 3.3120e-01, -8.4755e-01, -3.4841e-01,  ...,  8.0200e-01,
         -5.5948e-02, -5.6349e-01],
        ...,
        [ 4.4244e-03, -1.0215e-02, -2.1614e-02,  ...,  1.1503e-02,
          5.9425e-05, -6.1114e-03],
        [-3.3238e-01,  8.0976e-01,  2.1182e+00,  ..., -1.6523e+00,
       

### 清空 gradient

In [14]:
# Reset the stored gradients so they do not accumulate into the next backward pass.
# NOTE(review): the recorded output shows zero-filled tensors; newer PyTorch
# releases default to setting .grad to None instead — confirm against the
# installed version.
optimizer.zero_grad()

In [15]:
# Show the same layer after zero_grad(): the weights keep their updated values
# and only the stored gradient has been reset.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0176, -0.0327, -0.0120,  ...,  0.0331,  0.0366,  0.0546],
        [ 0.0387,  0.0581, -0.0102,  ...,  0.0324,  0.0480,  0.0592],
        [ 0.0298, -0.0473,  0.0578,  ...,  0.0162, -0.0403, -0.0272],
        ...,
        [-0.0187,  0.0584, -0.0497,  ..., -0.0096,  0.0207, -0.0513],
        [ 0.0432, -0.0358, -0.0116,  ...,  0.0513, -0.0598, -0.0198],
        [ 0.0333,  0.0333,  0.0476,  ..., -0.0504, -0.0344, -0.0416]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
