In [None]:
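"""
Implement a very simple ReLU neural network.
The plain NumPy version quickly runs into exploding or vanishing gradients; changing the learning rate helps to some extent. The main point of the NumPy version is working out how to compute the derivatives.
Exploding gradients show up here as very large values in self.w1/self.w2, so it should also be possible to correct them by adding a regularization term to the loss function.
The torch version, however, not only converges quickly but also shows no vanishing/exploding gradients, presumably because of its optimizer and weight initialization.
"""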

import numpy as np

class SimpleReLUNN():
    """Minimal two-layer ReLU network trained with full-batch gradient descent.

    The model computes ``y_predict = relu(x @ w1) @ w2`` (no bias terms) and
    is trained on the summed squared error between ``y_predict`` and ``y``.

    Args:
        x: Training inputs; must be usable as ``x.dot(w1)``, i.e. a 2-D array
            whose second dimension equals ``sample_feature``.
        y: Training targets; must broadcast against the prediction of shape
            ``(rows of x, hidden_output_dim)``.
        sample_num: Number of samples; only used to size the initial
            all-zero ``y_predict`` placeholder.
        sample_feature: Number of input features (rows of ``w1``).
        hidden_units: Number of hidden units (columns of ``w1``, rows of ``w2``).
        hidden_output_dim: Width of the network output (columns of ``w2``).
        lr: Learning rate applied to both weight matrices.
    """

    def __init__(self, x, y, sample_num, sample_feature, hidden_units, hidden_output_dim, lr):
        # Learning rate.
        self.lr = lr
        # First-layer weights: one column of per-feature weights per hidden unit.
        self.w1 = np.random.randn(sample_feature, hidden_units)
        # Second-layer weights: one column of per-hidden-unit weights per output.
        self.w2 = np.random.randn(hidden_units, hidden_output_dim)
        self.x = x
        self.y = y
        print("x.shape\n", x.shape)
        print("y.shape\n", y.shape)
        print("w1.shape\n", self.w1.shape)
        print("w2.shape\n", self.w2.shape)
        # Placeholder prediction until forward() is called.
        # np.float64 replaces the np.float alias, which was removed in NumPy 1.24.
        self.y_predict = np.zeros((sample_num, hidden_output_dim), dtype=np.float64)

    def forward(self):
        """Run the forward pass and store the result in ``self.y_predict``."""
        # For 2-D arrays ``dot`` is matrix multiplication (not an inner product).
        h = self.x.dot(self.w1)
        # Element-wise ReLU; np.maximum broadcasts the scalar 0 over the matrix.
        relu_output = np.maximum(h, 0)
        self.y_predict = relu_output.dot(self.w2)

    def loss(self):
        """Return the summed squared error between ``y_predict`` and ``y``.

        The sum runs over every sample and every output column, so each update
        in ``back_propagation`` is a full-batch gradient step.
        """
        return np.square(self.y_predict - self.y).sum()

    def back_propagation(self):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        With ``loss = sum((relu(x @ w1) @ w2 - y) ** 2)``:

        * d loss / d y_predict = 2 * (y_predict - y)
        * d loss / d w2        = relu(x @ w1).T @ d_y_predict
        * d loss / d h         = d_y_predict @ w2.T, zeroed where h < 0
        * d loss / d w1        = x.T @ d_h

        Uses ``self.y_predict`` as left by the most recent ``forward()`` call.
        Both gradients are computed from the current weights before either
        matrix is modified; updating ``w2`` first would back-propagate through
        weights that were not the ones used in the forward pass.
        """
        # Pre-activation of the hidden layer, computed once and reused below.
        hidden = self.x.dot(self.w1)
        activated = np.maximum(hidden, 0)

        grad_y_predict = 2.0 * (self.y_predict - self.y)
        grad_w2 = activated.T.dot(grad_y_predict)

        grad_h = grad_y_predict.dot(self.w2.T)
        # ReLU derivative: a boolean mask of the same shape selects the entries
        # whose pre-activation was negative and blocks their gradient.
        grad_h[hidden < 0] = 0
        grad_w1 = self.x.T.dot(grad_h)

        self.w2 = self.w2 - self.lr * grad_w2
        self.w1 = self.w1 - self.lr * grad_w1


def run_model(model, iter):
    """Train ``model`` for ``iter`` steps, printing the loss before each update.

    Prints the model's learning rate once, then for every step runs the
    forward pass, prints the step index with the current loss, and applies
    one back-propagation update.

    Args:
        model: Object exposing ``lr``, ``forward()``, ``loss()`` and
            ``back_propagation()``.
        iter: Number of training steps to run.
    """
    print(f"lr - {model.lr}")
    for step in range(iter):
        model.forward()
        # The loss is evaluated on the prediction made before this step's update.
        current_loss = model.loss()
        print(step, current_loss)
        model.back_propagation()

# Problem dimensions.
sample_num, sample_feature = 64, 100
# Hidden-layer width and width of the network output / target.
hidden_units, hidden_output_dim = 10, 10
# Number of gradient-descent steps per model.
iter = 50
# Random training set: inputs x and targets y.
x = np.random.randn(sample_num, sample_feature)
y = np.random.randn(sample_num, hidden_output_dim)

# One independently initialized model per learning rate, all sharing x and y.
model_lr1, model_lr2, model_lr3 = (
    SimpleReLUNN(x, y, sample_num, sample_feature, hidden_units, hidden_output_dim, lr)
    for lr in (1e-4, 1e-5, 1e-6)
)

# Train each model in turn, with a separator line around every run.
separator = "=================================================================="
for model in (model_lr1, model_lr2, model_lr3):
    print(separator)
    run_model(model, iter)
print(separator)

In [None]:
import torch
import torch.nn as nn

# Problem dimensions for the PyTorch version.
sample_num, sample_feature = 64, 100
# Hidden-layer width and width of the network output / target.
hidden_units, hidden_output_dim = 10, 10
# Number of optimizer steps.
iter = 50

# Random training inputs and targets drawn from a standard normal distribution.
x = torch.randn(sample_num, sample_feature)
y = torch.randn(sample_num, hidden_output_dim)

class TwoLayerNet(torch.nn.Module):
    """Two bias-free linear layers with a ReLU between them.

    Args:
        sample_feature: Number of input features of the first layer.
        hidden_units: Output width of the first layer / input width of the second.
        hidden_output_dim: Output width of the second layer.
    """

    def __init__(self, sample_feature, hidden_units, hidden_output_dim):
        super().__init__()
        # Layers are registered as submodules, so their weights show up in
        # ``parameters()``.
        self.linear1 = nn.Linear(in_features=sample_feature, out_features=hidden_units, bias=False)
        self.linear2 = nn.Linear(in_features=hidden_units, out_features=hidden_output_dim, bias=False)

    # A Module subclass must define its own forward pass
    # (torch.nn.Sequential does not need one).
    def forward(self, x):
        """Return ``linear2(relu(linear1(x)))`` for the input batch ``x``."""
        hidden = self.linear1(x)
        # Clamping at a minimum of 0 implements ReLU.
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)

# Build the network from the class defined above.
model2 = TwoLayerNet(sample_feature, hidden_units, hidden_output_dim)
# Squared error summed (not averaged) over all elements.
loss_fn = torch.nn.MSELoss(reduction="sum")
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model2.parameters(), lr=learning_rate)

for it in range(iter):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()
    # Forward pass: calling the module invokes its forward().
    y_pred = model2(x)
    # Compute the loss; this builds the computation graph used by backward().
    loss = loss_fn(y_pred, y)
    print(it, loss.item())
    # Backward pass: populate .grad on every parameter.
    loss.backward()
    # Apply one optimizer update to all parameters.
    optimizer.step()