在 NumPy 中，多维数组（即 ndarray）的维度顺序通常是按从外到内的顺序排列的。例如，对于一个二维数组（矩阵），第一个维度代表行，第二个维度代表列。对于一个三维数组，维度顺序是深度、高度和宽度。

In [5]:
# 导入相应的包
import numpy as np
# Build a 1-D float array holding four zeros and display it.
tmp = np.zeros(4)
print(tmp)

[0. 0. 0. 0.]


# 网络类型
1. FC 全连接层
2. CNN 卷积层
3. 池化层
4. LayerNorm 层
5. BatchNorm 层

In [20]:
# Fully connected (dense) layer
class FullConnectedLayer:
    """Fully connected layer computing ``z = x @ W.T + b`` with in-place SGD.

    Forward pass:
        z = x W^T + b, where x is (batch_size, input_dim) and z is
        (batch_size, output_dim).

    Backward pass (dz is (batch_size, output_dim)):
        dL/dW = dz^T x          -> (output_dim, input_dim)
        dL/db = sum_batch(dz)   -> (output_dim,)
        dL/dx = dz W            -> (batch_size, input_dim), i.e. the gradient
                                   with respect to the previous layer's output

    Complexity:
        Forward and backward are both O(batch_size * input_dim * output_dim).
    """

    def __init__(self, input_dim, output_dim, learning_rate=1e-2):
        """Create the layer.

        Args:
            input_dim: Number of input features.
            output_dim: Number of output features.
            learning_rate: Step size used by the update in ``backward``.
        """
        # randn draws from the standard normal distribution.
        self.W = np.random.randn(output_dim, input_dim)
        self.b = np.random.randn(output_dim)
        self.learning_rate = learning_rate

    def forward(self, x):
        """Return ``x @ W.T + b`` for x of shape (batch_size, input_dim)."""
        # The input is cached because backward needs it to compute dW.
        self.input = x
        # b has shape (output_dim,) and is broadcast across the batch axis.
        self.z = np.dot(x, self.W.T) + self.b
        return self.z

    def backward(self, dz):
        """Apply one SGD step and return the gradient w.r.t. the input.

        Args:
            dz: Gradient of the loss w.r.t. this layer's output,
                shape (batch_size, output_dim).

        Returns:
            Gradient w.r.t. the layer input, shape (batch_size, input_dim).
        """
        dW = np.dot(dz.T, self.input)
        db = np.sum(dz, axis=0)
        # dX must be computed with the weights used in the forward pass,
        # so it is evaluated before W is updated.
        dX = np.dot(dz, self.W)

        # Parameter update (the attribute is learning_rate, set in __init__).
        self.W -= self.learning_rate * dW
        self.b -= self.learning_rate * db

        return dX

In [27]:
# CNN convolution layer
class ConvLayer:
    """Naive 2-D convolution layer (explicit loops) with in-place SGD.

    Inputs are laid out as (batch_size, input_channels, height, width) and
    weights as (output_channels, input_channels, kernel_size, kernel_size).
    """

    def __init__(self, input_channels, output_channels, kernel_size, stride=1, padding=0, learning_rate=1e-2):
        """Create the layer.

        Args:
            input_channels: Number of channels in the input.
            output_channels: Number of kernels (channels in the output).
            kernel_size: Side length of the square kernel.
            stride: Step between neighbouring windows.
            padding: Zero padding added on each spatial border. With
                stride 1, padding = (kernel_size - 1) / 2 keeps the spatial
                size unchanged.
            learning_rate: Step size used by the update in ``backward``.
        """
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.learning_rate = learning_rate

        self.W = np.random.randn(output_channels, input_channels, kernel_size, kernel_size)
        # One bias per kernel, shared across every spatial position.
        self.b = np.random.randn(output_channels)

    def _pad(self, X):
        """Zero-pad the two spatial axes of X by ``self.padding``."""
        p = self.padding
        return np.pad(X, ((0, 0), (0, 0), (p, p), (p, p)), 'constant')

    def forward(self, X):
        """Convolve X and return Z of shape (N, C_out, H_out, W_out).

        Args:
            X: Input of shape (batch_size, input_channels, height, width).
        """
        N, _, H_in, W_in = X.shape
        C_out, _, K_h, K_w = self.W.shape

        H_out = (H_in - K_h + 2 * self.padding) // self.stride + 1
        W_out = (W_in - K_w + 2 * self.padding) // self.stride + 1

        # np.zeros takes the shape as a single tuple argument.
        Z = np.zeros((N, C_out, H_out, W_out))

        X_padded = self._pad(X)

        for n in range(N):
            for c_out in range(C_out):
                for i in range(H_out):
                    for j in range(W_out):
                        h_start = i * self.stride
                        h_end = h_start + K_h
                        w_start = j * self.stride
                        w_end = w_start + K_w
                        window = X_padded[n, :, h_start:h_end, w_start:w_end]
                        Z[n, c_out, i, j] = np.sum(window * self.W[c_out]) + self.b[c_out]

        # Cached for the backward pass.
        self.X = X
        self.Z = Z

        return Z

    def backward(self, dZ):
        """Apply one SGD step and return the gradient w.r.t. the input.

        Args:
            dZ: Gradient w.r.t. the output, shape (N, C_out, H_out, W_out).

        Returns:
            Gradient w.r.t. the (unpadded) input, same shape as the X given
            to ``forward``.
        """
        N, C_out, H_out, W_out = dZ.shape
        _, _, K_h, K_w = self.W.shape

        X_padded = self._pad(self.X)

        dW = np.zeros_like(self.W)
        # Each bias contributes to every output position of its channel.
        db = np.sum(dZ, axis=(0, 2, 3))
        # Allocate in the weight dtype so an integer input does not force an
        # integer gradient buffer.
        dX = np.zeros(X_padded.shape, dtype=self.W.dtype)

        for n in range(N):
            for c_out in range(C_out):
                for i in range(H_out):
                    for j in range(W_out):
                        h_start = i * self.stride
                        h_end = h_start + K_h
                        w_start = j * self.stride
                        w_end = w_start + K_w
                        grad = dZ[n, c_out, i, j]

                        dW[c_out] += grad * X_padded[n, :, h_start:h_end, w_start:w_end]
                        dX[n, :, h_start:h_end, w_start:w_end] += grad * self.W[c_out]

        # Strip the padding; skipped for padding == 0 because a `0:-0` slice
        # would be empty.
        if self.padding > 0:
            dX = dX[:, :, self.padding:-self.padding, self.padding:-self.padding]

        self.W -= self.learning_rate * dW
        self.b -= self.learning_rate * db

        return dX

### 1. 对可学习参数 $ \gamma $ 和 $ \beta $ 的梯度：

- 对 $ \gamma $ 的梯度：
  $$
  \frac{\partial L}{\partial \gamma_i} = \sum_{n=1}^{N} \frac{\partial L}{\partial y_{n,i}} \cdot \hat{x}_{n,i}
  $$

- 对 $ \beta $ 的梯度：
  $$
  \frac{\partial L}{\partial \beta_i} = \sum_{n=1}^{N} \frac{\partial L}{\partial y_{n,i}}
  $$

### 2. 对均值 $ \mu $ 的梯度：

$$
\frac{\partial L}{\partial \mu} = \sum_{i=1}^{d} \left( \frac{\partial L}{\partial \hat{x}_i} \cdot \frac{-1}{\sqrt{\sigma^2 + \epsilon}} \right) + \frac{\partial L}{\partial \sigma^2} \cdot \frac{-2}{d} \sum_{i=1}^{d} (x_i - \mu)
$$

### 3. 对方差 $ \sigma^2 $ 的梯度：

$$
\frac{\partial L}{\partial \sigma^2} = \sum_{i=1}^{d} \left( \frac{\partial L}{\partial \hat{x}_i} \cdot \frac{-(x_i - \mu)}{2(\sigma^2 + \epsilon)^{3/2}} \right)
$$

### 4. 最终对 $ x_i $ 的梯度：

对 $ x_i $ 的总梯度由三部分组成：
1. $ x_i \to \hat{x}_i \to L $
2. $ x_i \to \mu \to \hat{x}_i \to L$
3. $ x_i \to \sigma^2 \to \hat{x}_i \to L $

总的梯度为：
$$
\frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial \hat{x}_i} \cdot \frac{1}{\sqrt{\sigma^2 + \epsilon}} + \frac{\partial L}{\partial \mu} \cdot \frac{1}{d} + \frac{\partial L}{\partial \sigma^2} \cdot \frac{2(x_i - \mu)}{d}
$$


In [30]:
# LayerNorm
class LayerNorm:
    """Layer normalisation over the last axis with in-place SGD on gamma/beta.

    Each row of X (shape (batch_size, input_dim)) is normalised with its own
    mean and variance, then scaled by gamma and shifted by beta.

    Complexity: forward and backward are both O(N * D).
    """

    def __init__(self, input_dim, epsilon=1e-5, learning_rate=1e-2):
        """Create the layer.

        Args:
            input_dim: Size of the normalised (last) axis.
            epsilon: Constant added to the variance for numerical stability.
            learning_rate: Step size used by the update in ``backward``.
        """
        self.gamma = np.ones(input_dim)
        self.beta = np.zeros(input_dim)
        self.epsilon = epsilon
        self.learning_rate = learning_rate

    def forward(self, X):
        """Normalise X row-wise and return ``gamma * X_hat + beta``."""
        mu = np.mean(X, axis=-1, keepdims=True)
        var = np.var(X, axis=-1, keepdims=True)

        X_hat = (X - mu) / np.sqrt(var + self.epsilon)

        # Cached for the backward pass.
        self.X = X
        self.mu = mu
        self.var = var
        self.X_hat = X_hat

        return self.gamma * X_hat + self.beta

    def backward(self, dY):
        """Apply one SGD step to gamma/beta and return the gradient w.r.t. X.

        Args:
            dY: Gradient w.r.t. the output, shape (batch_size, input_dim).

        Returns:
            Gradient w.r.t. the input, same shape as dY.
        """
        D = dY.shape[-1]
        centered = self.X - self.mu
        inv_std = 1.0 / np.sqrt(self.var + self.epsilon)

        # Parameter gradients accumulate over the batch axis.
        d_gamma = np.sum(self.X_hat * dY, axis=0)
        d_beta = np.sum(dY, axis=0)

        # Input gradient: x reaches the loss directly through X_hat and
        # indirectly through the per-row mean and variance.
        d_X_hat = self.gamma * dY
        d_var = np.sum(d_X_hat * centered * -0.5 * (self.var + self.epsilon) ** -1.5, axis=-1, keepdims=True)
        d_mu = np.sum(-d_X_hat * inv_std, axis=-1, keepdims=True) + d_var * np.mean(-2 * centered, axis=-1, keepdims=True)
        dx = d_X_hat * inv_std + d_var * 2 * centered / D + d_mu / D

        # Parameter update (the step size attribute is learning_rate).
        self.gamma -= self.learning_rate * d_gamma
        self.beta -= self.learning_rate * d_beta

        return dx

In [32]:
# BatchNorm
class BatchNorm:
    """Batch normalisation over the batch axis with in-place SGD on gamma/beta.

    Each column of X (shape (batch_size, input_dim)) is normalised with the
    mean and variance taken across the batch, then scaled by gamma and
    shifted by beta.

    Complexity: forward and backward are both O(N * D).
    """

    def __init__(self, input_dim, epsilon=1e-5, learning_rate=1e-2):
        """Create the layer.

        Args:
            input_dim: Number of features (size of axis 1).
            epsilon: Constant added to the variance for numerical stability.
            learning_rate: Step size used by the update in ``backward``.
        """
        self.gamma = np.ones(input_dim)
        self.beta = np.zeros(input_dim)
        self.epsilon = epsilon
        self.learning_rate = learning_rate

    def forward(self, X, isTrain=True):
        """Normalise X and return ``gamma * X_hat + beta``.

        Args:
            X: Input of shape (batch_size, input_dim).
            isTrain: When True, statistics come from X and are cached. When
                False, the statistics cached by the most recent training
                call are reused, so at least one training call must have
                happened first (no running averages are kept).
        """
        if isTrain:
            mu = np.mean(X, axis=0, keepdims=True)
            var = np.var(X, axis=0, keepdims=True)
            X_hat = (X - mu) / np.sqrt(var + self.epsilon)

            # Cached for the backward pass and for inference.
            self.mu = mu
            self.var = var
            self.X_hat = X_hat
            self.X = X
        else:
            X_hat = (X - self.mu) / np.sqrt(self.var + self.epsilon)

        return self.gamma * X_hat + self.beta

    def backward(self, dY):
        """Apply one SGD step to gamma/beta and return the gradient w.r.t. X.

        Args:
            dY: Gradient w.r.t. the output, shape (batch_size, input_dim).

        Returns:
            Gradient w.r.t. the input of the last training forward pass.
        """
        N = self.X.shape[0]
        centered = self.X - self.mu
        inv_std = 1.0 / np.sqrt(self.var + self.epsilon)

        # Parameter gradients accumulate over the batch axis.
        d_gamma = np.sum(dY * self.X_hat, axis=0)
        d_beta = np.sum(dY, axis=0)

        # Input gradient. The variance term uses the exponent -3/2; writing
        # it as `** -1/5` would parse as ((...) ** -1) / 5.
        d_X_hat = dY * self.gamma
        d_var = np.sum(d_X_hat * -0.5 * centered * (self.var + self.epsilon) ** -1.5, axis=0, keepdims=True)
        d_mu = np.sum(-d_X_hat * inv_std, axis=0, keepdims=True) + d_var * np.mean(-2 * centered, axis=0, keepdims=True)
        dX = d_X_hat * inv_std + d_var * 2 / N * centered + d_mu / N

        # Parameter update, mirroring the other trainable layers in this file.
        self.gamma -= self.learning_rate * d_gamma
        self.beta -= self.learning_rate * d_beta

        return dX
        

In [29]:
# Broadcasting demo: a (5,) vector multiplied element-wise with a (3, 5)
# matrix is applied to every row, giving a (3, 5) result.
gamma, beta = np.ones(5), np.ones(5)
X = np.random.randn(3, 5)
print(gamma.shape, X.shape, sep="\n")

Y = X * gamma
print(Y.shape, X, Y, sep="\n")


(5,)
(3, 5)
(3, 5)
[[ 0.54846513  0.96159119 -0.2132476   0.58320184  1.02084592]
 [-0.02489036 -0.93834391  0.31020801 -0.44293322  0.58189601]
 [ 1.79345817  0.41517114 -1.05214362 -0.47283479 -2.81505693]]
[[ 0.54846513  0.96159119 -0.2132476   0.58320184  1.02084592]
 [-0.02489036 -0.93834391  0.31020801 -0.44293322  0.58189601]
 [ 1.79345817  0.41517114 -1.05214362 -0.47283479 -2.81505693]]


---

# 激活函数
1. ReLU 激活函数
2. Sigmoid 激活函数
3. Softmax 激活函数

In [None]:
# ReLU activation
class ReLULayer:
    """Element-wise ReLU activation.

    Forward pass:
        output = max(x, 0)

    Backward pass:
        dL/dx = doutput * (x > 0)

    Complexity:
        Forward and backward are both O(batch_size * input_dim).
    """

    def __init__(self):
        pass

    def forward(self, x):
        """Return ``max(x, 0)`` element-wise and cache x for backward."""
        self.x = x
        self.output = np.maximum(self.x, 0)

        return self.output

    def backward(self, doutput):
        """Return the gradient w.r.t. the input of the last forward call.

        The gradient passes through only where the cached input was strictly
        positive; it is zero elsewhere (including at exactly 0).
        """
        # forward stores the input as self.x, so the mask is built from it.
        dz = doutput * (self.x > 0).astype(float)
        return dz

---

# 损失函数
1. MSE 损失函数
2. MAE 损失函数

In [18]:
# MSE loss
class MSELoss:
    """Mean squared error averaged over every element.

    Forward pass:
        loss = mean((y_pred - y_true) ** 2)

    Backward pass:
        dL/dy_pred = 2 * (y_pred - y_true) / y_pred.size

    Complexity:
        Forward and backward are both O(batch_size * output_dim).
    """

    def __init__(self):
        pass

    def forward(self, y_pred, y_true):
        """Return the scalar loss and cache both arrays for backward."""
        self.y_pred = y_pred
        self.y_true = y_true

        loss = np.mean((y_pred - y_true) ** 2)

        return loss

    def backward(self):
        """Return dL/dy_pred for the last forward call (same shape as y_pred)."""
        # forward averages over all elements, so the gradient is normalised
        # by the total element count. For output_dim == 1 this equals
        # dividing by batch_size.
        dL = (2 * (self.y_pred - self.y_true)) / self.y_pred.size

        return dL

In [25]:
# Smoke test: one forward and one backward pass through a fully connected
# layer followed by the MSE loss.
fc_layer = FullConnectedLayer(input_dim=5, output_dim=1)
loss_function = MSELoss()

# Inputs (3 samples x 5 features) and targets (3 x 1).
X = np.array([
    [1, 2, 3, 4, 5],
    [5, 4, 3, 2, 1],
    [1, 3, 5, 2, 4],
])
y = np.array([[1], [0], [1]])

# axis=0 sums down the columns, axis=1 sums along the rows.
print(X.sum(axis=0))
print(X.sum(axis=1))

print(X.shape)
print(y.shape)

# Forward pass
z = fc_layer.forward(X)
loss = loss_function.forward(z, y)

# Backward pass
dz = loss_function.backward()
dx = fc_layer.backward(dz)

# print(dz.shape)
# print(dz)

# print("Loss:", loss)
# print("Gradient wrt Input:", dx)

[ 7  9 11  8 10]
[15 15 15]


NameError: name 'aaaa' is not defined