In [1]:
# 加载数据
import sys, os
# sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
print(x_train.shape) # (60000, 784)
print(t_train.shape) # (60000, 10)

(60000, 784)
(60000, 10)


In [2]:
# 显示图像
from PIL import Image
def img_show(img):
    pil_img = Image.fromarray(np.uint8(img))
    pil_img.show()

(x_train, t_train), (x_test, t_test) = load_mnist(flatten=True, normalize=False)
img = x_train[0] # 读入第一张图像
label = t_train[0] # 读入第一张图像的标签
print(label) # 5
print(img.shape)
# (784,)
img = img.reshape(28, 28) # 把图像的形状变成原来的尺寸
print(img.shape)
# (28, 28)
img_show(img)

5
(784,)
(28, 28)


In [3]:
x_train.shape, t_train.shape, x_test.shape, t_test.shape

((60000, 784), (60000,), (10000, 784), (10000,))

In [4]:
# 以mini-batch读入数据
train_size = x_train.shape[0]
batch_size = 10
batch_mask = np.random.choice(train_size, batch_size)
x_batch = x_train[batch_mask]
t_batch = t_train[batch_mask]

In [5]:
x_batch.shape, t_batch.shape, batch_mask

((10, 784),
 (10,),
 array([34941, 14845,  5472, 38407, 46247, 29335, 23087, 28996, 51657,
         4347]))

In [6]:
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(x, self.W) + self.b
        return out
        
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx
    
aff = Affine(np.array([[1, 2, 3], [4, 5, 6]]), np.array([1, 2, 3]))    

In [7]:
class ReLu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out
    
    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx

In [8]:
class LinearWithReLu():
    def __init__(self, dim_in, dim_out):
        self.W = np.random.randn(dim_in, dim_out)
        self.b = np.random.randn(dim_out)
        self.affine = Affine(self.W, self.b)
        self.relu = ReLu()
        self.x_pos = None
        self.x_neg = None
        self.dW = None
        self.db = None
        # self.out = None
        # print(f"Initialized with W: {self.W}, b: {self.b}")

    def __call__(self, x):
        # 当实例被“调用”时，执行的代码
        print(f"Calling with {x}")
        return self.forward(x)
    
    def __str__(self):
        return f"LinearWithReLu with W: {self.W}, b: {self.b}"
    
    def __repr__(self):
        return f"LinearWithReLu with W: {self.W}, b: {self.b}"
    
    def state_dict(self):
        return {'W': self.W, 'b': self.b}
    
    def forward(self, x_pos, x_neg):
        self.x_pos = x_pos
        y_pos = self.affine.forward(x_pos)
        y_neg = self.affine.forward(x_neg)
        out = self.relu.forward(out)
        self.out = out
        return out
    
    def backward(self, dout):
        dout = self.relu.backward(dout)
        dout = self.affine.backward(dout)
        self.dW = self.affine.dW
        self.db = self.affine.dbload_mnist(normalize=True, one_hot_label=True)
        return dout
    
    def save_state_dict(self, path):
        state_dict = {'W': self.W, 'b': self.b}
        np.savez(path, **state_dict)

    def load_state_dict(self, path):
        state_dict = np.load(path)
        self.W = state_dict['W']
        self.b = state_dict['b']
        self.affine = Affine(self.W, self.b)

    def ff_loss(self, x_pos, x_neg):
        y_pos = self.forward(x_pos)
        y_neg = self.forward(x_neg)
        loss = y_neg - y_pos
        return loss
    
linear_with_relu = LinearWithReLu(3, 2)
linear_with_relu

LinearWithReLu with W: [[-2.25409652  2.45995518]
 [ 0.9068056  -0.35968946]
 [-0.16514676 -0.74819542]], b: [1.6154998  0.78149397]

In [38]:
import numpy as np
from dataset.mnist import load_mnist
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# from tensorflow.keras.datasets import mnist

# def overlay_y_on_x(x, y):
#     """
#     Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
#     """
#     x_ = x.clone()
#     x_[:, :10] *= 0.0
#     x_[range(x.shape[0]), y] = x.max()
#     return x_
# 假设x,y是numpy数组, 用numpy重写
def overlay_y_on_x(x, y):
    """
    Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
    """
    y *= x.max()
    x_ = x.copy()
    x_[:, :10] = y
    # x_[range(x.shape[0]), y] = x.max()
    return x_

class LossFunction:
    def __init__(self):
        pass
    
    def loss(self, Y_pos, Y_neg):
        return np.sum(- Y_pos ** 2 + Y_neg ** 2)
    
    def compute_gradients(self, X_pos, X_neg, W, B):
        # 计算 Y_pos 和 Y_neg
        Y_pos = np.dot(X_pos, W) + B
        Y_neg = np.dot(X_neg, W) + B
        
        # 计算损失函数
        loss = self.loss(Y_pos, Y_neg)
        
        # 计算损失函数对 W 的梯度
        dW = - np.dot(X_pos.T, 2 * Y_pos) + np.dot(X_neg.T, 2 * Y_neg)
        
        # 计算损失函数对 B 的梯度
        dB = np.sum(2 * ( - Y_pos + Y_neg))
        
        return loss, dW, dB

# 加载 MNIST 数据集
(X_train, y_train), (X_test, y_test) = load_mnist(normalize=True, one_hot_label=False)

# 数据预处理
X_train = X_train.reshape(-1, 28*28) / 255.0
X_test = X_test.reshape(-1, 28*28) / 255.0

# 将标签转换为 one-hot 编码
y_train = np.eye(10)[y_train]
y_test = np.eye(10)[y_test]

# 定义神经网络类
class NeuralNetwork:
    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        
        # 初始化权重和偏置
        self.W = np.random.randn(input_size, output_size)
        self.B = np.zeros((1, output_size))
        
        # 初始化损失函数对象
        self.loss_function = LossFunction()

    def forward(self, X):
        # 计算 Y
        Y = np.dot(X, self.W) + self.B
        return Y
    
    def train(self, X_train_pos, X_train_neg, num_epochs=10000, learning_rate=1e-5):
        for epoch in range(num_epochs):
            # 计算梯度
            loss, dW, dB = self.loss_function.compute_gradients(X_train_pos, X_train_neg, self.W, self.B)
            
            # 更新参数
            self.W -= learning_rate * dW
            self.B -= learning_rate * dB
            
            # 打印损失
            # if (epoch+1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}')

            # 评估模型
            # if (epoch+1) % 10 == 0:
            y_pred =  self.predict(X_train_pos)
            y_true = np.argmax(y_train, axis=1)
            accuracy = self.eval(y_pred, y_true)

            print(f'Accuracy: {accuracy:.4f}')

    
    def predict(self, x):
        goodness_per_label = []
        for label in range(10): # 对每一个标签进行预测
            label = np.eye(10)[label] # 将标签转换为 one-hot 编码
            h = overlay_y_on_x(x, label) # h是输入x和标签label的叠加
            goodness = [] # goodness是一个列表，里面存放了每一层的结果向量的均方
            # for layer in self.layers: # 对每一层进行前传
            #     h = layer(h) # h是每一层的输出
            h = self.forward(h) # h是每一层的输出
            # print(h.shape): (60000, 10)
            # goodness += [(h**2).mean(1)] # goodness是每一层的结果向量的均方。h.pow(2)是h的每一个元素的平方，mean(1)是对每一行求均值
            goodness_per_label.append((h**2).mean(1)) # goodness_per_label是每一层的结果向量的均方的和
        # goodness_per_label = torch.cat(goodness_per_label, 1) # goodness_per_label是每一层的结果向量的均方的和的列表
        return np.array(goodness_per_label).argmax(0) # 返回的是goodness_per_label中每一行最大值的索引，也就是说，返回的是每一行最大值的列索引
        # # 计算 Y
        # Y = np.dot(X_test, self.W) + self.B
        
        # # 计算准确率
        # accuracy = np.mean(np.argmax(Y, axis=1) == np.argmax(y_test, axis=1))
        # print(f'Accuracy: {accuracy:.4f}')
                
    def eval(self, y_pred, y_true):
        return np.mean(y_pred == y_true)

# 初始化神经网络
input_size = X_train.shape[1]
output_size = 10
nn = NeuralNetwork(input_size, output_size)

# 定义正负样本
x_pos = overlay_y_on_x(X_train, y_train)
# rnd = torch.randperm(x.size(0)) # 生成一个从0到n-1的随机整数序列
# 将上一句用numpy重写
rnd = np.random.permutation(X_train.shape[0])
x_neg = overlay_y_on_x(X_train, y_train[rnd])

# 使用相同的数据作为正负样本进行训练
nn.train(x_pos, x_neg)


Epoch 1/10000, Loss: 5.0891
Accuracy: 0.0942
Epoch 2/10000, Loss: 4.9709
Accuracy: 0.0979
Epoch 3/10000, Loss: 4.8528
Accuracy: 0.1020
Epoch 4/10000, Loss: 4.7347
Accuracy: 0.1065
Epoch 5/10000, Loss: 4.6167
Accuracy: 0.1106
Epoch 6/10000, Loss: 4.4988
Accuracy: 0.1135
Epoch 7/10000, Loss: 4.3809
Accuracy: 0.1157
Epoch 8/10000, Loss: 4.2631
Accuracy: 0.1176
Epoch 9/10000, Loss: 4.1454
Accuracy: 0.1193
Epoch 10/10000, Loss: 4.0278
Accuracy: 0.1201
Epoch 11/10000, Loss: 3.9103
Accuracy: 0.1203
Epoch 12/10000, Loss: 3.7930
Accuracy: 0.1196
Epoch 13/10000, Loss: 3.6757
Accuracy: 0.1201
Epoch 14/10000, Loss: 3.5587
Accuracy: 0.1203
Epoch 15/10000, Loss: 3.4418
Accuracy: 0.1201
Epoch 16/10000, Loss: 3.3250
Accuracy: 0.1196
Epoch 17/10000, Loss: 3.2085
Accuracy: 0.1196
Epoch 18/10000, Loss: 3.0921
Accuracy: 0.1199
Epoch 19/10000, Loss: 2.9759
Accuracy: 0.1191
Epoch 20/10000, Loss: 2.8600
Accuracy: 0.1183
Epoch 21/10000, Loss: 2.7442
Accuracy: 0.1180
Epoch 22/10000, Loss: 2.6287
Accuracy: 0.11

KeyboardInterrupt: 

In [48]:
import numpy as np
from dataset.mnist import load_mnist
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# from tensorflow.keras.datasets import mnist

# def overlay_y_on_x(x, y):
#     """
#     Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
#     """
#     x_ = x.clone()
#     x_[:, :10] *= 0.0
#     x_[range(x.shape[0]), y] = x.max()
#     return x_
# 假设x,y是numpy数组, 用numpy重写
def overlay_y_on_x(x, y):
    """
    Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
    """
    y *= x.max()
    x_ = x.copy()
    x_[:, :10] = y
    # x_[range(x.shape[0]), y] = x.max()
    return x_


# 加载 MNIST 数据集
(X_train, y_train), (X_test, y_test) = load_mnist(normalize=True, one_hot_label=False)

# 数据预处理
X_train = X_train.reshape(-1, 28*28) / 255.0
X_test = X_test.reshape(-1, 28*28) / 255.0

# 将标签转换为 one-hot 编码
y_train = np.eye(10)[y_train]
y_test = np.eye(10)[y_test]

# 定义神经网络类
class NeuralNetwork:
    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        
        # 初始化权重和偏置
        self.W = np.random.randn(input_size, output_size)
        self.B = np.zeros((1, output_size))
        
        # 初始化损失函数对象
        self.loss_function = LossFunction()

    # def forward(self, X):
    #     # 计算 Y
    #     Y = np.dot(X, self.W) + self.B
    #     return Y
    
    def softmaxwithloss(self, X, y):
        # 计算 Y
        Y = np.dot(X, self.W) + self.B
        # softmax
        Y = np.exp(Y) / np.sum(np.exp(Y), axis=1, keepdims=True)

        # print(Y.shape, y.shape)
        # 计算损失
        loss = np.mean(-np.sum(y * np.log(Y), axis=1))
        return loss
    
    def softmaxwithloss_backward(self, X, y):
        # 计算 Y
        Y = np.dot(X, self.W) + self.B
        
        # 计算梯度
        dW = np.dot(X.T, Y - y)
        dB = np.sum(Y - y, axis=0)
        
        return dW, dB

    
    def train(self, X, y, num_epochs=10000, learning_rate=2.5e-5):
        for epoch in range(num_epochs):
            # 计算梯度
            # loss, dW, dB = self.loss_function.compute_gradients(X_train_pos, X_train_neg, self.W, self.B)
            
            # # 前向传播
            # Y = self.forward(X)

            # 损失计算
            loss = self.softmaxwithloss(X, y)
            
            # 计算梯度
            dW, dB = self.softmaxwithloss_backward(X, y)
            

            # 更新参数
            self.W -= learning_rate * dW
            self.B -= learning_rate * dB
            
            # 打印损失
            # if (epoch+1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}')

            # 评估模型
            # if (epoch+1) % 10 == 0:
            y_pred =  self.predict(X_train)
            y_true = np.argmax(y_train, axis=1)
            accuracy = self.eval(y_pred, y_true)

            print(f'Accuracy: {accuracy:.4f}')

    
    def predict(self, X):
        # 计算 Y
        Y = np.dot(X, self.W) + self.B
        # softmax
        Y = np.exp(Y) / np.sum(np.exp(Y), axis=1, keepdims=True)
        return Y.argmax(1)
        
        # # 计算准确率
        # accuracy = np.mean(np.argmax(Y, axis=1) == np.argmax(y, axis=1))
        # print(f'Accuracy: {accuracy:.4f}')
        
                
    def eval(self, y_pred, y_true):
        return np.mean(y_pred == y_true)

# 初始化神经网络
input_size = X_train.shape[1]
output_size = 10
nn = NeuralNetwork(input_size, output_size)

# # 定义正负样本
# x_pos = overlay_y_on_x(X_train, y_train)
# # rnd = torch.randperm(x.size(0)) # 生成一个从0到n-1的随机整数序列
# # 将上一句用numpy重写
# rnd = np.random.permutation(X_train.shape[0])
# x_neg = overlay_y_on_x(X_train, y_train[rnd])

# 使用相同的数据作为正负样本进行训练
nn.train(X_train, y_train)


Epoch 1/10000, Loss: 2.3035
Accuracy: 0.1002
Epoch 2/10000, Loss: 2.3028
Accuracy: 0.1037
Epoch 3/10000, Loss: 2.3030
Accuracy: 0.1052
Epoch 4/10000, Loss: 2.3028
Accuracy: 0.1047
Epoch 5/10000, Loss: 2.3029
Accuracy: 0.1053
Epoch 6/10000, Loss: 2.3028
Accuracy: 0.1054
Epoch 7/10000, Loss: 2.3028
Accuracy: 0.1053
Epoch 8/10000, Loss: 2.3028
Accuracy: 0.1056
Epoch 9/10000, Loss: 2.3028
Accuracy: 0.1058
Epoch 10/10000, Loss: 2.3027
Accuracy: 0.1059
Epoch 11/10000, Loss: 2.3027
Accuracy: 0.1062
Epoch 12/10000, Loss: 2.3027
Accuracy: 0.1066
Epoch 13/10000, Loss: 2.3027
Accuracy: 0.1067
Epoch 14/10000, Loss: 2.3026
Accuracy: 0.1069
Epoch 15/10000, Loss: 2.3026
Accuracy: 0.1070
Epoch 16/10000, Loss: 2.3026
Accuracy: 0.1072
Epoch 17/10000, Loss: 2.3026
Accuracy: 0.1074
Epoch 18/10000, Loss: 2.3025
Accuracy: 0.1078
Epoch 19/10000, Loss: 2.3025
Accuracy: 0.1079
Epoch 20/10000, Loss: 2.3025
Accuracy: 0.1081
Epoch 21/10000, Loss: 2.3025
Accuracy: 0.1083
Epoch 22/10000, Loss: 2.3024
Accuracy: 0.10

In [49]:
import numpy as np
from dataset.mnist import load_mnist
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# def overlay_y_on_x(x, y):
#     """
#     Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
#     """
    
#     x_ = x.copy()
#     y_ = y.copy()
#     y_ = y_ * x.max()
#     x_[:, :10] = y_
#     # x_[range(x.shape[0]), y] = x.max()
#     return x_

# def overlay_y_on_x(x, y):
#     """
#     Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
#     """
#     x_ = x.copy()
#     x_[:, :10] *= 0.0
#     x_[range(x.shape[0]), y] = x.max()
#     return x_

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def forward_propagation(X_pos, X_neg, W, B):
    # 正向传播
    y_pos = np.dot(X_pos, W) + B
    y_neg = np.dot(X_neg, W) + B
    a = -np.mean(y_pos**2)
    b = np.mean(y_neg**2)
    loss = np.log(1 + np.exp(a)) + np.log(1 + np.exp(b))
    return loss, y_pos, y_neg, a, b

def backward_propagation(X_pos, X_neg, y_pos, y_neg, a, b, W, B):
    # 反向传播
    n = len(y_pos)
    da = sigmoid(a)
    db = sigmoid(b)
    dloss_y_pos = -2 / n * y_pos * da
    dloss_y_neg = 2 / n * y_neg * db
    dloss_W_pos = np.dot(X_pos.T, dloss_y_pos)
    dloss_W_neg = np.dot(X_neg.T, dloss_y_neg)
    dloss_W = dloss_W_pos + dloss_W_neg
    dloss_B_pos = np.sum(dloss_y_pos)
    dloss_B_neg = np.sum(dloss_y_neg)
    dloss_B = dloss_B_pos + dloss_B_neg
    return dloss_W, dloss_B

# 示例数据和参数
X_pos = np.array([[1, 2], [3, 4]])
X_neg = np.array([[5, 6], [7, 8]])
W = np.array([0.1, 0.2])
B = 0.3

# 正向传播
loss, y_pos, y_neg, a, b = forward_propagation(X_pos, X_neg, W, B)
print(f"Loss: {loss}")

# 反向传播
dloss_W, dloss_B = backward_propagation(X_pos, X_neg, y_pos, y_neg, a, b, W, B)
print(f"dLoss/dW: {dloss_W}")
print(f"dLoss/dB: {dloss_B}")


Loss: 5.625605692250358
dLoss/dW: [26.99983033 31.10756853]
dLoss/dB: 4.10773820115179


In [53]:
import numpy as np
from dataset.mnist import load_mnist
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

def overlay_y_on_x(x, y):
    """
    Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
    """
    x_ = x.copy()
    x_[:, :10] *= 0.0
    x_[range(x.shape[0]), y] = x.max()
    return x_


class FFLayer:
    def __init__(self, input_dim, output_dim):
        self.W = np.random.randn(input_dim, output_dim) * 0.01
        self.B = np.zeros(output_dim)
        self.cache = None

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def forward(self, X_pos, X_neg):
        # 正向传播
        y_pos = np.dot(X_pos, self.W) + self.B
        y_neg = np.dot(X_neg, self.W) + self.B
        a = -np.mean(y_pos**2)
        b = np.mean(y_neg**2)
        loss = np.log(1 + np.exp(a)) + np.log(1 + np.exp(b))
        self.cache = (X_pos, X_neg, y_pos, y_neg, a, b)
        return loss

    def backward(self):
        # 反向传播
        X_pos, X_neg, y_pos, y_neg, a, b = self.cache
        n = len(y_pos)
        da = self.sigmoid(a)
        db = self.sigmoid(b)
        dloss_y_pos = -2 / n * y_pos * da
        dloss_y_neg = 2 / n * y_neg * db
        dloss_W_pos = np.dot(X_pos.T, dloss_y_pos)
        dloss_W_neg = np.dot(X_neg.T, dloss_y_neg)
        dloss_W = dloss_W_pos + dloss_W_neg
        dloss_B_pos = np.sum(dloss_y_pos)
        dloss_B_neg = np.sum(dloss_y_neg)
        dloss_B = dloss_B_pos + dloss_B_neg
        return dloss_W, dloss_B

# # 示例使用
# input_dim = 2
# output_dim = 1
# layer = CustomLayer(input_dim, output_dim)

# X_pos = np.array([[1, 2], [3, 4]])
# X_neg = np.array([[5, 6], [7, 8]])

# loss = layer.forward(X_pos, X_neg)
# print(f"Loss: {loss}")

# dloss_W, dloss_B = layer.backward()
# print(f"dLoss/dW: {dloss_W}")
# print(f"dLoss/dB: {dloss_B}")
    
# 加载 MNIST 数据集
(X_train, y_train), (X_test, y_test) = load_mnist(normalize=True, one_hot_label=False)

# 初始化神经网络
input_size = X_train.shape[1]
output_size = 10
nn = FFLayer(input_size, output_size)

# 定义正负样本
x_pos = overlay_y_on_x(X_train, y_train)
# rnd = torch.randperm(x.size(0)) # 生成一个从0到n-1的随机整数序列
# 将上一句用numpy重写
rnd = np.random.permutation(X_train.shape[0])
x_neg = overlay_y_on_x(X_train, y_train[rnd])

# 使用相同的数据作为正负样本进行训练
# nn.train(x_pos, x_neg)


AttributeError: 'FFLayer' object has no attribute 'train'

In [1]:
import numpy as np

def l2_normalize(x):
    norm = np.linalg.norm(x)
    return x / norm

def l2_normalize_backward(dL_dy, x):
    norm = np.linalg.norm(x)
    dL_dx = (dL_dy / norm) - ((x * np.sum(dL_dy * x)) / (norm**3))
    return dL_dx

# Example usage
x = np.array([1.0, 2.0, 3.0])
y = l2_normalize(x)

# Assuming some gradient dL_dy
dL_dy = np.array([0.1, 0.2, 0.3])
dL_dx = l2_normalize_backward(dL_dy, x)

print("Input x:", x)
print("Normalized output y:", y)
print("Gradient dL/dx:", dL_dx)

Input x: [1. 2. 3.]
Normalized output y: [0.26726124 0.53452248 0.80178373]
Gradient dL/dx: [3.46944695e-18 6.93889390e-18 1.38777878e-17]


In [9]:
import numpy as np


def overlay_y_on_x(x, y):
    """
    Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
    """
    x_ = x.copy()
    x_[:, :10] *= 0.0
    x_[range(x.shape[0]), y] = x.max()
    return x_


class FFLayer:
    def __init__(self, input_dim, output_dim, threshold):
        self.W = np.random.randn(input_dim, output_dim) * 0.01
        self.B = np.zeros(output_dim)
        self.threshold = threshold
        self.X_pos = None
        self.X_neg = None
        self.y_pos = None
        self.y_neg = None
        self.cache = None

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def forward(self, X):
        # 正向传播
        Y = np.dot(X, self.W) + self.B
        return Y

    def loss(self, y_pos, y_neg):
        # 正向传播
        # y_pos = self.forward(X_pos)
        # y_neg = self.forward(X_neg)
        a = -np.mean(y_pos**2) + self.threshold
        b = np.mean(y_neg**2) - self.threshold
        loss = np.log(1 + np.exp(a)) + np.log(1 + np.exp(b))
        self.cache = (a, b)
        return loss

    def backward(self):
        # 反向传播
        X_pos, X_neg, y_pos, y_neg = self.X_pos, self.X_neg, self.y_pos, self.y_neg 
        a, b = self.cache
        n = len(y_pos)
        da = self.sigmoid(a)
        db = self.sigmoid(b)
        dloss_y_pos = -2 / n * y_pos * da
        dloss_y_neg = 2 / n * y_neg * db
        dloss_W_pos = np.dot(X_pos.T, dloss_y_pos)
        dloss_W_neg = np.dot(X_neg.T, dloss_y_neg)
        dloss_W = dloss_W_pos + dloss_W_neg
        dloss_B_pos = np.sum(dloss_y_pos)
        dloss_B_neg = np.sum(dloss_y_neg)
        dloss_B = dloss_B_pos + dloss_B_neg
        return dloss_W, dloss_B

    def train(self, X_pos, X_neg, learning_rate=1e-4, num_epochs=1000000):
        # 训练神经网络
        for epoch in range(num_epochs):
            y_pos = self.forward(X_pos)
            y_neg = self.forward(X_neg)
            self.X_pos = X_pos
            self.X_neg = X_neg
            self.y_pos = y_pos
            self.y_neg = y_neg

            loss = self.loss(y_pos, y_neg)
            dloss_W, dloss_B = self.backward()
            self.W -= learning_rate * dloss_W
            self.B -= learning_rate * dloss_B
            # if epoch % 10 == 0:
            #     print(f"Epoch {epoch}, Loss: {loss}")
            # 打印损失
            # if (epoch+1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}')

            # 评估模型
            # if (epoch+1) % 10 == 0:
            y_pred =  self.predict(X_test)
            # print(y_train.shape)
            y_true = y_test # np.argmax(y_train, axis=1)
            accuracy = self.eval(y_pred, y_true)

            print('\n', y_pred[:20], '\n', y_true[:20])
            print(f'Accuracy: {accuracy:.4f}')

    def predict(self, x):
        goodness_per_label = []
        for label in range(10): # 对每一个标签进行预测
            # label = np.eye(10)[label] # 将标签转换为 one-hot 编码
            h = overlay_y_on_x(x, label) # h是输入x和标签label的叠加
            goodness = [] # goodness是一个列表，里面存放了每一层的结果向量的均方
            # for layer in self.layers: # 对每一层进行前传
            #     h = layer(h) # h是每一层的输出
            h = self.forward(h) # h是每一层的输出
            # print(h.shape): (60000, 10)
            # goodness += [(h**2).mean(1)] # goodness是每一层的结果向量的均方。h.pow(2)是h的每一个元素的平方，mean(1)是对每一行求均值
            goodness_per_label.append((h**2).mean(1)) # goodness_per_label是每一层的结果向量的均方的和
        # goodness_per_label = torch.cat(goodness_per_label, 1) # goodness_per_label是每一层的结果向量的均方的和的列表
        return np.array(goodness_per_label).argmax(0) # 返回的是goodness_per_label中每一行最大值的索引，也就是说，返回的是每一行最大值的列索引
        # # 计算 Y
        # Y = np.dot(X_test, self.W) + self.B
        
        # # 计算准确率
        # accuracy = np.mean(np.argmax(Y, axis=1) == np.argmax(y_test, axis=1))
        # print(f'Accuracy: {accuracy:.4f}')
                
    def eval(self, y_pred, y_true):
        return np.mean(y_pred == y_true)
    
# # 示例使用
# input_dim = 2
# output_dim = 1
# threshold = 0.5
# layer = CustomLayer(input_dim, output_dim, threshold)

# X_pos = np.array([[1, 2], [3, 4]])
# X_neg = np.array([[5, 6], [7, 8]])

# layer.train(X_pos, X_neg, learning_rate=0.00001, epochs=10000000)

np.random.seed(0)
# 加载 MNIST 数据集
(X_train, y_train), (X_test, y_test) = load_mnist(normalize=True, one_hot_label=False)

# 初始化神经网络
input_size = X_train.shape[1]
hidden_size = 500
output_size = 500
threshold = 2.0
nn1 = FFLayer(input_size, hidden_size, threshold)
nn2 = FFLayer(hidden_size, output_size, threshold)

# 定义正负样本
x_pos = overlay_y_on_x(X_train, y_train)
# rnd = torch.randperm(x.size(0)) # 生成一个从0到n-1的随机整数序列
# 将上一句用numpy重写
rnd = np.random.permutation(X_train.shape[0])
x_neg = overlay_y_on_x(X_train, y_train[rnd])

# 使用相同的数据作为正负样本进行训练
nn1.train(x_pos, x_neg, learning_rate=1e-4, num_epochs=10000000)


Epoch 1/10000000, Loss: 2.2470

 [9 3 2 3 9 2 3 1 2 3 9 3 6 3 3 3 1 9 3 1] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]
Accuracy: 0.0903
Epoch 2/10000000, Loss: 2.2470

 [9 3 2 3 9 2 3 1 2 3 9 3 6 3 3 3 1 9 3 1] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]
Accuracy: 0.0908
Epoch 3/10000000, Loss: 2.2470

 [9 3 2 3 9 2 3 1 2 3 9 3 6 3 3 3 1 9 3 1] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]
Accuracy: 0.0910
Epoch 4/10000000, Loss: 2.2469

 [9 3 2 3 9 2 3 1 2 3 9 3 6 3 3 3 1 9 3 1] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]
Accuracy: 0.0910
Epoch 5/10000000, Loss: 2.2469

 [9 3 2 3 9 2 3 1 2 3 9 3 6 3 3 3 1 9 3 1] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]
Accuracy: 0.0915
Epoch 6/10000000, Loss: 2.2469

 [9 3 2 3 9 2 3 1 2 3 9 3 6 3 3 3 1 9 3 1] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]
Accuracy: 0.0914
Epoch 7/10000000, Loss: 2.2468

 [9 3 2 3 9 2 3 1 2 3 9 3 6 3 3 3 1 9 3 1] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]
Accuracy: 0.0921
Epoch 8/10000000, Loss: 2.2468

 [9 3 2 3 9 2 3 

KeyboardInterrupt: 

In [10]:
import numpy as np
from dataset.mnist import load_mnist



# from tensorflow.keras.datasets import mnist

# def overlay_y_on_x(x, y):
#     """
#     Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
#     """
#     x_ = x.clone()
#     x_[:, :10] *= 0.0
#     x_[range(x.shape[0]), y] = x.max()
#     return x_
# 假设x,y是numpy数组, 用numpy重写
def overlay_y_on_x(x, y):
    """
    Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
    """
    
    x_ = x.copy()
    y_ = y.copy()
    y_ = y_ * x.max()
    x_[:, :10] = y_
    # x_[range(x.shape[0]), y] = x.max()
    return x_

class LossFunction:
    def __init__(self):
        pass
    
    def loss(self, Y_pos, Y_neg):
        return np.sum(- Y_pos ** 2 + Y_neg ** 2)
    
    def compute_gradients(self, X_pos, X_neg, W, B):
        # 计算 Y_pos 和 Y_neg
        Y_pos = np.dot(X_pos, W) + B
        Y_neg = np.dot(X_neg, W) + B
        
        # 计算损失函数
        loss = self.loss(Y_pos, Y_neg)
        
        # 计算损失函数对 W 的梯度
        dW = - np.dot(X_pos.T, 2 * Y_pos) + np.dot(X_neg.T, 2 * Y_neg)
        
        # 计算损失函数对 B 的梯度
        dB = np.sum(2 * ( - Y_pos + Y_neg))
        
        return loss, dW, dB

class AdamOptimizer:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None
        self.v = None
        self.t = 0

    def update(self, params, grads):
        if self.m is None:
            self.m = [0] * len(params)
            self.v = [0] * len(params)

        self.t += 1
        for i, (param, grad) in enumerate(zip(params, grads)):
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad**2
            m_hat = self.m[i] / (1 - self.beta1**self.t)
            v_hat = self.v[i] / (1 - self.beta2**self.t)
            params[i] -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)

# # 示例使用
# input_dim = 2
# output_dim = 1
# threshold = 0.5
# layer = CustomLayer(input_dim, output_dim, threshold)
optimizer = AdamOptimizer(learning_rate=0.001)

# X_pos = np.array([[1, 2], [3, 4]])
# X_neg = np.array([[5, 6], [7, 8]])

# for epoch in range(100):
#     loss = layer.forward(X_pos, X_neg)
#     dloss_W, dloss_B = layer.backward()
#     optimizer.update([layer.W, layer.B], [dloss_W, dloss_B])
#     if epoch % 10 == 0:
#         print(f"Epoch {epoch}, Loss: {loss}")


# 定义神经网络类
class NeuralNetwork:
    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        
        # 初始化权重和偏置
        self.W = np.random.randn(input_size, output_size)
        self.B = np.zeros((1, output_size))
        
        # 初始化损失函数对象
        self.loss_function = LossFunction()

    def forward(self, X):
        # 计算 Y
        Y = np.dot(X, self.W) + self.B
        
        return Y
    
    def train(self, X_train_pos, X_train_neg, num_epochs=10000, learning_rate=1e-2):
        for epoch in range(num_epochs):
            # 计算梯度
            loss, dW, dB = self.loss_function.compute_gradients(X_train_pos, X_train_neg, self.W, self.B)
            
            # 更新参数
            # self.W -= learning_rate * dW
            # self.B -= learning_rate * dB
            optimizer.update([self.W, self.B], [dW, dB])
            
            # 打印损失
            # if (epoch+1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}')

            # 评估模型
            # if (epoch+1) % 10 == 0:
            y_pred =  self.predict(X_test)
            y_true = np.argmax(y_test, axis=1)
            accuracy = self.eval(y_pred, y_true)

            print(y_pred, y_true)
            print('\n', y_pred[:30], '\n', y_true[:30])
            print(f'Accuracy: {accuracy:.4f}')
        return 

    
    def predict(self, x):
        goodness_per_label = []
        for label in range(10): # 对每一个标签进行预测
            label = np.eye(10)[label] # 将标签转换为 one-hot 编码
            h = overlay_y_on_x(x, label) # h是输入x和标签label的叠加
            goodness = [] # goodness是一个列表，里面存放了每一层的结果向量的均方
            # for layer in self.layers: # 对每一层进行前传
            #     h = layer(h) # h是每一层的输出
            h = self.forward(h) # h是每一层的输出
            # print(h.shape): (60000, 10)
            # goodness += [(h**2).mean(1)] # goodness是每一层的结果向量的均方。h.pow(2)是h的每一个元素的平方，mean(1)是对每一行求均值
            goodness_per_label.append((h**2).mean(1)) # goodness_per_label是每一层的结果向量的均方的和
        # goodness_per_label = torch.cat(goodness_per_label, 1) # goodness_per_label是每一层的结果向量的均方的和的列表
        return np.array(goodness_per_label).argmax(0) # 返回的是goodness_per_label中每一行最大值的索引，也就是说，返回的是每一行最大值的列索引
        # # 计算 Y
        # Y = np.dot(X_test, self.W) + self.B
        
        # # 计算准确率
        # accuracy = np.mean(np.argmax(Y, axis=1) == np.argmax(y_test, axis=1))
        # print(f'Accuracy: {accuracy:.4f}')
                
    def eval(self, y_pred, y_true):
        return np.mean(y_pred == y_true)


np.random.seed(0)

# 加载 MNIST 数据集
(X_train, y_train), (X_test, y_test) = load_mnist(normalize=True, one_hot_label=False)

# 数据预处理
X_train = X_train.reshape(-1, 28*28) / 255.0
X_test = X_test.reshape(-1, 28*28) / 255.0

# 将标签转换为 one-hot 编码
y_train = np.eye(10)[y_train]
y_test = np.eye(10)[y_test]

# 初始化神经网络
input_size = X_train.shape[1]
hidden_size = 500
output_size = 500
nn1 = NeuralNetwork(input_size, hidden_size)
nn2 = NeuralNetwork(hidden_size, 10)

# 定义正负样本
x_pos = overlay_y_on_x(X_train, y_train)
# rnd = torch.randperm(x.size(0)) # 生成一个从0到n-1的随机整数序列
# 将上一句用numpy重写
rnd = np.random.permutation(X_train.shape[0])
x_neg = overlay_y_on_x(X_train, y_train[rnd])

# 使用相同的数据作为正负样本进行训练
nn.train(x_pos, x_neg)


Epoch 1/10000, Loss: 25.0149
[9 3 2 ... 3 3 3] [7 2 1 ... 4 5 6]

 [9 3 2 3 9 2 3 1 2 3 9 3 6 3 3 3 1 9 3 1 3 1 3 2 1 9 9 1 3 3] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4 9 6 6 5 4 0 7 4 0 1]
Accuracy: 0.0982
Epoch 2/10000, Loss: 7.8398
[9 3 2 ... 3 3 3] [7 2 1 ... 4 5 6]

 [9 3 2 3 9 2 3 1 2 3 9 3 6 3 3 3 9 9 3 1 3 1 3 2 1 9 9 1 3 3] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4 9 6 6 5 4 0 7 4 0 1]
Accuracy: 0.1073
Epoch 3/10000, Loss: -9.3356
[9 3 2 ... 3 3 3] [7 2 1 ... 4 5 6]

 [9 3 2 3 9 2 3 3 2 3 9 3 6 3 3 3 9 9 3 1 3 1 3 2 1 9 9 9 3 3] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4 9 6 6 5 4 0 7 4 0 1]
Accuracy: 0.1168
Epoch 4/10000, Loss: -26.5114
[9 3 2 ... 3 3 3] [7 2 1 ... 4 5 6]

 [9 3 2 3 9 2 3 3 2 3 9 3 6 3 3 3 9 9 3 1 3 1 3 2 1 9 9 9 3 3] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4 9 6 6 5 4 0 7 4 0 1]
Accuracy: 0.1268
Epoch 5/10000, Loss: -43.6877
[9 3 2 ... 3 3 3] [7 2 1 ... 4 5 6]

 [9 3 2 3 9 2 3 3 2 3 9 3 6 3 3 3 9 9 3 9 3 1 3 2 1 9 9 9 3 3] 
 [7 2 1 0 4 1 4 9 5 9 0 6 9 0

KeyboardInterrupt: 

In [80]:
y_train

array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)

In [13]:
import numpy as np

def relu(x):
    return np.maximum(0, x)

class FFLayer:
    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        
        # 初始化权重和偏置
        self.W = np.random.randn(input_size, output_size)
        self.B = np.zeros((1, output_size))

    def forward(self, X):
        # 计算 Y
        Y = np.dot(X, self.W) + self.B
        return Y

class LossFunction:
    def __init__(self):
        pass
    
    def loss(self, Y_pos, Y_neg):
        return np.log(1 + np.exp(-1 / Y_pos.shape[0] * np.sum(Y_pos**2))) + np.log(1 + np.exp(1 / Y_neg.shape[0] * np.sum(Y_neg**2)))

    def compute_gradients(self, X_pos, X_neg, W1, B1, W2, B2):
        # 计算 Y_pos 和 Y_neg
        Y_pos = relu(np.dot(X_pos, W1) + B1)
        Y_neg = relu(np.dot(X_neg, W1) + B1)

        # 计算损失函数
        loss = self.loss(Y_pos, Y_neg)

        # 计算损失函数对 W1 和 B1 的梯度
        dY_pos = -2 / Y_pos.shape[0] * Y_pos
        dY_neg = 2 / Y_neg.shape[0] * Y_neg
        dW1 = np.dot(X_pos.T, np.dot(dY_pos, W2.T) * (Y_pos > 0)) - np.dot(X_neg.T, np.dot(dY_neg, W2.T) * (Y_neg > 0))
        dB1 = np.sum(np.dot(dY_pos, W2.T) * (Y_pos > 0)) - np.sum(np.dot(dY_neg, W2.T) * (Y_neg > 0))

        # 计算损失函数对 W2 和 B2 的梯度
        dW2 = np.dot(Y_pos.T, dY_pos) + np.dot(Y_neg.T, dY_neg)
        dB2 = np.sum(dY_pos) + np.sum(dY_neg)

        return loss, dW1, dB1, dW2, dB2

class AdamOptimizer:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None
        self.v = None
        self.t = 0

    def update(self, params, grads):
        if self.m is None:
            self.m = [0] * len(params)
            self.v = [0] * len(params)

        self.t += 1
        for i, (param, grad) in enumerate(zip(params, grads)):
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad**2
            m_hat = self.m[i] / (1 - self.beta1**self.t)
            v_hat = self.v[i] / (1 - self.beta2**self.t)
            params[i] -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)

class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # 初始化两个FFLayer层
        self.layer1 = FFLayer(input_size, hidden_size)
        self.layer2 = FFLayer(hidden_size, output_size)
        
        # 初始化损失函数对象
        self.loss_function = LossFunction()

    def forward(self, X):
        # 计算第一层的输出
        hidden_output = relu(self.layer1.forward(X))
        
        # 计算第二层的输出
        output = relu(self.layer2.forward(hidden_output))
        
        return output

    def train(self, X_train_pos, X_train_neg, num_epochs=10000, learning_rate=1e-2):
        optimizer = AdamOptimizer(learning_rate=learning_rate)
        for epoch in range(num_epochs):
            # 计算梯度和损失
            loss, dW1, dB1, dW2, dB2 = self.loss_function.compute_gradients(X_train_pos, X_train_neg, self.layer1.W, self.layer1.B, self.layer2.W, self.layer2.B)
            
            # 更新第二层参数
            optimizer.update([self.layer2.W, self.layer2.B], [dW2,


SyntaxError: incomplete input (1425009904.py, line 99)

In [None]:
import numpy as np

# 损失函数：sum(Y_pos^2 - Y_neg^2)
# Y_pos = X_pos * W + B
# Y_neg = X_neg * W + B

def loss_function(Y_pos, Y_neg):
    return np.sum(Y_pos ** 2 - Y_neg ** 2)

def compute_gradients(X_pos, X_neg, W, B):
    # 计算 Y_pos 和 Y_neg
    Y_pos = np.dot(X_pos, W) + B
    Y_neg = np.dot(X_neg, W) + B
    
    # 损失函数
    loss = loss_function(Y_pos, Y_neg)
    
    # 计算损失函数对 W 的梯度
    dW = np.dot(X_pos.T, 2 * Y_pos) - np.dot(X_neg.T, 2 * Y_neg)
    
    # 计算损失函数对 B 的梯度
    dB = np.sum(2 * Y_pos) - np.sum(2 * Y_neg)
    
    return loss, dW, dB

# 示例数据
X_pos = np.array([[1, 2], [3, 4]])  # 正样本输入
X_neg = np.array([[5, 6], [7, 8]])  # 负样本输入
W = np.array([[0.1], [0.2]])        # 权重
B = 0.5                             # 偏置项

# 计算梯度
loss, dW, dB = compute_gradients(X_pos, X_neg, W, B)

print("损失函数值：", loss)
print("损失函数对 W 的梯度：\n", dW)
print("损失函数对 B 的梯度：", dB)


损失函数值： -9.120000000000001
损失函数对 W 的梯度：
 [[-49.6]
 [-54.4]]
损失函数对 B 的梯度： -4.8


In [None]:
# 创建一个示例输入张量，大小为 (batch_size, in_features)，这里假设batch_size=10，in_features=3  
input_tensor = np.random.randn(10, 3)

# 通过线性层传递输入张量，得到输出张量
output_tensor = linear_with_relu(input_tensor)
output_tensor.shape, linear_with_relu.backward(output_tensor).shape, linear_with_relu.dW.shape, linear_with_relu.db.shape

Calling with [[ 1.23995115  0.80295709 -2.10658818]
 [-0.85377799 -0.50528737 -0.33197667]
 [-0.75476225 -0.39145849  0.28908069]
 [ 1.54048943 -0.11126473 -0.0178062 ]
 [-0.36381642 -1.12360251  0.77222582]
 [-0.95519656 -0.11839887 -1.40283641]
 [-0.01863232  2.08999829  0.20712434]
 [ 1.15585889 -0.22871184  1.98679129]
 [-2.70868447  1.26405934  1.08003658]
 [-0.26870468 -1.52448913  1.08188348]]


((10, 2), (10, 3), (3, 2), (2,))

In [None]:
linear_with_relu.save_state_dict('test.npz')

In [None]:
linear_with_relu.load_state_dict('test.npz')

In [None]:
linear_with_relu.state_dict()

{'W': array([[-0.45174253,  0.89631055],
        [-1.40571558, -1.62979079],
        [-0.10383291, -0.09456478]]),
 'b': array([-0.29577391,  0.29874649])}

In [None]:
# 创建一个示例输入张量，大小为 (batch_size, in_features)，这里假设batch_size=3
input_tensor = np.random.randn(10, 3)

# 通过线性层传递输入张量，得到输出张量
output_tensor = linear_with_relu(input_tensor)
output_tensor.shape

Calling with [[-0.22818384 -0.45897934 -0.03745416]
 [-1.08014176 -0.2822869  -1.50470031]
 [ 0.56571172  0.42851455 -0.88218169]
 [ 1.29668201 -0.75654735  1.07494307]
 [ 1.49953357  1.01972658 -2.20856832]
 [-1.17587485  1.15270113 -0.23747215]
 [ 0.13526406  1.79250237  1.24276977]
 [-0.10815105 -0.10704854  1.84190569]
 [-0.55388518 -1.51959481  0.23537742]
 [-1.08975478  0.53230016 -0.09435039]]


(10, 2)

In [None]:
linear_with_relu.backward(output_tensor).shape

(10, 3)

In [None]:
class LinearWithReLu():
    def __init__(self, W, b):
        self.affine = Affine(W, b)
        self.relu = ReLu()
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = self.affine.forward(x)
        out = self.relu.forward(out)
        return out
    
    def backward(self, dout):
        dout = self.relu.backward(dout)
        dout = self.affine.backward(dout)
        return dout

In [None]:
linear_with_relu = LinearWithReLu(np.array([[1, 2, 3], [4, 5, 6]]), np.array([1, 2, 3]))

In [None]:
aff.forward(np.array([[1, 2]]))

array([[10, 14, 18]])

In [None]:
aff.backward(np.array([[10, 14, 18]]))

array([[ 92, 218]])

In [None]:
class Sigmoid:
    def __init__(self):
        self.out = None
    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out
    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx

sig = Sigmoid()
sig.forward(np.array([[1, 2], [3, 4]])), sig.backward(np.array([[0.1, 0.2], [0.3, 0.4]]))

(array([[0.73105858, 0.88079708],
        [0.95257413, 0.98201379]]),
 array([[0.01966119, 0.02099872],
        [0.013553  , 0.00706508]]))

In [None]:
def softmax(a):
    c = np.max(a)
    exp_a = np.exp(a - c) # 溢出对策
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y

def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7)) / batch_size

class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None # 损失
        self.y = None
        # softmax 的输出
        self.t = None
        # 监督数据（one-hot vector）
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss
    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size
        return dx

In [None]:
import numpy as np
from dataset.mnist import load_mnist

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(a):
    c = np.max(a)
    exp_a = np.exp(a - c) # 溢出对策
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y

def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7)) / batch_size

def numerical_gradient(f, x):
    h = 1e-4 # 0.0001
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x) # f(x+h)
        x[idx] = tmp_val - h
        fxh2 = f(x) # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val # 还原值
        it.iternext()
    return grad

class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # 初始化权重
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        return y
    
    # x: 输入数据 , t: 监督数据
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
    
    # x: 输入数据 , t: 监督数据
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label = True)
train_loss_list = []
train_acc_list = []
test_acc_list = []
# 平均每个 epoch 的重复次数
iter_per_epoch = max(train_size / batch_size, 1)
# 超参数
# iters_num = 10000
num_epochs = 10
batch_size = 1000
learning_rate = 0.1
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(num_epochs):
    for j in range(train_size // batch_size):
        print(i, j)
        # 获取 mini-batch
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]
        # 计算梯度
        # grad = network.numerical_gradient(x_batch, t_batch)
        # grad = network.gradient(x_batch, t_batch) # 高速版 !
        # # 更新参数
        # for key in ('W1', 'b1', 'W2', 'b2'):
        #     network.params[key] -= learning_rate * grad[key]
        # loss = network.loss(x_batch, t_batch)
        # train_loss_list.append(loss)
    # 计算每个 epoch 的识别精度
        # if i % iter_per_epoch == 0:
    train_acc = network.accuracy(x_train, t_train)
    test_acc = network.accuracy(x_test, t_test)
    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)
    print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
0 38
0 39
0 40
0 41
0 42
0 43
0 44
0 45
0 46
0 47
0 48
0 49
0 50
0 51
0 52
0 53
0 54
0 55
0 56
0 57
0 58
0 59
train acc, test acc | 0.10831666666666667, 0.1072
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 21
1 22
1 23
1 24
1 25
1 26
1 27
1 28
1 29
1 30
1 31
1 32
1 33
1 34
1 35
1 36
1 37
1 38
1 39
1 40
1 41
1 42
1 43
1 44
1 45
1 46
1 47
1 48
1 49
1 50
1 51
1 52
1 53
1 54
1 55
1 56
1 57
1 58
1 59
train acc, test acc | 0.10831666666666667, 0.1072
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17
2 18
2 19
2 20
2 21
2 22
2 23
2 24
2 25
2 26
2 27
2 28
2 29
2 30
2 31
2 32
2 33
2 34
2 35
2 36
2 37
2 38
2 39
2 40
2 41
2 42
2 43
2 44
2 45
2 46
2 47
2 48
2 49
2 50
2 51
2 52
2 53
2 54
2 55
2 56
2 57
2 58
2 59
train acc, test acc | 0.108316

In [None]:
def function_2(x):
    return x[0]**2 + x[1]**2

numerical_gradient(function_2, np.array([3., 0.]))

array([6., 0.])

In [None]:
class simpleNet:
    def __init__(self):
        self.W = np.random.randn(2,3) # 用高斯分布进行初始化
    def predict(self, x):
        return np.dot(x, self.W)
    def loss(self, x, t):
        z = self.predict(x)
        y = softmax(z)
        loss = cross_entropy_error(y, t)
        return loss
    
net = simpleNet()

In [None]:
net.W

array([[ 0.49730265, -1.73603617,  0.11148207],
       [ 0.70140045, -0.06801788,  0.06007321]])