In [2]:
import numpy as np
from dataset.mnist import load_mnist
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, ToTensor, Normalize, Lambda
import torch

def MNIST_loaders(train_batch_size=50000, test_batch_size=10000):

    transform = Compose([
        ToTensor(),
        Normalize((0.1307,), (0.3081,)),
        Lambda(lambda x: torch.flatten(x))])

    train_loader = DataLoader(
        MNIST('./data/', train=True,
              download=True,
              transform=transform),
        batch_size=train_batch_size, shuffle=True)

    test_loader = DataLoader(
        MNIST('./data/', train=False,
              download=True,
              transform=transform),
        batch_size=test_batch_size, shuffle=False)

    return train_loader, test_loader


def overlay_y_on_x(x, y):
    """
    Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
    """
    x_ = x.copy()
    x_[:, :10] *= 0.0
    for i in range(x.shape[0]):
        x_[i, y[i]] = x[i, :].max()
    return x_


def relu(x):
    return np.maximum(0, x)

class AdamOptimizer:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None
        self.v = None
        self.t = 0

    def update(self, params, grads):
        if self.m is None:
            self.m = [0] * len(params)
            self.v = [0] * len(params)

        self.t += 1
        for i, grad in enumerate(grads):
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad**2
            m_hat = self.m[i] / (1 - self.beta1**self.t)
            v_hat = self.v[i] / (1 - self.beta2**self.t)
            params[i] -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)


# 定义神经网络类
class Layer:
    def __init__(self, input_size, output_size):
        # 初始化权重和偏置
        self.W = np.random.randn(input_size, output_size)
        self.B = np.zeros((1, output_size))
        
        self.optimizer = AdamOptimizer(learning_rate=6e-3)
        self.accu = []
        self.threshold = 4.0


    def forward(self, X):
        X = X  / X.max() #np.linalg.norm(X, ord=2)
        Y = np.dot(X, self.W) + self.B
        return relu(Y)


    def compute_loss(self, Y_pos, Y_neg):
        # 计算正样本和负样本的损失
        g_pos = np.mean(Y_pos**2, axis=1)
        g_neg = np.mean(Y_neg**2, axis=1)
        loss = ((np.log(1 + np.exp(-g_pos + self.threshold)) + np.log(1 + np.exp(g_neg - self.threshold))) / 2).mean()
        return loss

    def compute_loss_gradients(self, Y_pos, Y_neg):
        # 损失函数梯度计算
        N_pos = Y_pos.shape[0]
        N_neg = Y_neg.shape[0]

        g_pos = np.mean(Y_pos**2, axis=1)
        g_neg = np.mean(Y_neg**2, axis=1)

        dloss_dgpos = -0.5 * np.exp(-g_pos + self.threshold) / (1 + np.exp(-g_pos + self.threshold)) 
        dloss_dgneg = 0.5 * np.exp(g_neg - self.threshold) / (1 + np.exp(g_neg - self.threshold)) 

        dloss_dY_pos = (2 / N_pos) * dloss_dgpos[:, np.newaxis] * Y_pos
        dloss_dY_neg = (2 / N_neg) * dloss_dgneg[:, np.newaxis] * Y_neg

        return dloss_dY_pos, dloss_dY_neg
    

    def backward(self, X_pos, X_neg, Y_pos, Y_neg):
        X_pos = X_pos / X_pos.max() #np.linalg.norm(X_pos, ord=2)
        X_neg = X_neg / X_neg.max() #np.linalg.norm(X_neg, ord=2)
        # 计算损失梯度
        dloss_dY_pos, dloss_dY_neg = self.compute_loss_gradients(Y_pos, Y_neg)
        
        # 计算ReLU激活后的梯度
        relu_grad_pos = (Y_pos > 0).astype(np.float32)
        relu_grad_neg = (Y_neg > 0).astype(np.float32)
        dloss_dY_pos *= relu_grad_pos
        dloss_dY_neg *= relu_grad_neg
        
        # 权重和偏置的梯度
        dW_pos = np.dot(X_pos.T, dloss_dY_pos) / X_pos.shape[0]
        dW_neg = np.dot(X_neg.T, dloss_dY_neg) / X_neg.shape[0]
        dB_pos = np.sum(dloss_dY_pos, axis=0, keepdims=True) / X_pos.shape[0]
        dB_neg = np.sum(dloss_dY_neg, axis=0, keepdims=True) / X_neg.shape[0]
        
        # 合并正样本和负样本的梯度
        dW = dW_pos + dW_neg
        dB = dB_pos + dB_neg
        
        # 使用优化器更新参数
        self.optimizer.update([self.W, self.B], [dW, dB])


    def train(self, X_train_pos, X_train_neg, num_epochs=2000):
        for epoch in range(num_epochs):
            # 正样本并进行前向传播
            Y_pos = self.forward(X_train_pos)
    
            # 负样本并进行前向传播
            Y_neg = self.forward(X_train_neg)
            loss = self.compute_loss(Y_pos, Y_neg)

            self.backward(X_train_pos, X_train_neg, Y_pos, Y_neg)
            
            # 打印损失
            # if (epoch+1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.12f}')

        return self.forward(X_train_pos), self.forward(X_train_neg)


    def eval(self, y_pred, y_true):
        acc = np.mean(y_pred == y_true)
        self.accu.append(acc)
        print(y_pred, y_true)
        print('\n', y_pred[:30], '\n', y_true[:30])
        print(f'Accuracy: {acc:.4f}')
        return acc


class Net(torch.nn.Module):
    def __init__(self, dims):
        super().__init__()
        self.accu = []
        self.layers = []
        for d in range(len(dims) - 1):
            self.layers += [Layer(dims[d], dims[d + 1])]

    def predict(self, x): 
        # 这个predict方法是理解ff方法的关键，它不是像普通的predict方法一样，输入一个样本，输出一个长度为num_cls的softmax预测向量
        # 而是一个样本反复输入这个网络num_cls次，把每种带标签的可能都计算一个goodness，也就是这个数据是好数据的可能性，找出最高goodness的就是预测类别
        goodness_per_label = []
        for label in range(10): # 对每一个标签进行预测
            label = label * np.ones(x.shape[0], dtype=np.int64)
            h = overlay_y_on_x(x, label) # h是输入x和标签label的叠加
            goodness = [] # goodness是一个列表，里面存放了每一层的结果向量的均方
            for layer in self.layers: # 对每一层进行前传
                h = layer.forward(h) # h是每一层的输出
                goodness += [(h**2).mean(1)] # goodness是每一层的结果向量的均方。h.pow(2)是h的每一个元素的平方，mean(1)是对每一行求均值
            # goodness_per_label += [sum(goodness).unsqueeze(1)] # goodness_per_label是每一层的结果向量的均方的和
            goodness_per_label.append((h**2).mean(1))
        # goodness_per_label = torch.cat(goodness_per_label, 1) # goodness_per_label是每一层的结果向量的均方的和的列表
        return np.array(goodness_per_label).argmax(0)#goodness_per_label.argmax(1) # 返回的是goodness_per_label中每一行最大值的索引，也就是说，返回的是每一行最大值的列索引

    def eval(self, y_pred, y_true):
        acc = np.mean(y_pred == y_true)
        self.accu.append(acc)
        print(y_pred, y_true)
        print('\n', y_pred[:30], '\n', y_true[:30])
        print(f'Accuracy: {acc:.12f}')
        return acc
    
    def train(self, x_pos, x_neg, li_epochs): # 这个train方法是对整个网络进行训练，训练的目标是让正样本的结果向量的均方上升，负样本的结果向量的均方下降
        h_pos, h_neg = x_pos, x_neg # h_pos和h_neg是正样本和负样本的输入
        for i, layer in enumerate(self.layers): # 对每一层进行训练
            print('training layer', i, '...') # 这里的i是层数
            h_pos, h_neg = layer.train(h_pos, h_neg, num_epochs=li_epochs[i]) # 对每一层进行训练，得到了正样本和负样本的结果向量，这个结果向量是该层的输出，也是下一层的输入
            # 也就是说，这个训练的过程中，正样本在前传过程中得到的每一层输出都被认为是正的，负样本在前传过程中得到的每一层输出都被认为是负的，也就是说，出身决定一切

        y_pred =  self.predict(x_te)
        self.eval(y_pred, y_te)


np.random.seed(0)

# 加载 MNIST 数据集
# (X_train, y_train), (X_test, y_test) = load_mnist(normalize=True, one_hot_label=False)
train_loader, test_loader = MNIST_loaders()
# train_loader, test_loader = train_loader.numpy(), test_loader.numpy()

x, y = next(iter(train_loader))
x, y = x.numpy(), y.numpy()

x_te, y_te = next(iter(test_loader))
x_te, y_te = x_te.numpy(), y_te.numpy()

x_pos = overlay_y_on_x(x, y)
rnd = torch.randperm(x.shape[0]) # 生成一个从0到n-1的随机整数序列。
x_neg = overlay_y_on_x(x, y[rnd])

net = Net([784, 500, 500])
net.train(x_pos, x_neg, [2000, 2000])


training layer 0 ...
Epoch 1/2000, Loss: 22.543785368981
Epoch 2/2000, Loss: 19.509472835438
Epoch 3/2000, Loss: 16.788293713531
Epoch 4/2000, Loss: 14.362973321893
Epoch 5/2000, Loss: 12.214024442869
Epoch 6/2000, Loss: 10.320488217730
Epoch 7/2000, Loss: 8.660593689376
Epoch 8/2000, Loss: 7.212535001251
Epoch 9/2000, Loss: 5.955346040156
Epoch 10/2000, Loss: 4.869866039878
Epoch 11/2000, Loss: 3.939947979517
Epoch 12/2000, Loss: 3.153745181149
Epoch 13/2000, Loss: 2.504392002923
Epoch 14/2000, Loss: 1.988524547420
Epoch 15/2000, Loss: 1.601643593013
Epoch 16/2000, Loss: 1.332693153634
Epoch 17/2000, Loss: 1.162512559392
Epoch 18/2000, Loss: 1.067534111895
Epoch 19/2000, Loss: 1.025094313616
Epoch 20/2000, Loss: 1.016729605057
Epoch 21/2000, Loss: 1.028907801413
Epoch 22/2000, Loss: 1.052342995457
Epoch 23/2000, Loss: 1.080940902419
Epoch 24/2000, Loss: 1.110856505710
Epoch 25/2000, Loss: 1.139766744338
Epoch 26/2000, Loss: 1.166346429268
Epoch 27/2000, Loss: 1.189906259762
Epoch 28/2

In [8]:
net.train(x_pos, x_neg, [0, 2000])

training layer 0 ...
training layer 1 ...
Epoch 1/2000, Loss: 0.339415264921
Epoch 2/2000, Loss: 0.339411252257
Epoch 3/2000, Loss: 0.339407239845
Epoch 4/2000, Loss: 0.339403227686
Epoch 5/2000, Loss: 0.339399215791
Epoch 6/2000, Loss: 0.339395204165
Epoch 7/2000, Loss: 0.339391192809
Epoch 8/2000, Loss: 0.339387181730
Epoch 9/2000, Loss: 0.339383170928
Epoch 10/2000, Loss: 0.339379160397
Epoch 11/2000, Loss: 0.339375150134
Epoch 12/2000, Loss: 0.339371140140
Epoch 13/2000, Loss: 0.339367130412
Epoch 14/2000, Loss: 0.339363120940
Epoch 15/2000, Loss: 0.339359111724
Epoch 16/2000, Loss: 0.339355102769
Epoch 17/2000, Loss: 0.339351094072
Epoch 18/2000, Loss: 0.339347085628
Epoch 19/2000, Loss: 0.339343077433
Epoch 20/2000, Loss: 0.339339069481
Epoch 21/2000, Loss: 0.339335061771
Epoch 22/2000, Loss: 0.339331054306
Epoch 23/2000, Loss: 0.339327047093
Epoch 24/2000, Loss: 0.339323040137
Epoch 25/2000, Loss: 0.339319033445
Epoch 26/2000, Loss: 0.339315027018
Epoch 27/2000, Loss: 0.33931102

In [6]:
net.train(x_pos, x_neg, [1000, 1000])

training layer 0 ...
Epoch 1/1000, Loss: 0.2782
Epoch 2/1000, Loss: 0.2782
Epoch 3/1000, Loss: 0.2781
Epoch 4/1000, Loss: 0.2781
Epoch 5/1000, Loss: 0.2780
Epoch 6/1000, Loss: 0.2780
Epoch 7/1000, Loss: 0.2779
Epoch 8/1000, Loss: 0.2779
Epoch 9/1000, Loss: 0.2779
Epoch 10/1000, Loss: 0.2778
Epoch 11/1000, Loss: 0.2778
Epoch 12/1000, Loss: 0.2777
Epoch 13/1000, Loss: 0.2777
Epoch 14/1000, Loss: 0.2776
Epoch 15/1000, Loss: 0.2776
Epoch 16/1000, Loss: 0.2776
Epoch 17/1000, Loss: 0.2775
Epoch 18/1000, Loss: 0.2775
Epoch 19/1000, Loss: 0.2774
Epoch 20/1000, Loss: 0.2774
Epoch 21/1000, Loss: 0.2774
Epoch 22/1000, Loss: 0.2773
Epoch 23/1000, Loss: 0.2773
Epoch 24/1000, Loss: 0.2772
Epoch 25/1000, Loss: 0.2772
Epoch 26/1000, Loss: 0.2771
Epoch 27/1000, Loss: 0.2771
Epoch 28/1000, Loss: 0.2771
Epoch 29/1000, Loss: 0.2770
Epoch 30/1000, Loss: 0.2770
Epoch 31/1000, Loss: 0.2769
Epoch 32/1000, Loss: 0.2769
Epoch 33/1000, Loss: 0.2768
Epoch 34/1000, Loss: 0.2768
Epoch 35/1000, Loss: 0.2768
Epoch 36

In [7]:
net.train(x_pos, x_neg, [1000, 1000])

training layer 0 ...
Epoch 1/1000, Loss: 0.2436
Epoch 2/1000, Loss: 0.2435
Epoch 3/1000, Loss: 0.2435
Epoch 4/1000, Loss: 0.2435
Epoch 5/1000, Loss: 0.2434
Epoch 6/1000, Loss: 0.2434
Epoch 7/1000, Loss: 0.2434
Epoch 8/1000, Loss: 0.2433
Epoch 9/1000, Loss: 0.2433
Epoch 10/1000, Loss: 0.2433
Epoch 11/1000, Loss: 0.2433
Epoch 12/1000, Loss: 0.2432
Epoch 13/1000, Loss: 0.2432
Epoch 14/1000, Loss: 0.2432
Epoch 15/1000, Loss: 0.2431
Epoch 16/1000, Loss: 0.2431
Epoch 17/1000, Loss: 0.2431
Epoch 18/1000, Loss: 0.2431
Epoch 19/1000, Loss: 0.2430
Epoch 20/1000, Loss: 0.2430
Epoch 21/1000, Loss: 0.2430
Epoch 22/1000, Loss: 0.2429
Epoch 23/1000, Loss: 0.2429
Epoch 24/1000, Loss: 0.2429
Epoch 25/1000, Loss: 0.2428
Epoch 26/1000, Loss: 0.2428
Epoch 27/1000, Loss: 0.2428
Epoch 28/1000, Loss: 0.2428
Epoch 29/1000, Loss: 0.2427
Epoch 30/1000, Loss: 0.2427
Epoch 31/1000, Loss: 0.2427
Epoch 32/1000, Loss: 0.2426
Epoch 33/1000, Loss: 0.2426
Epoch 34/1000, Loss: 0.2426
Epoch 35/1000, Loss: 0.2426
Epoch 36

In [8]:
net.train(x_pos, x_neg, [1000, 1000])

training layer 0 ...
Epoch 1/1000, Loss: 0.2178
Epoch 2/1000, Loss: 0.2178
Epoch 3/1000, Loss: 0.2178
Epoch 4/1000, Loss: 0.2177
Epoch 5/1000, Loss: 0.2177
Epoch 6/1000, Loss: 0.2177
Epoch 7/1000, Loss: 0.2177
Epoch 8/1000, Loss: 0.2177
Epoch 9/1000, Loss: 0.2176
Epoch 10/1000, Loss: 0.2176
Epoch 11/1000, Loss: 0.2176
Epoch 12/1000, Loss: 0.2176
Epoch 13/1000, Loss: 0.2175
Epoch 14/1000, Loss: 0.2175
Epoch 15/1000, Loss: 0.2175
Epoch 16/1000, Loss: 0.2175
Epoch 17/1000, Loss: 0.2174
Epoch 18/1000, Loss: 0.2174
Epoch 19/1000, Loss: 0.2174
Epoch 20/1000, Loss: 0.2174
Epoch 21/1000, Loss: 0.2174
Epoch 22/1000, Loss: 0.2173
Epoch 23/1000, Loss: 0.2173
Epoch 24/1000, Loss: 0.2173
Epoch 25/1000, Loss: 0.2173
Epoch 26/1000, Loss: 0.2172
Epoch 27/1000, Loss: 0.2172
Epoch 28/1000, Loss: 0.2172
Epoch 29/1000, Loss: 0.2172
Epoch 30/1000, Loss: 0.2172
Epoch 31/1000, Loss: 0.2171
Epoch 32/1000, Loss: 0.2171
Epoch 33/1000, Loss: 0.2171
Epoch 34/1000, Loss: 0.2171
Epoch 35/1000, Loss: 0.2170
Epoch 36

In [None]:
net.train(x_pos, x_neg, [200, 200])

training layer 0 ...
Epoch 1/200, Loss: 0.6932
Epoch 2/200, Loss: 0.6932
Epoch 3/200, Loss: 0.6932
Epoch 4/200, Loss: 0.6932
Epoch 5/200, Loss: 0.6932
Epoch 6/200, Loss: 0.6932
Epoch 7/200, Loss: 0.6932
Epoch 8/200, Loss: 0.6932
Epoch 9/200, Loss: 0.6932
Epoch 10/200, Loss: 0.6932
Epoch 11/200, Loss: 0.6932
Epoch 12/200, Loss: 0.6932
Epoch 13/200, Loss: 0.6932
Epoch 14/200, Loss: 0.6932
Epoch 15/200, Loss: 0.6932
Epoch 16/200, Loss: 0.6932
Epoch 17/200, Loss: 0.6932
Epoch 18/200, Loss: 0.6932
Epoch 19/200, Loss: 0.6932
Epoch 20/200, Loss: 0.6932
Epoch 21/200, Loss: 0.6932
Epoch 22/200, Loss: 0.6932
Epoch 23/200, Loss: 0.6932
Epoch 24/200, Loss: 0.6932
Epoch 25/200, Loss: 0.6932
Epoch 26/200, Loss: 0.6932
Epoch 27/200, Loss: 0.6932
Epoch 28/200, Loss: 0.6932
Epoch 29/200, Loss: 0.6932
Epoch 30/200, Loss: 0.6932
Epoch 31/200, Loss: 0.6932
Epoch 32/200, Loss: 0.6932
Epoch 33/200, Loss: 0.6932
Epoch 34/200, Loss: 0.6932
Epoch 35/200, Loss: 0.6932
Epoch 36/200, Loss: 0.6932
Epoch 37/200, Lo