# TODO
- 调研: Loss 替换

In [None]:
import numpy as np

class Layer:
    def __init__(self, input_size, output_size):
        # 初始化权重和偏置
        self.W = np.random.randn(input_size, output_size) * np.sqrt(2. / input_size)
        self.B = np.zeros((1, output_size))
        
        # 优化器
        self.optimizer = AdamOptimizer(learning_rate=3e-2)
        
        # 损失相关的阈值
        self.threshold = 2.0

    def forward(self, X):
        # 归一化处理
        X_normalized = X / np.linalg.norm(X, ord=2, axis=1, keepdims=True)
        # 线性变换后应用ReLU激活函数
        Y = np.dot(X_normalized, self.W) + self.B
        return np.maximum(0, Y)
    
    def compute_loss(self, Y_pos, Y_neg):
        # 计算正样本和负样本的损失
        g_pos = np.mean(Y_pos**2, axis=1)
        g_neg = np.mean(Y_neg**2, axis=1)
        loss = ((np.log(1 + np.exp(-g_pos + self.threshold)) + np.log(1 + np.exp(g_neg - self.threshold))) / 2).mean()
        return loss

    def compute_loss_gradients(self, Y_pos, Y_neg):
        # 损失函数梯度计算
        N_pos = Y_pos.shape[0]
        N_neg = Y_neg.shape[0]

        g_pos = np.mean(Y_pos**2, axis=1)
        g_neg = np.mean(Y_neg**2, axis=1)

        dloss_dgpos = -0.5 * np.exp(-g_pos + self.threshold) / (1 + np.exp(-g_pos + self.threshold))
        dloss_dgneg = 0.5 * np.exp(g_neg - self.threshold) / (1 + np.exp(g_neg - self.threshold))

        dloss_dY_pos = (2 / N_pos) * dloss_dgpos[:, np.newaxis] * Y_pos
        dloss_dY_neg = (2 / N_neg) * dloss_dgneg[:, np.newaxis] * Y_neg

        return dloss_dY_pos, dloss_dY_neg

    def backward(self, X_pos, X_neg, Y_pos, Y_neg):
        # 计算损失梯度
        dloss_dY_pos, dloss_dY_neg = self.compute_loss_gradients(Y_pos, Y_neg)
        
        # 计算ReLU激活后的梯度
        relu_grad_pos = (Y_pos > 0).astype(np.float32)
        relu_grad_neg = (Y_neg > 0).astype(np.float32)
        dloss_dY_pos *= relu_grad_pos
        dloss_dY_neg *= relu_grad_neg
        
        # 权重和偏置的梯度
        dW_pos = np.dot(X_pos.T, dloss_dY_pos) / X_pos.shape[0]
        dW_neg = np.dot(X_neg.T, dloss_dY_neg) / X_neg.shape[0]
        dB_pos = np.sum(dloss_dY_pos, axis=0, keepdims=True) / X_pos.shape[0]
        dB_neg = np.sum(dloss_dY_neg, axis=0, keepdims=True) / X_neg.shape[0]
        
        # 合并正样本和负样本的梯度
        dW = dW_pos + dW_neg
        dB = dB_pos + dB_neg
        
        # 使用优化器更新参数
        self.W, self.B = self.optimizer.update(self.W, self.B, dW, dB)

# 注意：这里假设AdamOptimizer类以及其update方法已经定义好，且可以接受参数和梯度进行更新。


The Forward-Forward algorithm is a greedy multi-layer learning procedure inspired by Boltzmann
machines (Hinton and Sejnowski, 1986) and Noise Contrastive Estimation (Gutmann and Hyvärinen,
2010). The idea is to replace the forward and backward passes of backpropagation by two forward
passes that operate in exactly the same way as each other, but on different data and with opposite
objectives. The positive pass operates on real data and adjusts the weights to increase the goodness in
every hidden layer. The negative pass operates on "negative data" and adjusts the weights to decrease
the goodness in every hidden layer. This paper explores two different measures of goodness – the
sum of the squared neural activities and the negative sum of the squared activities, but many other
measures are possible.

In [40]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.optim import Adam
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, ToTensor, Normalize, Lambda
from torch.utils.data import DataLoader


def MNIST_loaders(train_batch_size=50000, test_batch_size=10000):

    transform = Compose([
        ToTensor(),
        Normalize((0.1307,), (0.3081,)),
        Lambda(lambda x: torch.flatten(x))])

    train_loader = DataLoader(
        MNIST('./data/', train=True,
              download=True,
              transform=transform),
        batch_size=train_batch_size, shuffle=True)

    test_loader = DataLoader(
        MNIST('./data/', train=False,
              download=True,
              transform=transform),
        batch_size=test_batch_size, shuffle=False)

    return train_loader, test_loader


def overlay_y_on_x(x, y):
    """
    Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
    """
    x_ = x.clone()
    x_[:, :10] *= 0.0
    x_[range(x.shape[0]), y] = x.max()
    return x_


class Net(torch.nn.Module):
    def __init__(self, dims):
        super().__init__()
        self.layers = []
        for d in range(len(dims) - 1):
            self.layers += [Layer(dims[d], dims[d + 1]).cuda()]

    def predict(self, x): 
        # 这个predict方法是理解ff方法的关键，它不是像普通的predict方法一样，输入一个样本，输出一个长度为num_cls的softmax预测向量
        # 而是一个样本反复输入这个网络num_cls次，把每种带标签的可能都计算一个goodness，也就是这个数据是好数据的可能性，找出最高goodness的就是预测类别
        goodness_per_label = []
        for label in range(10): # 对每一个标签进行预测
            h = overlay_y_on_x(x, label) # h是输入x和标签label的叠加
            goodness = [] # goodness是一个列表，里面存放了每一层的结果向量的均方
            for layer in self.layers: # 对每一层进行前传
                h = layer(h) # h是每一层的输出
                goodness += [h.pow(2).mean(1)] # goodness是每一层的结果向量的均方。h.pow(2)是h的每一个元素的平方，mean(1)是对每一行求均值
            goodness_per_label += [sum(goodness).unsqueeze(1)] # goodness_per_label是每一层的结果向量的均方的和
        goodness_per_label = torch.cat(goodness_per_label, 1) # goodness_per_label是每一层的结果向量的均方的和的列表
        return goodness_per_label.argmax(1) # 返回的是goodness_per_label中每一行最大值的索引，也就是说，返回的是每一行最大值的列索引

    def train(self): #, x_pos, x_neg): # 这个train方法是对整个网络进行训练，训练的目标是让正样本的结果向量的均方上升，负样本的结果向量的均方下降
        x, y = next(iter(train_loader))
        x, y = x.cuda(), y.cuda()
        x_pos = overlay_y_on_x(x, y)
        # rnd = torch.randperm(x.size(0)) # 生成一个从0到n-1的随机整数序列。
        # x_neg = overlay_y_on_x(x, y[rnd])
        y_rnd = y.clone()
        for i, y_i in enumerate(y):
            li = list(range(10))
            li.remove(y_i)
            j = np.random.choice(li)
            y_rnd[i] = j

        x_neg = overlay_y_on_x(x, y_rnd)

        h_pos, h_neg = x_pos, x_neg # h_pos和h_neg是正样本和负样本的输入
        for i, layer in enumerate(self.layers): # 对每一层进行训练
            print('training layer', i, '...') # 这里的i是层数
            h_pos, h_neg = layer.train(h_pos, h_neg) # 对每一层进行训练，得到了正样本和负样本的结果向量，这个结果向量是该层的输出，也是下一层的输入
            # 也就是说，这个训练的过程中，正样本在前传过程中得到的每一层输出都被认为是正的，负样本在前传过程中得到的每一层输出都被认为是负的，也就是说，出身决定一切


class Layer(nn.Linear):
    def __init__(self, in_features, out_features,
                 bias=True, device=None, dtype=None):
        super().__init__(in_features, out_features, bias, device, dtype)
        self.relu = torch.nn.ReLU()
        self.opt = Adam(self.parameters(), lr=0.008)
        self.threshold = 2.0
        self.num_epochs = 60000 # 训练的次数是1000次

    def forward(self, x):
        # x_direction = x / (x.norm(2, 1, keepdim=True) + 1e-4)  # 这个是对输入做了归一化，使得输入的模长为1，这在论文里有解释
        x_direction = x / x.max()
        x_direction =  self.relu(
            torch.mm(x_direction, self.weight.T) +
            self.bias.unsqueeze(0) # 这个是对输入做了最基本的前向传播，得到了结果向量
            ) # 注意，在前传之后，随即使用了relu激活函数，这意味着每一层的所有激活值都是非负的
        return x_direction

    def train(self, x_pos, x_neg):
        # 训练其实就是对每一层分别进行训练，训练的目标是让正样本的结果向量的均方上升，负样本的结果向量的均方下降
        # 每一层的forward方法定义如上一个函数，这里的train方法定义了训练的过程
        for i in tqdm(range(self.num_epochs)):
            # minibatch
            bz = 100
            for j in range(0, x_pos.size(0), bz):
                # mask = torch.randperm(x_pos.size(0))[:bz]
                # x_pos = x_pos[mask] # 随机采样1000个正样本
                # x_neg = x_neg[mask] # 随机采样1000个负样本


                # for data, name in zip([x, x_pos, x_neg], ['orig', 'pos', 'neg']):
                #     visualize_sample(data, name)
                
                # print(self.forward(x_pos).pow(2), self.forward(x_pos).pow(2).shape)
                g_pos = self.forward(x_pos).pow(2).mean(1) # g_pos 是正样本x_pos在该层前向传播得到的结果向量的均方
                g_neg = self.forward(x_neg).pow(2).mean(1) # g_neg 是负样本x_neg在该层前向传播得到的结果向量的均方
                # 论文关于使用L2范数来度量的理由：
                # There are two main reasons for using the squared length of the activity vector as the goodness function.
                # First, it has very simple derivatives. Second, layer normalization removes all trace of the goodness.
                
                # The following loss pushes pos (neg) samples to
                # values larger (smaller) than the self.threshold.
                # 随着训练过程，loss下降，g_pos将上升，g_neg将下降
                loss = torch.log(1 + torch.exp(torch.cat([
                    -g_pos + self.threshold,
                    g_neg - self.threshold]))).mean() # loss = [log(1+exp(-(g_pos-threshold))) + log(1+exp(g_neg-threshold))] / 2
                # print(loss)
                self.opt.zero_grad()
                # this backward just compute the derivative and hence
                # is not considered backpropagation.
                loss.backward()
                self.opt.step()
                # 关于这里为什么能够work：
                # 1. loss是权重的函数，loss的核心思想是让g_pos上升，g_neg下降
                # 2. g_pos和g_neg是x_pos和x_neg的函数，x_pos和x_neg反映了客观世界，是这样要学习的对象。有了x_pos和x_neg，就能够计算出g_pos和g_neg，有了g_pos和g_neg，就能够计算出loss
                # 3. 通过loss.backward()，计算loss对权重的梯度，使得loss下降，g_pos上升，g_neg下降
                # 4. 通过self.opt.step()，更新了self.weight和self.bias
        return self.forward(x_pos).detach(), self.forward(x_neg).detach()

    
# def visualize_sample(data, name='', idx=0):
#     reshaped = data[idx].cpu().reshape(28, 28)
#     plt.figure(figsize = (4, 4))
#     plt.title(name)
#     plt.imshow(reshaped, cmap="gray")
#     plt.show()
    
    
if __name__ == "__main__":
    torch.manual_seed(1234)
    train_loader, test_loader = MNIST_loaders(train_batch_size=1000, test_batch_size=10000)

    net = Net([784, 500, 500])
    # x, y = next(iter(train_loader))
    # x, y = x.cuda(), y.cuda()
    # x_pos = overlay_y_on_x(x, y)
    # # rnd = torch.randperm(x.size(0)) # 生成一个从0到n-1的随机整数序列。
    # # x_neg = overlay_y_on_x(x, y[rnd])
    # y_rnd = y.clone()
    # for i, y_i in enumerate(y):
    #     li = list(range(10))
    #     li.remove(y_i)
    #     j = np.random.choice(li)
    #     y_rnd[i] = j

    # x_neg = overlay_y_on_x(x, y_rnd)
    # # for data, name in zip([x, x_pos, x_neg], ['orig', 'pos', 'neg']):
    # #     visualize_sample(data, name)
    
    net.train()#x_pos, x_neg)

    print('train error:', 1.0 - net.predict(x).eq(y).float().mean().item())

    x_te, y_te = next(iter(test_loader))
    x_te, y_te = x_te.cuda(), y_te.cuda()

    print('test error:', 1.0 - net.predict(x_te).eq(y_te).float().mean().item())

    print(net.predict(x_te)[:30], y_te[:30])


training layer 0 ...


100%|██████████| 60000/60000 [06:35<00:00, 151.68it/s]


training layer 1 ...


100%|██████████| 60000/60000 [05:39<00:00, 176.90it/s]


train error: 0.07799994945526123
test error: 0.17820000648498535
tensor([7, 6, 1, 0, 4, 1, 4, 9, 4, 9, 0, 6, 9, 0, 1, 5, 9, 7, 9, 4, 7, 6, 6, 5,
        7, 0, 7, 4, 0, 1], device='cuda:0') tensor([7, 2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4, 9, 6, 6, 5,
        4, 0, 7, 4, 0, 1], device='cuda:0')


In [None]:
import numpy as np
np.log(2)

In [1]:
import numpy as np
from dataset.mnist import load_mnist
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# from tensorflow.keras.datasets import mnist

# def overlay_y_on_x(x, y):
#     """
#     Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
#     """
#     x_ = x.clone()
#     x_[:, :10] *= 0.0
#     x_[range(x.shape[0]), y] = x.max()
#     return x_
# 假设x,y是numpy数组, 用numpy重写
def overlay_y_on_x(x, y):
    """
    Replace the first 10 pixels of data [x] with one-hot-encoded label [y]
    """
    
    x_ = x.copy()
    y_ = y.copy()
    y_ = y_ * x.max()
    x_[:, :10] = y_
    # x_[range(x.shape[0]), y] = x.max()
    return x_

class LossFunction:
    def __init__(self):
        pass
    
    def loss(self, Y_pos, Y_neg):
        return np.sum(- Y_pos ** 2 + Y_neg ** 2)
    
    def compute_gradients(self, X_pos, X_neg, W, B):
        # 计算 Y_pos 和 Y_neg
        Y_pos = np.dot(X_pos, W) + B
        Y_neg = np.dot(X_neg, W) + B
        
        # 计算损失函数
        loss = self.loss(Y_pos, Y_neg)
        
        # 计算损失函数对 W 的梯度
        dW = - np.dot(X_pos.T, 2 * Y_pos) + np.dot(X_neg.T, 2 * Y_neg)
        
        # 计算损失函数对 B 的梯度
        dB = np.sum(2 * ( - Y_pos + Y_neg))
        
        return loss, dW, dB

# 加载 MNIST 数据集
(X_train, y_train), (X_test, y_test) = load_mnist(normalize=True, one_hot_label=False)

# 数据预处理
X_train = X_train.reshape(-1, 28*28) / 255.0
X_test = X_test.reshape(-1, 28*28) / 255.0

# 将标签转换为 one-hot 编码
y_train = np.eye(10)[y_train]
y_test = np.eye(10)[y_test]

# 定义神经网络类
class NeuralNetwork:
    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        
        # 初始化权重和偏置
        self.W = np.random.randn(input_size, output_size)
        self.B = np.zeros((1, output_size))
        
        # 初始化损失函数对象
        self.loss_function = LossFunction()

    def forward(self, X):
        # 计算 Y
        Y = np.dot(X, self.W) + self.B
        return Y
    
    def train(self, X_train_pos, X_train_neg, num_epochs=10000, learning_rate=1e-2):
        for epoch in range(num_epochs):
            # 计算梯度
            loss, dW, dB = self.loss_function.compute_gradients(X_train_pos, X_train_neg, self.W, self.B)
            
            # 更新参数
            self.W -= learning_rate * dW
            self.B -= learning_rate * dB
            
            # 打印损失
            # if (epoch+1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}')

            # 评估模型
            # if (epoch+1) % 10 == 0:
            y_pred =  self.predict(X_train_pos)
            y_true = np.argmax(y_train, axis=1)
            accuracy = self.eval(y_pred, y_true)

            print(y_pred, y_true)
            print('\n', y_pred[:30], '\n', y_true[:30])
            print(f'Accuracy: {accuracy:.4f}')

    
    def predict(self, x):
        goodness_per_label = []
        for label in range(10): # 对每一个标签进行预测
            label = np.eye(10)[label] # 将标签转换为 one-hot 编码
            h = overlay_y_on_x(x, label) # h是输入x和标签label的叠加
            goodness = [] # goodness是一个列表，里面存放了每一层的结果向量的均方
            # for layer in self.layers: # 对每一层进行前传
            #     h = layer(h) # h是每一层的输出
            h = self.forward(h) # h是每一层的输出
            # print(h.shape): (60000, 10)
            # goodness += [(h**2).mean(1)] # goodness是每一层的结果向量的均方。h.pow(2)是h的每一个元素的平方，mean(1)是对每一行求均值
            goodness_per_label.append((h**2).mean(1)) # goodness_per_label是每一层的结果向量的均方的和
        # goodness_per_label = torch.cat(goodness_per_label, 1) # goodness_per_label是每一层的结果向量的均方的和的列表
        return np.array(goodness_per_label).argmax(0) # 返回的是goodness_per_label中每一行最大值的索引，也就是说，返回的是每一行最大值的列索引
        # # 计算 Y
        # Y = np.dot(X_test, self.W) + self.B
        
        # # 计算准确率
        # accuracy = np.mean(np.argmax(Y, axis=1) == np.argmax(y_test, axis=1))
        # print(f'Accuracy: {accuracy:.4f}')
                
    def eval(self, y_pred, y_true):
        return np.mean(y_pred == y_true)

np.random.seed(0)
# 初始化神经网络
input_size = X_train.shape[1]
output_size = 500
nn = NeuralNetwork(input_size, output_size)

# 定义正负样本
x_pos = overlay_y_on_x(X_train, y_train)
# rnd = torch.randperm(x.size(0)) # 生成一个从0到n-1的随机整数序列
# 将上一句用numpy重写
rnd = np.random.permutation(X_train.shape[0])
x_neg = overlay_y_on_x(X_train, y_train[rnd])

# 使用相同的数据作为正负样本进行训练
nn.train(x_pos, x_neg, learning_rate=3e-3)


Epoch 1/10000, Loss: 25.0149
[2 9 9 ... 2 2 9] [5 0 4 ... 5 6 8]

 [2 9 9 3 3 3 3 1 3 9 1 6 1 3 3 3 3 3 2 3 3 9 3 1 3 2 1 3 9 3] 
 [5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9 4 0 9 1 1 2 4 3 2 7]
Accuracy: 0.0992
Epoch 2/10000, Loss: 13.5292
[2 9 9 ... 3 2 9] [5 0 4 ... 5 6 8]

 [2 9 9 3 3 3 3 1 3 9 1 6 1 3 3 3 3 3 2 3 3 9 3 1 3 2 1 3 9 3] 
 [5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9 4 0 9 1 1 2 4 3 2 7]
Accuracy: 0.1043
Epoch 3/10000, Loss: 2.0436
[2 9 9 ... 3 2 9] [5 0 4 ... 5 6 8]

 [2 9 9 3 3 3 3 1 3 9 1 6 1 3 3 3 3 3 2 3 3 9 3 1 3 2 1 3 9 3] 
 [5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9 4 0 9 1 1 2 4 3 2 7]
Accuracy: 0.1097
Epoch 4/10000, Loss: -9.4422
[2 9 9 ... 3 2 9] [5 0 4 ... 5 6 8]

 [2 9 9 3 3 3 3 1 3 9 1 6 3 3 3 3 3 3 2 3 3 9 3 1 3 2 1 3 9 3] 
 [5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9 4 0 9 1 1 2 4 3 2 7]
Accuracy: 0.1159
Epoch 5/10000, Loss: -20.9284
[2 9 9 ... 3 2 9] [5 0 4 ... 5 6 8]

 [2 9 9 3 3 3 3 1 3 9 3 6 3 3 3 3 3 3 2 3 3 9 3 1 3 2 1 3 9 3] 
 [5 0 4 1 9 2 1 3 1 4 3 5 3 6 

In [None]:
x_pos[0][:10], x_neg[0][:10]

In [None]:
x.max(), y, x_te.shape, y_te

重写Layer，打开loss的细节：

In [None]:
import pdb

class Layer_dev(nn.Linear):
    def __init__(self, in_features, out_features,
                 bias=True, device=None, dtype=None):
        super().__init__(in_features, out_features, bias, device, dtype)
        self.relu = torch.nn.ReLU()
        self.opt = Adam(self.parameters(), lr=0.03)
        self.threshold = 2.0
        self.num_epochs = 1000 # 训练的次数是1000次

    def forward(self, x):
        x_direction = x / (x.norm(2, 1, keepdim=True) + 1e-4)  # 这个是对输入做了归一化，使得输入的模长为1，这在论文里有解释
        x_direction =  self.relu(
            torch.mm(x_direction, self.weight.T) +
            self.bias.unsqueeze(0) # 这个是对输入做了最基本的前向传播，得到了结果向量
            ) # 注意，在前传之后，随即使用了relu激活函数，这意味着每一层的所有激活值都是非负的
        return x_direction

    def train(self, x_pos, x_neg):
        # 训练其实就是对每一层分别进行训练，训练的目标是让正样本的结果向量的均方上升，负样本的结果向量的均方下降
        # 每一层的forward方法定义如上一个函数，这里的train方法定义了训练的过程
        for _ in tqdm(range(self.num_epochs)): # epoch为1000，意味着训练1000次
            g_pos = self.forward(x_pos).pow(2).mean(1) # g_pos 是正样本x_pos在该层前向传播得到的结果向量的均方
            g_neg = self.forward(x_neg).pow(2).mean(1) # g_neg 是负样本x_neg在该层前向传播得到的结果向量的均方
            # 论文关于使用L2范数来度量的理由：
            # There are two main reasons for using the squared length of the activity vector as the goodness function.
            # First, it has very simple derivatives. Second, layer normalization removes all trace of the goodness.
            
            # The following loss pushes pos (neg) samples to
            # values larger (smaller) than the self.threshold.
            # 随着训练过程，loss下降，g_pos将上升，g_neg将下降
            loss = torch.log(1 + torch.exp(torch.cat([
                -g_pos + self.threshold,
                g_neg - self.threshold]))).mean() # loss = [log(1+exp(-(g_pos-threshold))) + log(1+exp(g_neg-threshold))] / 2
            print(loss)
            print(type(loss))
            # pdb.set_trace()

            self.opt.zero_grad()
            # this backward just compute the derivative and hence
            # is not considered backpropagation.
            loss.backward()
            self.opt.step()
            # 关于这里为什么能够work：
            # 1. loss是权重的函数，loss的核心思想是让g_pos上升，g_neg下降
            # 2. g_pos和g_neg是x_pos和x_neg的函数，x_pos和x_neg反映了客观世界，是这样要学习的对象。有了x_pos和x_neg，就能够计算出g_pos和g_neg，有了g_pos和g_neg，就能够计算出loss
            # 3. 通过loss.backward()，计算loss对权重的梯度，使得loss下降，g_pos上升，g_neg下降
            # 4. 通过self.opt.step()，更新了self.weight和self.bias
        return self.forward(x_pos).detach(), self.forward(x_neg).detach()

layer_dev = Layer_dev(784, 784).cuda()

In [None]:
h_pos, h_neg = layer_dev.train(x_pos, x_neg)
h_pos, h_neg

In [None]:
x_pos.shape, 28*28

In [None]:
layer_dev.forward(x_pos).shape

In [None]:
h_pos, h_neg = layer_dev.train(x_pos, x_neg)
h_pos, h_neg

In [None]:
from matplotlib import pyplot as plt
plt.imshow(h_pos[0].cpu().reshape(28, 28), cmap='gray')
plt.show()

In [None]:
x, y, x.shape, y.shape